In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import ast
import warnings
warnings.filterwarnings('ignore')

In [2]:
base_dir = r"C:\Users\rs577\Project Data\Movie_recommanded_system"
DATA_PATH = os.path.join(base_dir, "data", "movies_metadata.csv")
df = pd.read_csv(DATA_PATH)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [4]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
df.shape

(45466, 24)

In [6]:
df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(13)

In [8]:
df = df.drop_duplicates().reset_index(drop=True)

In [9]:
df = df[['title','overview','genres','tagline', 'vote_average','popularity','original_language']]

In [10]:
df.head()

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity,original_language
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7.7,21.946943,en
1,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Roll the dice and unleash the excitement!,6.9,17.015539,en
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129,en
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Friends are the people who let you be yourself...,6.1,3.859495,en
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Just When His World Is Back To Normal... He's ...,5.7,8.387519,en


In [11]:
df = df.dropna(subset=['title'])

In [12]:
df['overview'] = df['overview'].fillna('')
df['tagline'] = df['tagline'].fillna('')
df['original_language'] = df['original_language'].fillna('')

In [13]:
df['genres'] = df['genres'].apply(lambda x: ' '.join([i['name'] for i in ast.literal_eval(x)]))

In [14]:
df.isnull().sum()

title                0
overview             0
genres               0
tagline              0
vote_average         0
popularity           0
original_language    0
dtype: int64

In [15]:
df['tags'] = df['overview'] + ' ' + df['genres'] + ' ' + df['tagline'] + ' ' + df['original_language']

In [16]:
df['tags'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Animation Comedy Family  en"

In [17]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import re

In [18]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rs577\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rs577\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [20]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [21]:
df['tags'] = df['tags'].apply(preprocess_text)

In [22]:
df['tags'][0]

'led woody andys toy live happily room andys birthday brings buzz lightyear onto scene afraid losing place andys heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference animation comedy family en'

In [23]:
df = df.reset_index(drop=True)

In [24]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45442
Century of Birthing            45443
Betrayal                       45444
Satan Triumphant               45445
Queerama                       45446
Length: 45447, dtype: int64

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tfidf = TfidfVectorizer(max_features=12000, stop_words='english', ngram_range=(1,2))


In [27]:
tfidf_matrix = tfidf.fit_transform(df['tags'])

In [28]:
tfidf_matrix.shape

(45447, 12000)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
def recommend(title, n = 10):
    if title not in indices:
        return "Movie not found in the database."
    
    idx = indices[title]
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    similar_idx = sim_scores.argsort()[-n-1:-1][::-1]

    return df['title'].iloc[similar_idx]

In [40]:
recommend("Toy Story")

2996                Toy Story 2
24512                 Small Fry
15344               Toy Story 3
6434     What's Up, Tiger Lily?
11396    For Your Consideration
39593                     Ozzie
28972           Superstar Goofy
1071      Rebel Without a Cause
1931                  Condorman
28860                   Wild 90
Name: title, dtype: object

In [41]:
import pickle
pickle.dump(tfidf, open('tfidf_matrix.pkl', 'wb'))
pickle.dump(df, open('df.pkl', 'wb'))
pickle.dump(indices, open('indices.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))