<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">Libraries</h1>

In [1]:
import pandas as pd
import numpy as np
import ast
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import NearestNeighbors

<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">Load Data</h1>

In [2]:
# Set the file paths for the credits and movies datasets
credits_path = 'tmdb_5000_credits.csv'
movies_path = 'tmdb_5000_movies.csv'

df_credits = pd.read_csv(credits_path)
print(f'{credits_path} loaded successfully')

df_movies = pd.read_csv(movies_path)
print(f'{movies_path} loaded successfully')

print('-'*50)
print(f'[INFO] Shapes:'
      f'\n credits: {df_credits.shape}'
      f'\n movies: {df_movies.shape}\n')

print(f'[INFO] Any missing values:'
      f'\n creadits: {df_credits.isna().any().any()}'
      f'\n movies: {df_movies.isna().any().any()}\n')

tmdb_5000_credits.csv loaded successfully
tmdb_5000_movies.csv loaded successfully
--------------------------------------------------
[INFO] Shapes:
 credits: (4803, 4)
 movies: (4803, 20)

[INFO] Any missing values:
 creadits: False
 movies: True



In [3]:
df_credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
df_movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">Data Preprocessing</h1>

In [5]:
df = df_movies.merge(df_credits, on='title')
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
df.drop(columns=['homepage', 'tagline'], inplace=True)
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
title                   0
vote_average            0
vote_count              0
movie_id                0
cast                    0
crew                    0
dtype: int64

In [9]:
df = df.loc[:,['movie_id','title','overview','genres','keywords','cast','crew']]

In [10]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [11]:
df.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
# Convert the list of dictionaries stored as a string to a list of 'name' values
def convert(text):
    l = []
    for i in ast.literal_eval(text):
        l.append(i['name']) 
    return l 

# Convert only the first 3 elements of the list of dictionaries stored as a string
def convert3(text):
    l = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            l.append(i['name'])
        counter+=1
    return l 

# Fetch names of directors from the list of dictionaries stored as a string
def fetch_director(text):
    l = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            l.append(i['name'])
    return l 

# Remove spaces from the elements in a list of strings
def collapse(L):
    l = []
    for i in L:
        l.append(i.replace(" ",""))
    return l

# Process and create tags
def create_tags(df):
    # Convert 'genres', 'keywords', 'cast', and 'crew' columns to lists of names
    df['genres'] = df['genres'].apply(convert)
    df['keywords'] = df['keywords'].apply(convert)
    df['cast'] = df['cast'].apply(convert3)
    df['crew'] = df['crew'].apply(fetch_director)
    
    # Remove spaces from the names in 'cast', 'crew', 'genres', and 'keywords' columns
    df['cast'] = df['cast'].apply(collapse)
    df['crew'] = df['crew'].apply(collapse)
    df['genres'] = df['genres'].apply(collapse)
    df['keywords'] = df['keywords'].apply(collapse)
    
    # Split the 'overview' column into lists of words
    df['overview'] = df['overview'].apply(lambda x:x.split())
    
    # Combine the lists of words from different columns to create 'tags' column
    df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']
    
    # Drop unnecessary columns and join the elements in the 'tags' column to form a string
    df = df.drop(columns=['overview','genres','keywords','cast','crew'])
    df['tags'] = df['tags'].apply(lambda x: " ".join(x))

    return df

In [13]:
movies_df = create_tags(df)
movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [14]:
movies_df.tags[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">Recommendation System</h1>

In [15]:
def tfidf_scores(df):

    # Stopword Removal ("the," "is," "and," etc.)
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['tags'])
    
    # TF-IDF Calculation
    tfidf_scores = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    return pd.concat([df, tfidf_scores], axis=1)

In [16]:
movies_df = tfidf_scores(movies_df)
movies_df.dropna(inplace=True)
movies_df.head()

Unnamed: 0,movie_id,title,tags,00,000,007,07am,10,100,1000,...,única,über,đỗthịhảiyến,špelacolja,γη,юлияснигирь,卧底肥妈,张立,绝地奶霸,超级妈妈
0,19995.0,Avatar,"In the 22nd century, a paraplegic Marine is di...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,285.0,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,206647.0,Spectre,A cryptic message from Bond’s past sends him o...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49026.0,The Dark Knight Rises,Following the death of District Attorney Harve...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49529.0,John Carter,"John Carter is a war-weary, former military ca...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
title_series = movies_df.iloc[:, 1]
title_series

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4800                Sanctuary: Quite a Conundrum
4801                                        Bang
4802                                      Primer
4803                                      Cavite
4804                                 El Mariachi
Name: title, Length: 4801, dtype: object

In [18]:
# Create KNN model
knn_model = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')

# Extract the TF-IDF features from the DataFrame
X = movies_df.drop(['movie_id', 'title', 'tags'], axis=1).values

# Fit the KNN model on the TF-IDF features
knn_model.fit(X)

In [23]:
# Find the indices of the top 5 similar movies for a given movie
def recommend_movie(title):
    
    #find index of given movie
    matching_entries = title_series[title_series == title]
    
    if not matching_entries.empty:
        index = matching_entries.index[0]
    else:
        print("'{}' entry not found in our dataset. Try different title".format(search_entry))
        return None
    
    query_movie_index = index  # Replace with the index of the movie you want to get recommendations for
    distances, indices = knn_model.kneighbors(X[query_movie_index].reshape(1, -1), n_neighbors=6)

    # 'indices' will contain the indices of the top 5 similar movies
    similar_movie_indices = indices[0][1:]

    # Get the titles of the similar movies
    similar_movie_titles = movies_df.iloc[similar_movie_indices, 1].values

    print(f"Top 5 similar movies for '{title}' are:")
    for t in similar_movie_titles:
        print(f'-{t}')
    return similar_movie_titles

<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">Testing System</h1>

In [25]:
movie_title = "Iron Man" # type movie title you want to get recommendations for 

similar_movie_titles = recommend_movie(movie_title)

Top 5 similar movies for 'Iron Man' are:
-Iron Man 2
-Iron Man 3
-Avengers: Age of Ultron
-The Avengers
-Captain America: Civil War


<h1 style="font-family:Georgia; font-weight:normal; letter-spacing: 2px; color:#1192AA; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #1192AA">...</h1>

In [34]:
import pickle
pickle.dump(movies_df,open('movies_df.pkl','wb'))
pickle.dump(knn_model,open('knn_model.pkl','wb'))
pickle.dump(title_series,open('movie_list.pkl','wb'))