In [1]:
#Import Pandas
import pandas as pd

# Import Numpy
import numpy as np


In [2]:
# Load Movies Metadata
metadata = pd.read_csv('fileM.csv', low_memory=False)

# Load keywords and credits
credits = pd.read_csv('fileC.csv')
keywords = pd.read_csv('fileK.csv')



In [3]:
# Print the first three rows
metadata.head(3)



Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,Released,,Toy Story,False,7.7,5415,,,,
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,,,,
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,,,,


In [4]:
#Print plot overviews of the first 5 movies.
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [5]:
# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Print the first two movies of your newly merged metadata
metadata.head(2)

Unnamed: 0.1,Unnamed: 0_x,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,vote_count,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 0_y,cast,crew,Unnamed: 0,keywords
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,5415,,,,,0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,2413,,,,,1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [6]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

# Import Numpy
import numpy as np

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [7]:
# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)


#Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)


def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

metadata[['soup']].head(2)

Unnamed: 0,soup
0,tomhanks timallen donrickles johnlasseter ani...
1,robinwilliams jonathanhyde kirstendunst joejo...


In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['soup' ] = metadata['soup'].fillna('')



In [9]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix2 = tfidf.fit_transform(metadata['soup'])

#Output the shape of tfidf_matrix
tfidf_matrix2.shape

#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

# Import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim2 = cosine_similarity(tfidf_matrix2, tfidf_matrix2)









In [10]:
cosine_sim2.shape

(20493, 20493)

In [11]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

indices[:10]
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim_2 = cosine_sim2):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
   
    sim_scores = list(enumerate(cosine_sim2[idx]))
    
    

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices].drop_duplicates()



get_recommendations('The Godfather')

966       The Godfather: Part II
1580     The Godfather: Part III
3371            Gardens of Stone
953               Apocalypse Now
839          Looking for Richard
15595        Hide in Plain Sight
11354             Chinese Coffee
4209              One-Eyed Jacks
1310               The Rainmaker
1666               The Outsiders
Name: title, dtype: object

In [12]:
get_recommendations('Spider-Man')

6786               Spider-Man 2
10038              Spider-Man 3
255      The Quick and the Dead
960            Army of Darkness
12285               Daybreakers
1839              Pleasantville
1481             Small Soldiers
1004               Evil Dead II
5835                    Darkman
2687                Wonder Boys
Name: title, dtype: object

In [13]:
get_recommendations('The Dark Knight')

8607             Batman Begins
14292    The Dark Knight Rises
9667              The Prestige
16199                Doodlebug
4492                  Insomnia
6192                 Ned Kelly
2073                 Following
7802                 Two Hands
11737           Public Enemies
Name: title, dtype: object

In [14]:
get_recommendations('Toy Story')

2523                  Toy Story 2
9073                     Luxo Jr.
14536                     Tin Toy
14556                 Red's Dream
14572                 Knick Knack
12815                 Toy Story 3
13350        Crazy on the Outside
13975                      Cars 2
833            That Thing You Do!
15479    Mater and the Ghostlight
Name: title, dtype: object

In [15]:
get_recommendations('Toy Story')

2523                  Toy Story 2
9073                     Luxo Jr.
14536                     Tin Toy
14556                 Red's Dream
14572                 Knick Knack
12815                 Toy Story 3
13350        Crazy on the Outside
13975                      Cars 2
833            That Thing You Do!
15479    Mater and the Ghostlight
Name: title, dtype: object

In [16]:
get_recommendations('Jumanji')

484                          The Pagemaster
1610               Honey, I Shrunk the Kids
17248             The Prince and the Pauper
2006                            October Sky
13509             Cirque du Soleil: Varekai
6786                           Spider-Man 2
10038                          Spider-Man 3
1481                         Small Soldiers
14010    Captain America: The First Avenger
2823                                   Hook
Name: title, dtype: object

In [17]:
get_recommendations('The Dark Knight')


8607             Batman Begins
14292    The Dark Knight Rises
9667              The Prestige
16199                Doodlebug
4492                  Insomnia
6192                 Ned Kelly
2073                 Following
7802                 Two Hands
11737           Public Enemies
Name: title, dtype: object

In [18]:
import pickle


In [19]:
pickle.dump(metadata,open('movies.pkl','wb'))

In [20]:
pickle.dump(metadata.to_dict,open('movies_dict.pkl','wb'))

In [21]:
pickle.dump(cosine_sim2,open('cosine_sim2.pkl','wb'))