In [71]:
import pandas as pd 
import numpy as np

movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [72]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [73]:
# merging movies and credits
movies = movies.merge(credits, on='title')

In [74]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [75]:
# Taking relevant features
# genres 
# id 
# keywords
# title
# overview
# cast 
# crew

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [76]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [77]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [78]:
# removing the rows where overview is missing
movies.dropna(inplace=True)

In [79]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [80]:
# checking for duplicated rows
movies.duplicated().sum()

0

In [81]:
# function for extracting names of genres and keywords in a list
# as they are in an object form
import ast 

def convert(obj):
    L = []
    for i in ast.literal_eval(obj): # converts the obj into list
        L.append(i['name'])
    return L

In [82]:
movies['genres'] = movies['genres'].apply(convert)

In [83]:
movies['keywords'] = movies['keywords'].apply(convert)

In [84]:
# same for cast, taking top three casts of movie
def convert3(obj):
    L = []
    cnt = 0
    for i in ast.literal_eval(obj): # converts the obj into list
        if cnt != 3: 
            L.append(i['name'])
            cnt += 1
        else:
            break
    return L

In [85]:
movies['cast'] = movies['cast'].apply(convert3)

In [86]:
# same for crew, extracting the director name
def convert_crew(obj):
    L = []
    for i in ast.literal_eval(obj): # converts the obj into list
        if i['job'] == 'Director': 
            L.append(i['name'])
            break
    return L

In [87]:
movies['crew'] = movies['crew'].apply(convert_crew)

In [88]:
movies.head(1) 

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [89]:
# converting overview string into list of words
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [90]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [91]:
# removing spaces from genres, keywords, cast and crew
# if a name has space, it will become two different tags
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [92]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [93]:
# merging all overview, genres, keywords, cast, crew into a single list and taking in new col
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [94]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [95]:
# taking only tags col
new_movies_df = movies[['movie_id', 'title', 'tags']]

In [96]:
new_movies_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [97]:
# convert the tags list into string
new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x:" ".join(x))


In [98]:
new_movies_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [99]:
# convert all in lowercase
new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x:x.lower())


In [100]:
new_movies_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [101]:
# perform stemming to avoid redundant words during vectorization
# porterStemmer converts word into its base word
# eg., dancing -> danc, smoking -> smok
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [102]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [103]:
new_movies_df['tags'] = new_movies_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['tags'] = new_movies_df['tags'].apply(stem)


In [104]:
# vectorizing the tags 
# a dataframe of 4806 movies(X) and 5000 words(Y) 
# It contains count of each word in each movie
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [105]:
vectors = cv.fit_transform(new_movies_df['tags']).toarray()

In [106]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [107]:
vectors.shape

(4806, 5000)

In [108]:
# calculating similarity using cosine distance, calculating angle between vectors
# compare each movie's vector with all other movies' vectors
from sklearn.metrics.pairwise import cosine_similarity

In [109]:
# values lie between 0 to 1, if completely similar then value is 1
similarity = cosine_similarity(vectors)

In [110]:
# recommending the 5 highest movies in similarity
# taking index of movie by name
def recommend(movie):
    movie_index = new_movies_df[new_movies_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        movie_name = new_movies_df.iloc[i[0]].title
        print(movie_name)
    return 

In [111]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [112]:
import pickle
pickle.dump(new_movies_df, open('movies.pkl', 'wb'))

In [114]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))