In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# loading the datasets
movies = pd.read_csv('movies_metadata.csv')
credits = pd.read_csv('credits.csv')

In [54]:
movies.sample(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
42682,False,,3000000,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,51021,tt1196197,en,Lure: Teen Fight Club,A community is under siege as three Belmont Hi...,...,2010-11-16,0.0,93.0,[],Released,High School Can Be Brutal,Lure: Teen Fight Club,False,3.0,5.0
24082,False,,50000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,216282,tt2106361,en,Into the Storm,The town of Silverton is in one day destroyed ...,...,2014-08-06,160602194.0,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Prepare to go,Into the Storm,False,5.8,829.0
5660,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,39497,tt0180679,en,Escanaba in da Moonlight,A macho man in a family on the eve of deer-hun...,...,2001-01-26,0.0,91.0,[],Released,,Escanaba in da Moonlight,False,4.4,4.0
375,False,,115000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,36955,tt0111503,en,True Lies,Harry Tasker is a secret agent for the United ...,...,1994-07-14,378882411.0,141.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"When he said I do, he never said what he did.",True Lies,False,6.8,1138.0
26882,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,43131,tt0023498,en,Speak Easily,A professor gets mixed up with chorus girls in...,...,1932-08-13,0.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Speak Easily,False,0.0,0.0


In [53]:
credits.sample(5)

Unnamed: 0,cast,crew,id
33672,"[{'cast_id': 0, 'character': 'Choi Ik-Ho', 'cr...","[{'credit_id': '5572e3b6c3a368376f003c21', 'de...",320150
11596,"[{'cast_id': 21, 'character': 'Walter Sparrow ...","[{'credit_id': '52fe439fc3a36847f8062b8d', 'de...",3594
25407,"[{'cast_id': 10, 'character': 'Ringo', 'credit...","[{'credit_id': '52fe4900c3a36847f81847ab', 'de...",56431
943,"[{'cast_id': 8, 'character': 'Billy Dannreuthe...","[{'credit_id': '52fe444dc3a368484e01b667', 'de...",22733
9747,"[{'cast_id': 2, 'character': 'June Mills', 'cr...","[{'credit_id': '52fe44f6c3a368484e040b7f', 'de...",26243


In [3]:
movies.shape

(45466, 24)

In [4]:
movies['id'] = pd.to_numeric(movies['id'],errors='coerce')

In [5]:
movies.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                       float64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [6]:
movies.dropna(subset=['id'],inplace=True)

In [7]:
movies.shape

(45463, 24)

In [8]:
# converting id column to int type
movies['id'] = movies['id'].astype(int)

In [9]:
movies.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25051
title                        3
video                        3
vote_average                 3
vote_count                   3
dtype: int64

In [10]:
# merging both the datasets
comb = movies.merge(credits,on='id')

In [11]:
comb.shape

(45538, 26)

In [12]:
comb.duplicated().sum()

76

In [13]:
# dropping duplicate rows
comb.drop_duplicates(inplace=True)

In [14]:
comb.duplicated().sum()

0

In [15]:
comb.isnull().sum()

adult                        0
belongs_to_collection    40969
budget                       0
genres                       0
homepage                 37685
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25050
title                        3
video                        3
vote_average                 3
vote_count                   3
cast                         0
crew                         0
dtype: int64

In [16]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [17]:
comb['crew'] = comb['crew'].apply(fetch_director)

In [18]:
def convert3(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L 

In [19]:
comb['cast'] = comb['cast'].apply(convert3)

In [20]:
comb['cast'] = comb['cast'].apply(lambda x:x[0:3])

In [21]:
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 

In [22]:
comb['genres'] = comb['genres'].apply(convert)

In [23]:
new2 = comb[['id','title','overview','genres','cast','crew']] 

In [24]:
new2.shape

(45462, 6)

In [25]:
new2.isnull().sum()

id            0
title         3
overview    954
genres        0
cast          0
crew          0
dtype: int64

In [26]:
new2.dropna(inplace=True)

In [27]:
new2['overview'] = new2['overview'].astype('str')

In [28]:
new2['overview'] = new2['overview'].apply(lambda x:x.split())

In [29]:
new2['title'][0]

'Toy Story'

In [30]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [31]:
new2['cast'] = new2['cast'].apply(collapse)
new2['crew'] = new2['crew'].apply(collapse)
new2['genres'] = new2['genres'].apply(collapse)

In [32]:
new2['tags'] = new2['overview'] + new2['genres'] + new2['cast'] + new2['crew']

In [33]:
new3 = new2[['id','title','tags']]

In [34]:
new3['tags'] = new3['tags'].apply(lambda x:" ".join(x))

In [35]:
new3['tags'] = new3['tags'].apply(lambda x:x.lower())

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500,stop_words='english')

In [37]:
vector = cv.fit_transform(new3['tags']).toarray()

In [39]:
similarity = cosine_similarity(vector)

MemoryError: Unable to allocate 14.8 GiB for an array with shape (44505, 44505) and data type float64

In [40]:
def recommend(movie):
    index = new3[new3['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new3.iloc[i[0]].title)

In [55]:
recommend('True Lies')

NameError: name 'similarity' is not defined