In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity


# Loading the data and preprocessing the data

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
_credits = pd.read_csv('tmdb_5000_credits.csv')


Merging the dataframe

In [3]:
dataset = movies.merge(_credits,on = 'title')
dataset = dataset[['movie_id','title','overview','genres','keywords','cast','crew']]

Checking missing data

In [4]:
dataset.isnull().sum()
### Gives us 3 rows where overview is not present. Since it is very insignificant in amount, I will drop it
dataset.dropna(inplace = True)
dataset.duplicated().sum()

0

Converting the genres column to a list which is more interpretable and informative for our purpose

In [5]:
def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

dataset['genres']=dataset['genres'].apply(convert)
dataset['keywords'] = dataset['keywords'].apply(convert)

COnverting the cast column to meaningful first 3 casts.. Rest of the information can be ignored


In [6]:
def convert_for_cast(obj):
    l=[]
    c =0 
    for i in ast.literal_eval(obj):
        if c!=3:
            #s = i['name']
            #s=s.replace(' ','')
            l.append(i['name'])
            c+=1
        else:
            break
        
    return l
dataset['cast']=dataset['cast'].apply(convert_for_cast)

In [7]:
def get_director(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l
dataset['director'] = dataset['crew'].apply(get_director)

Removing spaces and making overview split

In [8]:
dataset.drop(columns = 'crew',inplace=True)
dataset['overview'] = dataset['overview'].apply(lambda x: x.split())
dataset['genres'] = dataset['genres'].apply(lambda x: [i.replace(' ','') for i in x])
dataset['cast'] = dataset['cast'].apply(lambda x: [i.replace(' ','')for i in x])
dataset['keywords'] = dataset['keywords'].apply(lambda x: [i.replace(' ','')for i in x])
dataset['director'] = dataset['director'].apply(lambda x: [i.replace(' ','')for i in x])

In [9]:
dataset['tags'] = dataset['overview']+ dataset['genres']+ dataset['keywords']+ dataset['cast'] + dataset['director']

Making a new Dataset

In [10]:
dataset_new =dataset.drop(columns = ['overview','genres','keywords','cast','director'])
dataset_new['tags'] = dataset_new['tags'].apply(lambda x:" ".join(x))
dataset_new['tags'] = dataset_new['tags'].apply(lambda x: x.lower())

Stem the words


In [11]:
ps = PorterStemmer()

def stem(y):
    ls = []

    for i in y.split():
        ls.append(ps.stem(i))
    
    return ' '.join(ls)

dataset_new['tags'] = dataset_new['tags'].apply(stem)

# Make a Count Vecoriser

In [12]:
cv = CountVectorizer(max_features= 5000, stop_words='english')
array = cv.fit_transform(dataset_new['tags']).toarray()


# Use cosine similarity to find the similarity of each data with other

In [13]:
similarity  = cosine_similarity(array)

# Use KNN with a  value of k=5 to get the 5 closest movies for recommendation

In [22]:
def recommend(movie):
    idx =  dataset_new[dataset_new['title'] == movie].index[0]
    array_m = similarity[idx]
    array_m = sorted(list(enumerate(array_m)),reverse=True, key= lambda x: x[1])[1:6]
    
    for i in array_m:
        print(dataset_new.loc[i[0]].title)

    
    
    return 

In [23]:
recommend('Batman')

Batman
Batman & Robin
Batman Begins
Batman Returns
Osama


In [19]:
import pickle
pickle.dump(dataset_new,open('movies.pkl','wb'))

In [20]:
pickle.dump(similarity,open('similarity.pkl','wb'))