In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credit,on='title')

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [4]:
movies = movies[['genres','id','keywords','title','overview','cast','crew']]

In [5]:
movies.isnull().sum()

genres      0
id          0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace=True)

In [7]:
movies.isnull().sum()

genres      0
id          0
keywords    0
title       0
overview    0
cast        0
crew        0
dtype: int64

In [8]:
def F1(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [9]:
movies['genres'] = movies['genres'].apply(F1)

In [10]:
movies['keywords'] = movies['keywords'].apply(F1)

In [11]:
def F2(names):
    L = []
    R = 0 
    for i in ast.literal_eval(names):
        if R != 3:
            L.append(i['name'])
            R += 1
        else:
            break
    return L

In [12]:
movies['cast'] = movies['cast'].apply(F2)

In [13]:
def F3(names):
    L = []
    for i in ast.literal_eval(names):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [14]:
movies['crew'] = movies['crew'].apply(F3)

In [15]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [16]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [17]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [38]:
movies['tags'].values

array([list(['In', 'the', '22nd', 'century,', 'a', 'paraplegic', 'Marine', 'is', 'dispatched', 'to', 'the', 'moon', 'Pandora', 'on', 'a', 'unique', 'mission,', 'but', 'becomes', 'torn', 'between', 'following', 'orders', 'and', 'protecting', 'an', 'alien', 'civilization.', 'Action', 'Adventure', 'Fantasy', 'ScienceFiction', 'cultureclash', 'future', 'spacewar', 'spacecolony', 'society', 'spacetravel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alienplanet', 'cgi', 'marine', 'soldier', 'battle', 'loveaffair', 'antiwar', 'powerrelations', 'mindandsoul', '3d', 'SamWorthington', 'ZoeSaldana', 'SigourneyWeaver', 'JamesCameron']),
       list(['Captain', 'Barbossa,', 'long', 'believed', 'to', 'be', 'dead,', 'has', 'come', 'back', 'to', 'life', 'and', 'is', 'headed', 'to', 'the', 'edge', 'of', 'the', 'Earth', 'with', 'Will', 'Turner', 'and', 'Elizabeth', 'Swann.', 'But', 'nothing', 'is', 'quite', 'as', 'it', 'seems.', 'Adventure', 'Fantasy', 'Action', 'ocean', 'drugabuse', 'exoticisl

In [18]:
df = movies[['id','title','tags']]

In [19]:
df['tags'] = df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: " ".join(x))


In [20]:
df['tags'] = df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: x.lower())


In [21]:
ps = PorterStemmer()

In [22]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [23]:
cv = CountVectorizer(max_features=5000,stop_words = 'english')
vectors = cv.fit_transform(df['tags']).toarray()

In [24]:
df['tags'] = df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [40]:
# use this code to see the most used words
# cv.get_feature_names()

In [26]:
# calculating cosine distance
similarity = cosine_similarity(vectors)

In [27]:
def recommend(movie):
    Index = df[df['title'] == movie].index[0]
    Distance = similarity[Index]
    List = sorted(list(enumerate(Distance)),reverse=True, key=lambda x:x[1])[1:6]
    
    for i in List:
        print(df.iloc[i[0]].title)

In [28]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman
Batman & Robin
Batman


In [29]:
pickle.dump(df.to_dict(), open('movies_dict.pkl','wb'))

In [30]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [32]:
pickle.dump(movies.to_dict,open('all_tags.pkl','wb'))