In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [24]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [25]:
print(movies.shape, credits.shape)

(4803, 20) (4803, 4)


In [26]:
# Merging the datasets:-
movies = movies.merge(credits,on='title')
movies.shape

(4809, 23)

In [27]:
# Keeping important columns for recommendation
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.shape

(4809, 7)

# Data Preprocesing:-

In [28]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [29]:
movies.dropna(inplace=True)

In [30]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [31]:
movies.duplicated().sum()

0

In [32]:
# removing duplicates:- 
movies = movies.drop_duplicates()
movies.duplicated().sum()

0

In [33]:
# to convert string to list 

In [34]:
import ast

def convert(text):
    list =[]
    for i in ast.literal_eval(text):
        list.append(i['name'])
    return list

In [35]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [36]:
# keeping the top 3 casts
def convert_cast(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L

In [37]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [38]:
# to fetch the director from the crew
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [39]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [40]:
# concatenating the strings
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [41]:
# to remove spaces
def remove_space(word):
    list =[]
    for i in word:
        list.append(i.replace(" ", ""))
        return list 

In [42]:
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",[Action],[cultureclash],[SamWorthington],[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",[Adventure],[ocean],[JohnnyDepp],[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...",[Action],[spy],[DanielCraig],[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",[Action],[dccomics],[ChristianBale],[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...",[Action],[basedonnovel],[TaylorKitsch],[AndrewStanton]


In [43]:
# concatenating all columns 
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",[Action],[cultureclash],[SamWorthington],[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",[Adventure],[ocean],[JohnnyDepp],[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...",[Action],[spy],[DanielCraig],[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",[Action],[dccomics],[ChristianBale],[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...",[Action],[basedonnovel],[TaylorKitsch],[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [44]:
# droping the extra columns
new_df = movies[['movie_id','title','tags']]
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [46]:
# converting the list to a string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [47]:
# Converting to lower case
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [49]:
# converting words to their root word
import nltk
from nltk import PorterStemmer

In [50]:
ps = PorterStemmer()

In [51]:
def stems(text):
    list = []
    for i in text.split():
        list.append(ps.stem(i))
    return " ".join(list)

In [52]:
new_df['tags'] = new_df['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


In [53]:
# to counter vectorize the text
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
cv = CountVectorizer(max_features=5000, stop_words="english")

In [55]:
vector = cv.fit_transform(new_df['tags']).toarray()

In [57]:
vector.shape

(4806, 5000)

Measuring cosine similarity 

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
similarity = cosine_similarity(vector)
similarity.shape

(4806, 4806)

In [63]:
# function to find similar movies
def recommend(movie):
  index = new_df[new_df['title'] == movie].index[0]
  distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
  for i in distances[1:6]:
     print(new_df.iloc[i[0]].title)

In [64]:
recommend('Spider-Man 2')

Spider-Man 3
The Amazing Spider-Man
All Is Lost
The Legend of Hercules
Spider-Man


In [65]:
recommend('The Dark Knight Rises')

Batman Forever
The Dark Knight
Batman
Batman Returns
Batman


In [66]:
import pickle

In [68]:
# dumping the dataframe and similarity score in a pickle file
pickle.dump(new_df,open('artifacts/movie_list.pkl','wb'))
pickle.dump(similarity,open('artifacts/similarity.pkl','wb'))

In [73]:
pd.__version__

'2.1.4'