### Importing the Libraries

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# pd.set_option("max_columns",500)
# pd.set_option("max_rows",500)
import pickle
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

### Loading the dataset

In [38]:
movies_df=pd.read_csv("./data/tmdb_5000_movies.csv")
credits_df=pd.read_csv("./data/tmdb_5000_credits.csv")

In [39]:
print(f'Rows : {movies_df.shape[0]} Columns: {movies_df.shape[1]}')

movies_df.columns

Rows : 4803 Columns: 20


Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [40]:
print(f'Rows : {credits_df.shape[0]} Columns: {credits_df.shape[1]}')

credits_df.columns

Rows : 4803 Columns: 4


Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

### Merging both the dataframes on "title"

In [41]:
movies_df=pd.merge(left=movies_df,right=credits_df,how='inner',on='title')
movies_df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


### Columns to be Kept
> genres

> id

> keywords

> title

> overview

> cast 

> crew



### Filtering the dataset with the desired columns

In [42]:
movies=movies_df[['movie_id','title','overview','genres','keywords','cast','crew']]

In [43]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


### Now we want our dataframe to be like this
- movie_id
- title
- tags   --> we will merge [ overview,genres,keywords,cast,crew]

### Data Preprocessing

> Checking for null values

In [44]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

- overview has 3 missing values
- since it is a very important column for the recommendation engine we will not try to impute it by any means
- dropping it will be a right move

> Dropping null values

In [45]:
movies.dropna(inplace=True)

In [46]:
movies.duplicated().sum()

0

> No duplicated values in the dataset

In [47]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


> ### Data Cleaning

#### Cleaning Genres Column

In [48]:
def fetchGenreNames(obj):
    return [dicts['name'] for dicts in literal_eval(obj)]

movies['genres']=movies['genres'].apply(fetchGenreNames)

#### Cleaning keyword column

In [49]:
movies['keywords']=movies['keywords'].apply(fetchGenreNames)

#### Cleaning cast column
- fetching only top 3 cast from the movie

In [50]:
def getTop3Cast(obj):
    actor_names=[]
    counter=0
    for i in literal_eval(obj):
        if counter !=3:
            actor_names.append(i['name'])
            counter+=1
        else:
            break
    return actor_names
        
movies['cast']=movies['cast'].apply(getTop3Cast)

#### Cleaning crew column
- fetching only the directors name

In [51]:
def fetchDirectorName(obj):
    producerName=[]
    for i in literal_eval(obj):
        if i['job']=='Director':
            producerName.append(i['name'])
            break
    return producerName

movies['crew']=movies['crew'].apply(fetchDirectorName)

#### Converting the overview column into a list

In [52]:
movies['overview']=movies['overview'].apply(lambda x: x.split())

##### Now the next step is to combine all these columns [overview genres keyword cast crew] to create tags column
- but before this we need to remove spaces 

    - for example 'Sam Worthington' to this 'SamWorthington'
    - so that 'Sam' and 'Worthington' will not be treated as a seperate entity

In [53]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [54]:
# removing spaces from genres, keywords, cast, crew

movies['genres']=movies['genres'].apply(lambda x:[i.replace(' ','') for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(' ','') for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(' ','') for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(' ','') for i in x])

### Creating tags column

In [55]:
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

#### Creating new dataframe with movie_id , title and tags column

In [56]:
new_df=movies[['movie_id','title','tags']]
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


#### Converting tags column into a string

In [57]:
new_df['tags']=new_df['tags'].apply(lambda x : ' '.join(x))

In [58]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

#### Converting tags into lower case

In [59]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

In [60]:
new_df.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."


### Applying Stemming on the tags
- this will treat similar words ***[loved , loving , love ]*** as ***love*** and other similar words like that

In [61]:
### creating an object of porter stemmer
ps=PorterStemmer()

In [62]:
def stem(text):
    stemmed_words=[]

    for i in text.split():
        stemmed_words.append(ps.stem(i))
    return ' '.join(stemmed_words)

new_df['tags']=new_df['tags'].apply(stem)

### Now we need to recommend movies based on the closest matching tags, but tags in in textual form
- #### so for this we will convert tags into vectors using ***Bag of Words*** techniuque
- #### and in this we will not consider **Stop Words**

> Stop Words are those words ...which are used in sentence formation but they do not contribute in the meaning of the sentence for example "[and] [are] [if] [to] [from]"

### Creating an object of CountVectorizer 

In [63]:
cv=CountVectorizer(max_features=5000,stop_words='english')

"""
By default this function will return sparse matrix so we are explicitly converting this into numpy array
"""

'\nBy default this function will return sparse matrix so we are explicitly converting this into numpy array\n'

### Transforming the tags into vectors

In [64]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [65]:
cv.get_feature_names_out()[100:110]

array(['adrienbrodi', 'adult', 'adultanim', 'adulteri', 'adulthood',
       'advanc', 'adventur', 'adventure', 'adventures', 'advertis'],
      dtype=object)

### Calculating distances of each movie with all movies

In [66]:
similarity=cosine_similarity(vectors)

#### Here is the similarity score of each movie with all the movies
- and the similarity of a movie with itself will be 1 that is why in diagonal the value will always be 1

In [67]:
pd.DataFrame(similarity)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4796,4797,4798,4799,4800,4801,4802,4803,4804,4805
0,1.000000,0.083462,0.086031,0.073472,0.189299,0.108389,0.040242,0.146735,0.059235,0.096730,...,0.000000,0.000000,0.042239,0.052632,0.000000,0.019252,0.046829,0.044992,0.000000,0.000000
1,0.083462,1.000000,0.060634,0.038837,0.075047,0.114587,0.021272,0.129272,0.062622,0.102262,...,0.000000,0.000000,0.022327,0.027821,0.000000,0.040706,0.000000,0.023783,0.000000,0.026153
2,0.086031,0.060634,1.000000,0.060048,0.077357,0.070868,0.021926,0.133250,0.064550,0.105409,...,0.085749,0.000000,0.000000,0.000000,0.017590,0.041959,0.000000,0.024515,0.000000,0.000000
3,0.073472,0.038837,0.060048,1.000000,0.033032,0.060523,0.056177,0.068279,0.041345,0.202548,...,0.027462,0.027462,0.058964,0.055104,0.022533,0.067188,0.000000,0.031404,0.048526,0.086335
4,0.189299,0.075047,0.077357,0.033032,1.000000,0.097460,0.054277,0.197910,0.079894,0.108721,...,0.035377,0.000000,0.075960,0.023662,0.145141,0.155799,0.000000,0.020228,0.083351,0.044488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0.019252,0.040706,0.041959,0.067188,0.155799,0.079295,0.029440,0.143131,0.130005,0.035383,...,0.000000,0.057567,0.139055,0.057756,0.259796,1.000000,0.000000,0.000000,0.152586,0.126688
4802,0.046829,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.070014,0.000000,0.000000,0.028724,0.000000,1.000000,0.120096,0.000000,0.000000
4803,0.044992,0.023783,0.024515,0.031404,0.020228,0.018531,0.000000,0.000000,0.050637,0.020672,...,0.067267,0.033634,0.018054,0.044992,0.013799,0.000000,0.120096,1.000000,0.039621,0.042295
4804,0.000000,0.000000,0.000000,0.048526,0.083351,0.057270,0.035438,0.043073,0.104328,0.021296,...,0.000000,0.034648,0.092992,0.000000,0.142148,0.152586,0.000000,0.039621,1.000000,0.087142


In [68]:
def recommend(movie):
    
    # index of the input movie given by user
    Index=new_df[new_df['title']==movie].index[0]

    # getting the recommended movie index
    recommend_movie_index=pd.Series(similarity[Index]).sort_values(ascending=False)[1:6].index

    # giving recommendations
    for movies in list(new_df.loc[recommend_movie_index,['movie_id','title']].values):
        print(movies)

In [69]:
recommend("Batman Begins")

[155 'The Dark Knight']
[2661 'Batman']
[268 'Batman']
[49026 'The Dark Knight Rises']
[36597 'Wuthering Heights']


### Pickling the dataset

In [70]:
pickle.dump(similarity,open("similarity.pkl",'wb'))
pickle.dump(new_df,open("movies.pkl",'wb'))