In [120]:
import numpy as np
import pandas as pd

In [121]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [122]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [123]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Merging two dataframes on the basis of the column "title"

In [125]:
movies=movies.merge(credits,on="title")

In [126]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Now we need to analyse which columns are unimportant
#### Since we're making a "content based recommender system" we need to ask ourselves which columns we need to create the "tags"
1. genres
2. id(for picking movie teasers)
3. keywords
4. title(not the original one, since we may not know the regional language)
5. overview(testing similarity of movies)
6. cast
7. crew
<br>
(Numerical columns have not been included)

In [128]:
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

## Preprocessing
### Target: Merge Overviews, genres, keywords, 'cast' and 'crew' to make a single column called tag.
#### 1. Missing Values

In [130]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [131]:
movies.dropna(inplace=True)

#### 2. Check for Duplicate values

In [133]:
movies.duplicated().sum()

0

#### 3. We need to format the genres column as:
[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
<br>
#### ['Action','Adventure','Fantasy','Science Fiction']

In [135]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

#### Notice here, the entire object is a string. We need to convert it to a proper dictionary format. Hence we use the ast.literal_eval() function

In [137]:
import ast

In [138]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [139]:
movies['genres']=movies['genres'].apply(convert)

In [140]:
movies['keywords']=movies['keywords'].apply(convert)

In [141]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### 4. For the cast column, we need to pick the actual names of first 3 actors

In [143]:
def convert_cast(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [144]:
movies['cast']=movies['cast'].apply(convert_cast)

In [145]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### 5. For the crew column, we need to pick the 'name' of that element whose 'job' field is 'Director'

In [147]:
def fetch_director(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [148]:
movies['crew']=movies['crew'].apply(fetch_director)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


#### 6. Overview is also in 'string' format. We'll convert it into a list for easier processing and comaptibility with other columns

In [150]:
movies['overview']=movies['overview'].apply(lambda x:x.split())
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


#### 7. Next, the thing is, space between two words of a same entity has to be removed. Why?
##### Imagine, two Directors Sam Worthington and Sam Mendes. Suppose a user loves Sam , Worthington movies. If our recommendation system solely segregates on the basis of 'Sam', Sam Mendes movies may get recommended instead of Worthington's. So for easier understanding by our model, we remove the space and hence change it to: 'SanWorthington'

In [152]:
movies['genres']=movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast']=movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew']=movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [153]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [154]:
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [155]:
new_df=movies[['movie_id','title','tags']]

#### 8. Converting the lists to strings

In [157]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [158]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [159]:
new_df['tags'].apply(lambda x: x.lower())

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4804    el mariachi just wants to play his guitar and ...
4805    a newlywed couple's honeymoon is upended by th...
4806    "signed, sealed, delivered" introduces a dedic...
4807    when ambitious new york attorney sam is sent t...
4808    ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

#### Applying stemming here

In [161]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [162]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [163]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


## Training the model

#### Here we perform text vectorisation using library

In [166]:
from sklearn.feature_extraction.text import CountVectorizer

#### Tuning parameters

In [168]:
cv=CountVectorizer(max_features=5000,stop_words='english')
vectors=cv.fit_transform(new_df['tags']).toarray()#returns a sparse matrix

In [169]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

#### Now, one more problem arises
#### The words 'loved','loves','love','loving', are getting counted as 'different' words(features) even theough they all mean the same. To resolve this issue, we use stemming

In [171]:
from sklearn.metrics.pairwise import cosine_similarity

#### similarity matrix stores the distance of each movie with every other movie, hence has a shape of 4806*4806(since we have 4806 movies). Remember, it's a matrix ie, a vector of vectors

In [173]:
similarity=cosine_similarity(vectors)

### Problem: Sometime hence, we're gonna sort the similarity array to understand which movies are closer. But after sorting, we'll lose their indices, ie we cannot trace which movie is pointed to by the current distance in the sorted list. To solve this issue, use 'enumeration'

In [175]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(1214, 0.2867696673382022),
 (2405, 0.26901379342448517),
 (3728, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

#### Notice here, we included the lambda function to sort against the second value, ie the distance, not the enumerated index

##### How to fetch index of a movie given its title?

In [178]:
new_df[new_df['title']=='Batman Begins'].index[0]

119

In [231]:
def recommend(movie):
    movie_index=new_df[new_df['title']==movie].index[0]
    distances=similarity[movie_index]
    movies_list=sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        movie_id=i[0]#get the movie id(needed for displaying movie posters)
        #fetch poster from api
        print(new_df.iloc[i[0]].title)
        

In [180]:
# recommend('Avatar')

In [181]:
import pickle

In [182]:
new_df['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [183]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [184]:
pickle.dump(similarity,open('similarity.pkl','wb'))