<h2>Fetch the data from CSV file, filter and process it to suit the requirements of recommender system. </h2>

In [1]:
import numpy as np
import pandas as pd #for csv file, data processing
import ast

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# merge movies and credits into a single dataframe
movies = movies.merge(credits, on='title')

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [5]:
# check keys of all columns of type objects

# print keys of column object data
def print_column_keys(col_names, df=movies):
    for name in col_names:
        json_data = df.head(1)[name][0]
        parser_data = ast.literal_eval(json_data)
        print(name, "->", parser_data[0].keys())

print_column_keys(col_names=['crew','cast','genres','keywords'])

crew -> dict_keys(['credit_id', 'department', 'gender', 'id', 'job', 'name'])
cast -> dict_keys(['cast_id', 'character', 'credit_id', 'gender', 'id', 'name', 'order'])
genres -> dict_keys(['id', 'name'])
keywords -> dict_keys(['id', 'name'])


In [6]:
# filter columns and keep only required columns for the model
# take movie_id, title, overview, genresm keywords, cast, crew
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [7]:
# function to parse string literal containg list and extract tags
def convert_obj_list(text):
    L = []
    for item in ast.literal_eval(text):
        L.append(item['name'])
    return L

In [8]:
# drop rows containg null values
print(movies.shape)
movies = movies.dropna()
print(movies.shape)

(4809, 7)
(4806, 7)


In [9]:
movies['genres'] = movies['genres'].apply(convert_obj_list)
movies['keywords'] = movies['keywords'].apply(convert_obj_list)
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
# convert function for cast, take top three cast members
def convert_cast_list(text):
    L = []
    counter = 0
    for item in ast.literal_eval(text):
        L.append(item['name'])
        counter+=1
        if(counter == 3):
            break
    return L

In [11]:
movies['cast'] = movies['cast'].apply(convert_cast_list)
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [12]:
# from the crew column fetch only directors
def filter_with_name(text, filter_name, filter_key_name, target_name):
    L = []
    for item in ast.literal_eval(text):
        if item[filter_key_name] == filter_name:
            L.append(item[target_name])
    
    return L

In [13]:
movies['crew'] = movies['crew'].apply(filter_with_name, args=('Director','job', 'name'))
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [14]:
# remove all spaces
def remove_spaces(keys, df=movies):
    for key in keys:
        df[key] = df[key].apply(lambda x: [i.replace(" ","") for i in x])
    
    return df

In [15]:
# convert paragraph to list of words by splitting on spaces
def convert_para_list(keys, df=movies):
    for key in keys:
        df[key] = df[key].apply(lambda x: x.split(" "))
    return df

In [16]:
movies = remove_spaces(['genres','keywords','cast','crew'])
movies = convert_para_list(keys=['overview'])
# convert paragraph to list of words

In [17]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [18]:
new_movies_df = movies[['movie_id', 'title', 'tags']]
new_movies_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [19]:
new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x: " ".join(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x: " ".join(x).lower())


<h2>Perform stemming on the data and remove repeated or similar words</h2>

In [21]:
import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [22]:
# stemming is an action of replacing similar words like acts, acting, act and replace with a simple word
def perform_stem(text):
    L = []
    for item in text.split():
        L.append(ps.stem(item))
    
    return " ".join(L)

In [23]:
new_movies_df['tags'] = new_movies_df['tags'].apply(perform_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['tags'] = new_movies_df['tags'].apply(perform_stem)


<h2>Design the Recommder System using Scikit Learn by Vectorization.</h2>

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Make object of CountVectorization with max_features of 500 and stop words of english
# Stop words are words like 'a', 'and', 'an', 'the',... which are not much significant.
cv = CountVectorizer(stop_words='english', max_features=5000)

In [25]:
# define vectors which contain features in vectorized format
vectors = cv.fit_transform(new_movies_df['tags']).toarray()

<h2>Generate Cosine similarity vector table.</h2>

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity is preferred over euclidean distance over multi-dimentional data or spaces
similarity = cosine_similarity(vectors)

In [27]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

<h2>Recommend Movies.</h2>

In [28]:
# recommend movies, fetch top 5 similar movies
def recommend_movies(movie):
    index = new_movies_df[new_movies_df["title"]==movie].index[0]
    distances = similarity[index]
    rec_movies = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for item in rec_movies:
        print(new_movies_df.iloc[item[0]].title)

In [29]:
recommend_movies('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


<h2>Save dataframes as pickle file for further use with frontend libraries.</h2>

In [30]:
import pickle

In [35]:
pickle.dump(new_movies_df.to_dict(), open("movie_dict.pkl", "wb"))
pickle.dump(similarity, open("similarity.pkl", "wb"))