# Movie Recommendation System

A personal project that recommends movie titles based on the cosine similarity between movies in the given IMDB dataset. It follows the article https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243
but my approach differs in data cleaning, data preprocessing and feature selection.


In [1]:
#import libraries

import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#loading the IMDB Dataset

# Location of the CSV file read below (served from query.data.world)
imdb_csv_url = 'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7'
df = pd.read_csv(imdb_csv_url)

#exploring what the data looks like
df.head()



Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


## Data Preprocessing and Cleaning

In [3]:
#removing unnamed columns
# Boolean mask over the column labels: True where the label begins with "Unnamed"
unnamed_mask = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~unnamed_mask]
df.head()


Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [4]:
# A lot of the columns are irrelevant for movie recommendations and can be removed

# Columns kept for the rest of the notebook; Title stays first because
# later cells treat every column after the first as a keyword feature.
relevant_columns = [
    'Title', 'Year', 'Rated', 'Runtime',
    'Genre', 'Director', 'Writer', 'Actors',
    'Plot', 'Production',
]
df = df.loc[:, relevant_columns]
df.head()


Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Production
0,The Shawshank Redemption,1994,R,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,Columbia Pictures
1,The Godfather,1972,R,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,Paramount Pictures
2,The Godfather: Part II,1974,R,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,Paramount Pictures
3,The Dark Knight,2008,PG-13,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,Warner Bros. Pictures/Legendary
4,12 Angry Men,1957,APPROVED,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,Criterion Collection


In [5]:
# Checking if the dataframe has any rows
# with null values and then removing those
# rows from the dataframe

# Per-column count of missing values before cleaning
null_counts_before = df.isna().sum()
print(null_counts_before)

# Drop every row that has at least one missing value
df = df.dropna()

print("After removing the null values")
null_counts_after = df.isna().sum()
print(null_counts_after)

Title         0
Year          0
Rated         0
Runtime       0
Genre         0
Director      0
Writer        1
Actors        0
Plot          0
Production    0
dtype: int64
After removing the null values
Title         0
Year          0
Rated         0
Runtime       0
Genre         0
Director      0
Writer        0
Actors        0
Plot          0
Production    0
dtype: int64


In [6]:
#Converting all columns except the title to lowercase

def _lower_if_str(value):
    """Return ``value`` lower-cased when it is a string, otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value

# Map each non-Title column element-wise. This replaces the deprecated
# DataFrame.applymap and leaves Title untouched, so it does not have to be
# saved and restored afterwards.
df = df.assign(**{
    column: df[column].map(_lower_if_str)
    for column in df.columns
    if column != 'Title'
})
df.head()


Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Production
0,The Shawshank Redemption,1994,r,142 min,"crime, drama",frank darabont,"stephen king (short story ""rita hayworth and s...","tim robbins, morgan freeman, bob gunton, willi...",two imprisoned men bond over a number of years...,columbia pictures
1,The Godfather,1972,r,175 min,"crime, drama",francis ford coppola,"mario puzo (screenplay), francis ford coppola ...","marlon brando, al pacino, james caan, richard ...",the aging patriarch of an organized crime dyna...,paramount pictures
2,The Godfather: Part II,1974,r,202 min,"crime, drama",francis ford coppola,"francis ford coppola (screenplay), mario puzo ...","al pacino, robert duvall, diane keaton, robert...",the early life and career of vito corleone in ...,paramount pictures
3,The Dark Knight,2008,pg-13,152 min,"action, crime, drama",christopher nolan,"jonathan nolan (screenplay), christopher nolan...","christian bale, heath ledger, aaron eckhart, m...",when the menace known as the joker emerges fro...,warner bros. pictures/legendary
4,12 Angry Men,1957,approved,96 min,"crime, drama",sidney lumet,"reginald rose (story), reginald rose (screenplay)","martin balsam, john fiedler, lee j. cobb, e.g....",a jury holdout attempts to prevent a miscarria...,criterion collection


In [8]:
#Converting all features into keywords that will be used to calculate cosine similarity later on
# All literal replacements pass regex=False and the punctuation pattern passes
# regex=True explicitly: the default of Series.str.replace differs between
# pandas versions, and relying on it silently skips the punctuation removal
# when the pattern is treated as a literal string.

#Genre
#replacing comma with a space
df['Genre'] = df['Genre'].str.replace(',', ' ', regex=False)

#Director
#Combining the first and last names together
df['Director'] = df['Director'].str.replace(' ', '', regex=False)

#Production
#Combining the name into one (spaces and dots removed)
df['Production'] = (
    df['Production']
    .str.replace(' ', '', regex=False)
    .str.replace('.', '', regex=False)
)

#Actors
#Selecting the first two(lead) actors; each cell becomes a list of up to two names
df['Actors'] = (
    df['Actors']
    .str.replace(' ', '', regex=False)
    .str.split(',')
    .str[:2]
)

#Writer
#Keeping only the text before the first "(" as a one-element list
df['Writer'] = (
    df['Writer']
    .str.replace(' ', '', regex=False)
    .str.split('(')
    .str[:1]
)

#Runtime
#Keeping only the text before the first space as a one-element list
df['Runtime'] = df['Runtime'].str.split(' ').str[:1]

#Plot
#Remove punctuations, stop words
stop_words = set(stopwords.words('english'))
# NOTE(review): the stemmer is instantiated but not applied anywhere in this cell.
porter = PorterStemmer()
df['Plot'] = df['Plot'].apply(
    lambda plot: ' '.join(
        word for word in plot.split() if word not in stop_words
    )
)
# Strip every character that is neither a word character nor whitespace.
df['Plot'] = df['Plot'].str.replace(r'[^\w\s]', '', regex=True)




In [9]:
# Preview the first rows of df to check the result of the keyword preprocessing
df.head()

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Production
0,The Shawshank Redemption,1994,r,[142],crime drama,frankdarabont,[stephenking],"[timrobbins, morganfreeman]",two imprisoned men bond number years finding s...,columbiapictures
1,The Godfather,1972,r,[175],crime drama,francisfordcoppola,[mariopuzo],"[marlonbrando, alpacino]",aging patriarch organized crime dynasty transf...,paramountpictures
2,The Godfather: Part II,1974,r,[202],crime drama,francisfordcoppola,[francisfordcoppola],"[alpacino, robertduvall]",early life career vito corleone 1920s new york...,paramountpictures
3,The Dark Knight,2008,pg-13,[152],action crime drama,christophernolan,[jonathannolan],"[christianbale, heathledger]",menace known joker emerges mysterious past wre...,warnerbrospictures/legendary
4,12 Angry Men,1957,approved,[96],crime drama,sidneylumet,[reginaldrose],"[martinbalsam, johnfiedler]",jury holdout attempts prevent miscarriage just...,criterioncollection


In [10]:
#merging all the keywords together into one

# Every column after the first (Title) contributes to the keyword string.
feature_columns = df.columns[1:]
keywords = df[feature_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)),
    axis=1,
)

# List-valued cells are stringified like "['a', 'b']"; remove the list syntax.
# regex=False makes these literal replacements independent of the
# version-specific default of Series.str.replace.
for list_char in ('[', ']', "'"):
    keywords = keywords.str.replace(list_char, '', regex=False)

# One row per movie, indexed by title, with a single Keywords column.
data = pd.DataFrame(
    {'Keywords': keywords, 'Title': df['Title']}
).set_index('Title')

data.head()

Unnamed: 0_level_0,Keywords
Title,Unnamed: 1_level_1
The Shawshank Redemption,1994 r 142 crime drama frankdarabont stephenk...
The Godfather,1972 r 175 crime drama francisfordcoppola mar...
The Godfather: Part II,1974 r 202 crime drama francisfordcoppola fra...
The Dark Knight,2008 pg-13 152 action crime drama christophe...
12 Angry Men,1957 approved 96 crime drama sidneylumet regi...


## Recommendation System

In [11]:
#Cosine similarity
# Turn each keyword string into a token-count vector, then compare every
# pair of rows of the resulting matrix.
keyword_vectorizer = CountVectorizer()
count_vector_matrix = keyword_vectorizer.fit_transform(data['Keywords'])
cos_similarity = cosine_similarity(
    count_vector_matrix,
    count_vector_matrix,
)
print('Cosine Similarity Matrix')
print(cos_similarity)

Cosine Similarity Matrix
[[1.         0.13636364 0.11677484 ... 0.09090909 0.04652421 0.04652421]
 [0.13636364 1.         0.38924947 ... 0.04545455 0.04652421 0.04652421]
 [0.11677484 0.38924947 1.         ... 0.03892495 0.03984095 0.03984095]
 ...
 [0.09090909 0.04545455 0.03892495 ... 1.         0.04652421 0.04652421]
 [0.04652421 0.04652421 0.03984095 ... 0.04652421 1.         0.04761905]
 [0.04652421 0.04652421 0.03984095 ... 0.04652421 0.04761905 1.        ]]


In [12]:
#returns the n(input) most similar movies to the title(input) based on cosine_similarity
def recommend_similar(title, n, similarity=None, titles=None):
    """Return the ``n`` titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for. If it occurs more
        than once, the first occurrence is used.
    n : int
        Number of recommendations wanted. Values <= 0 yield an empty list.
    similarity : 2-D array-like, optional
        Square similarity matrix whose row/column order matches ``titles``.
        Defaults to the notebook-level ``cos_similarity``.
    titles : sequence of str, optional
        Movie titles in matrix order. Defaults to ``data.index``.

    Returns
    -------
    list
        Up to ``n`` titles, ordered from most to least similar. The queried
        movie itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` is not among ``titles``.
    """
    if similarity is None:
        similarity = cos_similarity
    if titles is None:
        titles = data.index
    titles = pd.Index(titles)

    # Positional row of the queried movie (first match wins).
    matches = np.flatnonzero(titles == title)
    if len(matches) == 0:
        raise IndexError(f"title {title!r} not found in the dataset")
    row = matches[0]

    if n <= 0:
        return []

    scores = pd.Series(np.asarray(similarity[row]))
    # Remove the queried movie explicitly instead of assuming it sorts first:
    # another movie with an equally high score could otherwise take its place
    # and the query itself would be returned as a recommendation.
    scores = scores.drop(row)
    # Stable sort keeps tied movies in dataset order, so results are deterministic.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    return [titles[position] for position in ranked.index[:n]]
    
    

In [13]:
#testing the recommendation function
# (title, number of recommendations) pairs to try out
demo_queries = [
    ('The Shawshank Redemption', 4),
    ('The Dark Knight', 2),
]
for query_title, how_many in demo_queries:
    print(recommend_similar(query_title, how_many))

['Pulp Fiction', 'Rope', 'Goodfellas', 'Se7en']
['The Dark Knight Rises', 'Batman Begins']
