# Content based Movie Recommendation System

## Importing Required Libraries and Dataset

In [130]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_columns', 100)
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [131]:
df.shape

(250, 38)

## Taking only tthe columns required for recommendation system

In [132]:
df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [133]:
df.shape


(250, 5)

# Cleaning the data and removing stopwords using NLTK's Rake library

In [134]:
# discarding the commas between the actors' full names and getting only the first three names
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

# putting the genres in a list of words
df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

df['Director'] = df['Director'].map(lambda x: x.split(' '))

# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in df.iterrows():
    row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    row['Director'] = ''.join(row['Director']).lower()

In [135]:
# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['Plot']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)

In [136]:
df.set_index('Title', inplace = True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[two, imprisoned, men, bond, years, common, de..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, clandestine, empire, organi..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[1920s, new, york, expands, family, crime, syn..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[dark, knight, must, accept, one, ability, fig..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[prevent, evidence, miscarriage, colleagues, r..."


# Merging all columns text into 1 on which we will apply vectorizer

In [137]:
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'Director':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
print(df['bag_of_words'])
#df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)

Title
The Shawshank Redemption                             crime  drama frankdarabont timrobbins morganfr...
The Godfather                                        crime  drama francisfordcoppola marlonbrando a...
The Godfather: Part II                               crime  drama francisfordcoppola alpacino rober...
The Dark Knight                                      action  crime  drama christophernolan christia...
12 Angry Men                                         crime  drama sidneylumet martinbalsam johnfied...
Schindler's List                                     biography  drama  history stevenspielberg liam...
The Lord of the Rings: The Return of the King        adventure  drama  fantasy peterjackson noelapp...
Pulp Fiction                                         crime  drama quentintarantino timroth amandapl...
Fight Club                                           drama davidfincher edwardnorton bradpitt meatl...
The Lord of the Rings: The Fellowship of the Ring    adventure  dra

In [138]:
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words,bag_of_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[two, imprisoned, men, bond, years, common, de...",crime drama frankdarabont timrobbins morganfr...
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, clandestine, empire, organi...",crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[1920s, new, york, expands, family, crime, syn...",crime drama francisfordcoppola alpacino rober...
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[dark, knight, must, accept, one, ability, fig...",action crime drama christophernolan christia...
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[prevent, evidence, miscarriage, colleagues, r...",crime drama sidneylumet martinbalsam johnfied...


In [139]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)
indices[:5]

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

# Apppling Cosine Similarity to find the similarity matrix between Genre,Actors,Directors and Plots

In [140]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [145]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(df.index[i]+"  -->"+str(df.Genre[i]))
        
        
    return recommended_movies

# Recommendation based on Action/Adventure movies

In [146]:
recommendations('Star Wars: The Force Awakens')

["Star Wars: Episode V - The Empire Strikes Back  -->['action', ' adventure', ' fantasy']",
 "Star Wars: Episode IV - A New Hope  -->['action', ' adventure', ' fantasy']",
 "Star Wars: Episode VI - Return of the Jedi  -->['action', ' adventure', ' fantasy']",
 "Indiana Jones and the Last Crusade  -->['action', ' adventure', ' fantasy']",
 "Raiders of the Lost Ark  -->['action', ' adventure']",
 "Spider-Man: Homecoming  -->['action', ' adventure', ' sci-fi']",
 "Monty Python and the Holy Grail  -->['adventure', ' comedy', ' fantasy']",
 "Pirates of the Caribbean: The Curse of the Black Pearl  -->['action', ' adventure', ' fantasy']",
 "Harry Potter and the Deathly Hallows: Part 2  -->['adventure', ' drama', ' fantasy']",
 "North by Northwest  -->['action', ' adventure', ' mystery']"]

# Recommendation based on Drama/Biography movies

In [147]:
recommendations('The Straight Story')

["The Elephant Man  -->['biography', ' drama']",
 "Dog Day Afternoon  -->['biography', ' crime', ' drama']",
 "Raging Bull  -->['biography', ' drama', ' sport']",
 "In the Name of the Father  -->['biography', ' drama']",
 "12 Years a Slave  -->['biography', ' drama', ' history']",
 "Papillon  -->['biography', ' crime', ' drama']",
 "Unforgiven  -->['drama', ' western']",
 "Gandhi  -->['biography', ' drama', ' history']",
 "Léon: The Professional  -->['crime', ' drama', ' thriller']",
 "Cool Hand Luke  -->['crime', ' drama']"]

# Recommendation based on Animation movies

In [148]:
recommendations('Aladdin')

["Monsters, Inc.  -->['animation', ' adventure', ' comedy']",
 "Roman Holiday  -->['comedy', ' romance']",
 "Toy Story  -->['animation', ' adventure', ' comedy']",
 "Finding Nemo  -->['animation', ' adventure', ' comedy']",
 "Up  -->['animation', ' adventure', ' comedy']",
 "Toy Story 3  -->['animation', ' adventure', ' comedy']",
 "Zootopia  -->['animation', ' adventure', ' comedy']",
 "Beauty and the Beast  -->['animation', ' family', ' fantasy']",
 "Song of the Sea  -->['animation', ' adventure', ' family']",
 "Inside Out  -->['animation', ' adventure', ' comedy']"]

# Recommendation based on Sci-Fi movies

In [150]:
recommendations('The Avengers')

["Guardians of the Galaxy Vol. 2  -->['action', ' adventure', ' sci-fi']",
 "Aliens  -->['action', ' adventure', ' sci-fi']",
 "Guardians of the Galaxy  -->['action', ' adventure', ' sci-fi']",
 "The Martian  -->['adventure', ' drama', ' sci-fi']",
 "Interstellar  -->['adventure', ' drama', ' sci-fi']",
 "Blade Runner  -->['sci-fi', ' thriller']",
 "Terminator 2: Judgment Day  -->['action', ' sci-fi', ' thriller']",
 "The Thing  -->['horror', ' mystery', ' sci-fi']",
 "The Terminator  -->['action', ' sci-fi']",
 "Spider-Man: Homecoming  -->['action', ' adventure', ' sci-fi']"]