We are going to design a simple **Content-Based Recommender System**, which searches and recommend `similar` items to what the user has searched. The similarity is defined in terms of `Cosine Similarity`.

In [1]:
import pandas as pd

# using inbuilt cosine similarity function
from sklearn.metrics.pairwise import cosine_similarity

# computes count_matrix or Document term frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_columns',20)

In [2]:
# import the data
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [3]:
df.shape

(250, 38)

In [4]:
# we will extract only the columns of our interest
df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [5]:
# discarding the commas between the actors' full names and getting only the first three names
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

In [6]:
df['Actors']

0            [Tim Robbins,  Morgan Freeman,  Bob Gunton]
1               [Marlon Brando,  Al Pacino,  James Caan]
2             [Al Pacino,  Robert Duvall,  Diane Keaton]
3        [Christian Bale,  Heath Ledger,  Aaron Eckhart]
4           [Martin Balsam,  John Fiedler,  Lee J. Cobb]
                             ...                        
245           [Ray Milland,  Jane Wyman,  Phillip Terry]
246    [Brie Larson,  John Gallagher Jr.,  Stephanie ...
247      [Cary Grant,  Rosalind Russell,  Ralph Bellamy]
248    [Sissy Spacek,  Jane Galloway Heitz,  Joseph A...
249           [Dev Patel,  Saurabh Shukla,  Anil Kapoor]
Name: Actors, Length: 250, dtype: object

In [7]:
# putting the genres in a list of word(case insensitive)
df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

In [8]:
df['Genre']

0                 [crime,  drama]
1                 [crime,  drama]
2                 [crime,  drama]
3        [action,  crime,  drama]
4                 [crime,  drama]
                  ...            
245           [drama,  film-noir]
246                       [drama]
247    [comedy,  drama,  romance]
248           [biography,  drama]
249                       [drama]
Name: Genre, Length: 250, dtype: object

In [9]:
# iterating over rows and joining actors and directors
for index, row in df.iterrows():
    print(row['Actors'])
    print(row['Director'])
    break

['Tim Robbins', ' Morgan Freeman', ' Bob Gunton']
Frank Darabont


In [10]:
# iterating over rows and joining actors and directors
for index, row in df.iterrows():
    row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    row['Director'] = row['Director'].lower().replace(' ','')

In [11]:
df['Actors']

0                 [timrobbins, morganfreeman, bobgunton]
1                    [marlonbrando, alpacino, jamescaan]
2                  [alpacino, robertduvall, dianekeaton]
3             [christianbale, heathledger, aaroneckhart]
4                 [martinbalsam, johnfiedler, leej.cobb]
                             ...                        
245                [raymilland, janewyman, phillipterry]
246     [brielarson, johngallagherjr., stephaniebeatriz]
247           [carygrant, rosalindrussell, ralphbellamy]
248    [sissyspacek, janegallowayheitz, josepha.carpe...
249                [devpatel, saurabhshukla, anilkapoor]
Name: Actors, Length: 250, dtype: object

In [12]:
df['Director']

0                  frankdarabont
1             francisfordcoppola
2             francisfordcoppola
3               christophernolan
4                    sidneylumet
                 ...            
245                  billywilder
246          destindanielcretton
247                  howardhawks
248                   davidlynch
249    dannyboyle,loveleentandan
Name: Director, Length: 250, dtype: object

In [13]:
# !pip install rake-nltk
# this library is used for key word extraction

# import nltk
# nltk.download('stopwords')
# used to download stopwords
# nltk.download('punkt')

In [14]:
# Rapid Automatic Keyword Extraction
from rake_nltk import Rake

In [15]:
# extracting keywords from Plot

# initialising a blank column
df['Key_words'] = ''
for index, row in df.iterrows():
    plot = row['Plot']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()
    
    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)
    
    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

In [16]:
# dropping the Plot column because keywords are extracted
df.drop(columns = ['Plot'], inplace = True)

In [17]:
df.set_index('Title',inplace=True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key_words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[two, imprisoned, men, bond, number, years, fi..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[aging, patriarch, organized, crime, dynasty, ..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[early, life, career, vito, corleone, 1920s, n..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[menace, known, joker, emerges, mysterious, pa..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[jury, holdout, attempts, prevent, miscarriage..."


In [18]:
# extracting all words into a bag of words or a document for Count Vectorizer to
# form a count_matrix or document term frequency
df['bag_of_words'] = ''
for index, row in df.iterrows():
    # initialising a string to hold all words
    words = ''
    
    for col in df.columns:
        if col == 'Director':
            words = words + row[col]
        else:
            words = words + ' '.join(row[col]) + ' '
    row['bag_of_words'] = words

In [19]:
# dropping all other columns
df.drop(columns= [col for col in df.columns if col!='bag_of_words'],inplace=True)

In [20]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabonttimrobbins morganfre...
The Godfather,crime drama francisfordcoppolamarlonbrando al...
The Godfather: Part II,crime drama francisfordcoppolaalpacino robert...
The Dark Knight,action crime drama christophernolanchristian...
12 Angry Men,crime drama sidneylumetmartinbalsam johnfiedl...


In [21]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

In [22]:
# creating a Series for the movie titles so they are associated to an ordered numerical
indices = pd.Series(df.index)
indices

0      The Shawshank Redemption
1                 The Godfather
2        The Godfather: Part II
3               The Dark Knight
4                  12 Angry Men
                 ...           
245            The Lost Weekend
246               Short Term 12
247             His Girl Friday
248          The Straight Story
249         Slumdog Millionaire
Name: Title, Length: 250, dtype: object

In [23]:
c=count_matrix.todense()
c

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
count.vocabulary_

{'crime': 577,
 'drama': 762,
 'frankdarabonttimrobbins': 1006,
 'morganfreeman': 1780,
 'bobgunton': 303,
 'two': 2745,
 'imprisoned': 1256,
 'men': 1704,
 'bond': 308,
 'number': 1867,
 'years': 2927,
 'finding': 960,
 'solace': 2458,
 'eventual': 880,
 'redemption': 2151,
 'acts': 59,
 'common': 511,
 'decency': 653,
 'francisfordcoppolamarlonbrando': 999,
 'alpacino': 120,
 'jamescaan': 1342,
 'aging': 81,
 'patriarch': 1944,
 'organized': 1901,
 'dynasty': 787,
 'transfers': 2710,
 'control': 546,
 'clandestine': 469,
 'empire': 836,
 'reluctant': 2170,
 'son': 2469,
 'francisfordcoppolaalpacino': 998,
 'robertduvall': 2243,
 'dianekeaton': 705,
 'early': 789,
 'life': 1548,
 'career': 386,
 'vito': 2831,
 'corleone': 556,
 '1920s': 8,
 'new': 1839,
 'york': 2930,
 'portrayed': 2030,
 'michael': 1721,
 'expands': 899,
 'tightens': 2656,
 'grip': 1121,
 'family': 925,
 'syndicate': 2593,
 'action': 56,
 'christophernolanchristianbale': 454,
 'heathledger': 1167,
 'aaroneckhart': 31

In [25]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.16666667, 0.14433757, ..., 0.05555556, 0.05555556,
        0.05892557],
       [0.16666667, 1.        , 0.28867513, ..., 0.05555556, 0.05555556,
        0.05892557],
       [0.14433757, 0.28867513, 1.        , ..., 0.04811252, 0.04811252,
        0.05103104],
       ...,
       [0.05555556, 0.05555556, 0.04811252, ..., 1.        , 0.05555556,
        0.05892557],
       [0.05555556, 0.05555556, 0.04811252, ..., 0.05555556, 1.        ,
        0.05892557],
       [0.05892557, 0.05892557, 0.05103104, ..., 0.05892557, 0.05892557,
        1.        ]])

In [26]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
        
    return recommended_movies

In [27]:
recommendations('Fargo')

[34, 1, 226, 61, 2, 164, 100, 15, 214, 123]


['The Departed',
 'The Godfather',
 'Rope',
 'Reservoir Dogs',
 'The Godfather: Part II',
 'Touch of Evil',
 'On the Waterfront',
 'Goodfellas',
 'Arsenic and Old Lace',
 'Cool Hand Luke']