## **Popularity based recommendation**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie catalogue (movieId, title, genres), report its
# (rows, columns) shape and preview the first rows.
movies_df = pd.read_csv('movies.csv')
print(movies_df.shape)
movies_df.head()

(9125, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the individual ratings (userId, movieId, rating, timestamp), report
# the (rows, columns) shape and preview the first rows.
ratings_df = pd.read_csv('ratings.csv')
print(ratings_df.shape)
ratings_df.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Attach each movie's title and genres to every rating row via the shared movieId key.
df = ratings_df.merge(movies_df, on='movieId')

In [5]:
# Show the merged frame: one row per rating, with the movie's title and genres attached.
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama
...,...,...,...,...,...,...
99999,664,64997,2.5,1343761859,War of the Worlds (2005),Action|Sci-Fi
100000,664,72380,3.5,1344435977,"Box, The (2009)",Drama|Horror|Mystery|Sci-Fi|Thriller
100001,665,129,3.0,995232528,Pie in the Sky (1996),Comedy|Romance
100002,665,4736,1.0,1010197684,Summer Catch (2001),Comedy|Drama|Romance


**Criteria** For Popularity Based Recommendation Systems

The criteria are:
1. Movies with the highest average rating
2. Number of ratings received (used here as a stand-in for the number of views)

In [6]:
# Average rating of every title, shown as a one-column frame named 'Rating'.
df.groupby('title')['rating'].mean().to_frame('Rating')

Unnamed: 0_level_0,Rating
title,Unnamed: 1_level_1
"""Great Performances"" Cats (1998)",1.750000
$9.99 (2008),3.833333
'Hellboy': The Seeds of Creation (2004),2.000000
'Neath the Arizona Skies (1934),0.500000
'Round Midnight (1986),2.250000
...,...
xXx (2002),2.478261
xXx: State of the Union (2005),1.000000
¡Three Amigos! (1986),3.258065
À nous la liberté (Freedom for Us) (1931),4.500000


In [7]:
# Mean rating per title, best-rated titles first.
ratings_means = (
    df.groupby('title')['rating']
    .mean()
    .to_frame('Rating')
    .sort_values(by='Rating', ascending=False)
)
ratings_means

Unnamed: 0_level_0,Rating
title,Unnamed: 1_level_1
Ivan Vasilievich: Back to the Future (Ivan Vasilievich menyaet professiyu) (1973),5.0
Alien Escape (1995),5.0
Boiling Point (1993),5.0
Bone Tomahawk (2015),5.0
Borgman (2013),5.0
...,...
Waterloo Bridge (1940),0.5
Karla (2006),0.5
Ring of Terror (1962),0.5
Road Games (a.k.a. Roadgames) (1981),0.5


In [8]:
# Number of ratings each title received, most-rated titles first.
# NOTE: the column is still called 'Rating' here even though it holds a count.
ratings_counts = (
    df.groupby('title')['rating']
    .count()
    .to_frame('Rating')
    .sort_values(by='Rating', ascending=False)
)
ratings_counts

Unnamed: 0_level_0,Rating
title,Unnamed: 1_level_1
Forrest Gump (1994),341
Pulp Fiction (1994),324
"Shawshank Redemption, The (1994)",311
"Silence of the Lambs, The (1991)",304
Star Wars: Episode IV - A New Hope (1977),291
...,...
Robin Williams: Weapons of Self Destruction (2009),1
"Good German, The (2006)",1
"Good Earth, The (1937)",1
Robot Overlords (2014),1


In [9]:
# Add the per-title rating count next to the mean; pandas aligns the values on
# the shared 'title' index, so the differing row order of the two frames is irrelevant.
ratings_means['Counts'] = ratings_counts['Rating']
ratings_means

Unnamed: 0_level_0,Rating,Counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Ivan Vasilievich: Back to the Future (Ivan Vasilievich menyaet professiyu) (1973),5.0,1
Alien Escape (1995),5.0,1
Boiling Point (1993),5.0,1
Bone Tomahawk (2015),5.0,1
Borgman (2013),5.0,1
...,...,...
Waterloo Bridge (1940),0.5,1
Karla (2006),0.5,1
Ring of Terror (1962),0.5,1
Road Games (a.k.a. Roadgames) (1981),0.5,1


In [10]:
# Same table, ordered by how many ratings each title received (most-rated first).
ratings_means.sort_values(by='Counts', ascending=False)

Unnamed: 0_level_0,Rating,Counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.054252,341
Pulp Fiction (1994),4.256173,324
"Shawshank Redemption, The (1994)",4.487138,311
"Silence of the Lambs, The (1991)",4.138158,304
Star Wars: Episode IV - A New Hope (1977),4.221649,291
...,...,...
Aloha (2015),3.500000,1
All the Right Moves (1983),3.500000,1
Charlie St. Cloud (2010),3.500000,1
Hush (2016),3.500000,1


In [11]:
# Keep two decimals of the mean rating, then list the most-rated titles first.
ratings_means['Rating'] = ratings_means['Rating'].round(2)
ratings_means.sort_values(by='Counts', ascending=False)

Unnamed: 0_level_0,Rating,Counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.05,341
Pulp Fiction (1994),4.26,324
"Shawshank Redemption, The (1994)",4.49,311
"Silence of the Lambs, The (1991)",4.14,304
Star Wars: Episode IV - A New Hope (1977),4.22,291
...,...,...
Aloha (2015),3.50,1
All the Right Moves (1983),3.50,1
Charlie St. Cloud (2010),3.50,1
Hush (2016),3.50,1


In [12]:
# Top ten movies: mean rating above 3 and more than 100 ratings,
# ordered from the highest mean rating down.
is_well_rated = ratings_means['Rating'] > 3
is_widely_rated = ratings_means['Counts'] > 100
selected_movies = (
    ratings_means[is_well_rated & is_widely_rated]
    .sort_values(by='Rating', ascending=False)
    .head(10)
)
selected_movies

Unnamed: 0_level_0,Rating,Counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.49,200
"Shawshank Redemption, The (1994)",4.49,311
"Godfather: Part II, The (1974)",4.39,135
"Usual Suspects, The (1995)",4.37,201
Schindler's List (1993),4.3,244
One Flew Over the Cuckoo's Nest (1975),4.26,144
Fargo (1996),4.26,224
Pulp Fiction (1994),4.26,324
"Dark Knight, The (2008)",4.24,121
Casablanca (1942),4.24,117


## **Content based recommendation**

We want to use our existing movie data to recommend movies that are similar in plot, genre, director, and so on. For example, if we search for the movie Titanic, movies with similar plots, directors, cast, etc. should be recommended to us.

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Load the movie-metadata table (Title, Genre, Director, Actors, Plot, ...) from data.world.
# NOTE: this rebinds `df`, which held the merged ratings frame in the previous section.
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
print(df.shape)
df.head()

(250, 38)


Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [15]:
# Keep only the columns used to describe a movie's content.
feature_columns = ['Title', 'Genre', 'Director', 'Actors', 'Plot']
df_selected = df[feature_columns]
df_selected.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [16]:
# Work on a copy so df_selected keeps the raw text.
# Lower-case the actor names and remove the spaces inside them, so each actor
# becomes a single token ("Tim Robbins" -> "timrobbins"). The value stays one
# comma-separated string per movie; nothing is split here.
# (Presumably so that a later vectoriser treats a full name as one word - confirm downstream.)
# The .str accessor leaves missing values untouched instead of raising.
df_splited = df_selected.copy()
df_splited['Actors'] = (
    df_splited['Actors']
    .str.replace(' ', '', regex=False)
    .str.lower()
)

In [17]:
# Check the result: actor names are lower-case with no spaces inside a name.
df_splited

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"timrobbins,morganfreeman,bobgunton,williamsadler",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"marlonbrando,alpacino,jamescaan,richards.caste...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"alpacino,robertduvall,dianekeaton,robertdeniro",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"christianbale,heathledger,aaroneckhart,michael...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"martinbalsam,johnfiedler,leej.cobb,e.g.marshall",A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
245,The Lost Weekend,"Drama, Film-Noir",Billy Wilder,"raymilland,janewyman,phillipterry,howarddasilva",The desperate life of a chronic alcoholic is f...
246,Short Term 12,Drama,Destin Daniel Cretton,"brielarson,johngallagherjr.,stephaniebeatriz,r...",A 20-something supervising staff member of a r...
247,His Girl Friday,"Comedy, Drama, Romance",Howard Hawks,"carygrant,rosalindrussell,ralphbellamy,geneloc...",A newspaper editor uses every trick in the boo...
248,The Straight Story,"Biography, Drama",David Lynch,"sissyspacek,janegallowayheitz,josepha.carpente...",An old man makes a long journey by lawn-mover ...


In [18]:
# Lower-case each genre string and turn it into a list of genre tokens.
# The raw values are separated by ", " (comma + space), so every piece is
# stripped: splitting on ',' alone leaves a leading space on all but the first
# genre (' drama'), which does not compare equal to 'drama'.
df_splited['Genre'] = df_splited['Genre'].map(
    lambda genres: [genre.strip() for genre in genres.lower().split(',')]
)
df_splited

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]",Frank Darabont,"timrobbins,morganfreeman,bobgunton,williamsadler",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]",Francis Ford Coppola,"marlonbrando,alpacino,jamescaan,richards.caste...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]",Francis Ford Coppola,"alpacino,robertduvall,dianekeaton,robertdeniro",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]",Christopher Nolan,"christianbale,heathledger,aaroneckhart,michael...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]",Sidney Lumet,"martinbalsam,johnfiedler,leej.cobb,e.g.marshall",A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",Billy Wilder,"raymilland,janewyman,phillipterry,howarddasilva",The desperate life of a chronic alcoholic is f...
246,Short Term 12,[drama],Destin Daniel Cretton,"brielarson,johngallagherjr.,stephaniebeatriz,r...",A 20-something supervising staff member of a r...
247,His Girl Friday,"[comedy, drama, romance]",Howard Hawks,"carygrant,rosalindrussell,ralphbellamy,geneloc...",A newspaper editor uses every trick in the boo...
248,The Straight Story,"[biography, drama]",David Lynch,"sissyspacek,janegallowayheitz,josepha.carpente...",An old man makes a long journey by lawn-mover ...


In [19]:
# Collapse each director's name into one lower-case token with no spaces.
df_splited['Director'] = df_splited['Director'].map(
    lambda name: name.lower().replace(' ', '')
)

In [20]:
# Check the result: each director is now a single lower-case token.
df_splited

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"timrobbins,morganfreeman,bobgunton,williamsadler",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]",francisfordcoppola,"marlonbrando,alpacino,jamescaan,richards.caste...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"alpacino,robertduvall,dianekeaton,robertdeniro",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]",christophernolan,"christianbale,heathledger,aaroneckhart,michael...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]",sidneylumet,"martinbalsam,johnfiedler,leej.cobb,e.g.marshall",A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",billywilder,"raymilland,janewyman,phillipterry,howarddasilva",The desperate life of a chronic alcoholic is f...
246,Short Term 12,[drama],destindanielcretton,"brielarson,johngallagherjr.,stephaniebeatriz,r...",A 20-something supervising staff member of a r...
247,His Girl Friday,"[comedy, drama, romance]",howardhawks,"carygrant,rosalindrussell,ralphbellamy,geneloc...",A newspaper editor uses every trick in the boo...
248,The Straight Story,"[biography, drama]",davidlynch,"sissyspacek,janegallowayheitz,josepha.carpente...",An old man makes a long journey by lawn-mover ...


In [21]:
# RAKE key-word extraction (rake_nltk) and the NLTK resources downloaded for it.
import rake_nltk
import nltk
from rake_nltk import Rake
# Fetch the NLTK 'stopwords' and 'punkt' packages; nltk reports them as
# "already up-to-date" when they are present locally.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ErikC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ErikC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
def _extract_plot_keywords(plot):
    """Return the key words RAKE extracts from one plot summary.

    Parameters
    ----------
    plot : str
        Free-text plot description of a single movie.

    Returns
    -------
    list of str
        The words RAKE kept for this plot (scores are discarded).
    """
    # Instantiating Rake: by default it uses the English stopwords from NLTK
    # and discards all punctuation characters. A fresh instance is used per
    # plot, exactly as the original loop did.
    rake = Rake()

    # Extracting the words by passing the text.
    rake.extract_keywords_from_text(plot)

    # get_word_degrees() returns a dictionary of key words and their scores;
    # only the words themselves are kept.
    return list(rake.get_word_degrees().keys())


# Build the new column in one assignment. Writing to `row` inside
# DataFrame.iterrows() is not guaranteed to reach the frame (the row can be a
# copy), so the key words could be silently lost.
df_splited['Key_words'] = df_splited['Plot'].map(_extract_plot_keywords)

In [23]:
# The raw plot text is no longer needed once its key words are extracted;
# remove the column from df_splited in place.
del df_splited['Plot']

In [24]:
# Final feature frame: Title, Genre, Director, Actors and the extracted Key_words.
df_splited

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"timrobbins,morganfreeman,bobgunton,williamsadler","[two, imprisoned, men, bond, number, years, fi..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"marlonbrando,alpacino,jamescaan,richards.caste...","[aging, patriarch, organized, crime, dynasty, ..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"alpacino,robertduvall,dianekeaton,robertdeniro","[early, life, career, vito, corleone, 1920s, n..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"christianbale,heathledger,aaroneckhart,michael...","[menace, known, joker, emerges, mysterious, pa..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"martinbalsam,johnfiedler,leej.cobb,e.g.marshall","[jury, holdout, attempts, prevent, miscarriage..."
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",billywilder,"raymilland,janewyman,phillipterry,howarddasilva","[desperate, life, chronic, alcoholic, followed..."
246,Short Term 12,[drama],destindanielcretton,"brielarson,johngallagherjr.,stephaniebeatriz,r...","[20, something, supervising, staff, member, re..."
247,His Girl Friday,"[comedy, drama, romance]",howardhawks,"carygrant,rosalindrussell,ralphbellamy,geneloc...","[newspaper, editor, uses, every, trick, book, ..."
248,The Straight Story,"[biography, drama]",davidlynch,"sissyspacek,janegallowayheitz,josepha.carpente...","[old, man, makes, long, journey, lawn, mover, ..."
