Objective:
1. Create a popularity-based recommender system at a genre level. The user will input a 
genre (g), a minimum reviews threshold (t) for a movie, and the number of
recommendations (N). The system should recommend the top N movies that are 
most popular within that genre (g), ordered by rating in descending order, where each 
movie has at least (t) reviews.
Example:
Input: 
• Genre (g) : Comedy
• Minimum reviews threshold (t): 100
• Num recommendations (N) : 5

In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('white')

In [55]:
# Load movies.csv and show its (rows, columns) shape.
Data = pd.read_csv(filepath_or_buffer="movies.csv")
Data.shape


(10329, 3)

In [56]:
# Preview the first five rows of the movies table.
Data.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [57]:
# Count missing values per column. The previous `Data.isnull().count` only
# referenced the method without calling it, and `count` would have tallied
# rows rather than nulls; summing the boolean mask gives the null count.
# Printed explicitly because a cell only auto-displays its last expression.
print(Data.isnull().sum())

# Summary statistics for the numeric columns.
Data.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [58]:
# Load ratings.csv and display it (pandas truncates the rendered table).
rating = pd.read_csv(filepath_or_buffer="ratings.csv")
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) for the ratings table.
rating.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


Q.1)
•Genre (g) : Comedy
• Minimum reviews threshold (t): 100
• Num recommendations (N) : 5

In [60]:
# Attach every rating to its movie's metadata. The join key is spelled out so
# the merge cannot silently switch to a different (or composite) key if either
# CSV ever gains another shared column name.
df = Data.merge(rating, on="movieId", how="inner")
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286
...,...,...,...,...,...,...
105334,148238,A Very Murray Christmas (2015),Comedy,475,3.0,1451213043
105335,148626,The Big Short (2015),Drama,458,4.0,1452014749
105336,148626,The Big Short (2015),Drama,576,4.5,1451687664
105337,148626,The Big Short (2015),Drama,668,4.5,1451148148


In [61]:
# Mean rating per title, highest first. No minimum-vote filter is applied here.
(
    df.groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head(n=5)
)


title
Saddest Music in the World, The (2003)    5.0
Interstate 60 (2002)                      5.0
Gunfighter, The (1950)                    5.0
Heima (2007)                              5.0
Limelight (1952)                          5.0
Name: rating, dtype: float64

In [62]:
# Per-title summary: mean rating (sorted high to low) plus how many ratings
# each title received. The count column is aligned to the frame by title index.
ratings = (
    df.groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
ratings['num of ratings'] = df.groupby('title')['rating'].count()
ratings.head(n=5)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Saddest Music in the World, The (2003)",5.0,1
Interstate 60 (2002),5.0,2
"Gunfighter, The (1950)",5.0,1
Heima (2007),5.0,1
Limelight (1952),5.0,1


2.) Create a content-based recommender system that recommends top N movies based on 
similar movie(m) genres.


In [63]:
# Re-inspect the merged movie/rating frame before pivoting it.
df.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [64]:
# User x title matrix of ratings; cells stay NaN where a user has not rated a title.
moviemat = pd.pivot_table(
    df,
    values='rating',
    index='userId',
    columns='title',
)
moviemat.head(n=5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,



• Movie Title (t): Toy Story
• Num recommendations (N): 5


In [65]:
# Column of per-user ratings for Toy Story, taken from the user x title matrix.
Toy_story_rating = moviemat.loc[:, 'Toy Story (1995)']
Toy_story_rating.head(n=5)


userId
1    NaN
2    5.0
3    NaN
4    NaN
5    4.0
Name: Toy Story (1995), dtype: float64

In [66]:
# Treat "not rated" as 0, then list the five highest Toy Story ratings by user.
(
    Toy_story_rating
    .fillna(value=0)
    .sort_values(ascending=False)
    .head(n=5)
)

userId
335    5.0
575    5.0
116    5.0
109    5.0
108    5.0
Name: Toy Story (1995), dtype: float64

In [117]:
# Read movies.csv again to look up titles alongside their movieId values.
movie_titles = pd.read_csv(filepath_or_buffer='movies.csv')
movie_titles.head(n=10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


# We have Toy Story with ID 1

In [118]:
# Order the per-title summary by title (the index level name), descending.
ratings.sort_values(by='title', ascending=False).head(n=5)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
À nous la liberté (Freedom for Us) (1931),3.0,1
¡Three Amigos! (1986),3.0125,40
xXx: State of the Union (2005),2.071429,7
xXx (2002),2.958333,24
loudQUIETloud: A Film About the Pixies (2006),4.5,1


3) Create a collaborative-filtering recommender system that recommends the top N movies 
based on the “K” most similar users for a target user “u”.
Example:
Input:
• UserID: 1
• Num recommendations(N): 5
• Threshold for similar users (k): 100

In [119]:
# Preview the merged frame before narrowing it down.
df.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [120]:
# Keep only the user/title pairs. NOTE: this rebinds `df`, so the rating,
# genre and timestamp columns are no longer reachable through `df` afterwards.
df = df.loc[:, ['userId', 'title']]
df

Unnamed: 0,userId,title
0,2,Toy Story (1995)
1,5,Toy Story (1995)
2,8,Toy Story (1995)
3,11,Toy Story (1995)
4,14,Toy Story (1995)
...,...,...
105334,475,A Very Murray Christmas (2015)
105335,458,The Big Short (2015)
105336,576,The Big Short (2015)
105337,668,The Big Short (2015)


In [121]:
# Number of ratings per title, most-rated first.
(
    df.groupby('title')['title']
    .count()
    .sort_values(ascending=False)
    .head(n=5)
)

title
Pulp Fiction (1994)                 325
Forrest Gump (1994)                 311
Shawshank Redemption, The (1994)    308
Jurassic Park (1993)                294
Silence of the Lambs, The (1991)    290
Name: title, dtype: int64

In [122]:
# Show the user/title pairs ordered by title, descending.
df.sort_values(
    by="title",
    ascending=False,
)

Unnamed: 0,userId,title
77420,668,À nous la liberté (Freedom for Us) (1931)
52504,242,¡Three Amigos! (1986)
52503,235,¡Three Amigos! (1986)
52522,544,¡Three Amigos! (1986)
52523,560,¡Three Amigos! (1986)
...,...,...
21386,160,'Til There Was You (1997)
21385,118,'Til There Was You (1997)
87533,668,'Round Midnight (1986)
102934,251,'Hellboy': The Seeds of Creation (2004)


try

In [123]:
# Fresh copies of both source tables: df1 holds movies.csv, df2 holds ratings.csv.
df1 = pd.read_csv(filepath_or_buffer='movies.csv')
df2 = pd.read_csv(filepath_or_buffer='ratings.csv')


In [124]:
# Join each rating (df2) to its movie's title and genres (df1). The key is
# explicit so the merge cannot silently pick up a different or composite key
# if either CSV ever gains another shared column name.
data = df2.merge(df1, on='movieId', how='inner')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama


In [125]:
# Rebuild the user x title rating matrix from the freshly merged `data` frame;
# cells stay NaN where a user has not rated a title.
moviemat = pd.pivot_table(
    data,
    values='rating',
    index='userId',
    columns='title',
)
moviemat.head(n=5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [126]:
# Highest mean ratings first. The top rows are titles with very few ratings.
ratings.sort_values(by='rating', ascending=False).head(n=5)

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Saddest Music in the World, The (2003)",5.0,1
"Nine Lives of Fritz the Cat, The (1974)",5.0,1
Operation 'Y' & Other Shurik's Adventures (1965),5.0,1
Palo Alto (2013),5.0,1
"Traviata, La (1982)",5.0,1


In [128]:
# Per-user rating columns for the two reference movies, taken from the
# user x title matrix (NaN where the user has not rated the movie).
toystory_user_ratings = moviemat['Toy Story (1995)']
Sabrina_user_ratings = moviemat['Sabrina (1995)']

# Only a cell's last expression is auto-displayed, so the first preview was
# previously computed and silently discarded; print it explicitly.
print(toystory_user_ratings.head())
Sabrina_user_ratings.head()

userId
1    NaN
2    NaN
3    3.0
4    NaN
5    NaN
Name: Sabrina (1995), dtype: float64

In [134]:
# Preview Sabrina's ratings with "not rated" shown as 0. The result is only
# displayed; Sabrina_user_ratings itself keeps its NaN values.
Sabrina_user_ratings.fillna(value=0)

userId
1      0.0
2      0.0
3      3.0
4      0.0
5      0.0
      ... 
664    0.0
665    0.0
666    0.0
667    0.0
668    3.0
Name: Sabrina (1995), Length: 668, dtype: float64

In [None]:
# Correlate every movie's rating column with the reference movie's ratings.
# `corrwith` needs the Series objects themselves; the previous code passed the
# variable names as string literals, which cannot be correlated against.
similar_to_toystory = moviemat.corrwith(toystory_user_ratings)
similar_to_sabrina = moviemat.corrwith(Sabrina_user_ratings)


In [None]:
# One-column frame of correlations with Toy Story, dropping titles whose
# correlation could not be computed (NaN).
corr_toystory = similar_to_toystory.to_frame(name='Correlation').dropna()
corr_toystory.head(n=5)

In [None]:
# Add each title's rating count next to its correlation (joined on the title index).
# NOTE: re-running this cell without rebuilding corr_toystory would try to add
# the same column a second time.
corr_toystory = corr_toystory.join(ratings.loc[:, 'num of ratings'])
corr_toystory.head(n=5)

In [None]:
# Titles most correlated with Toy Story, before any minimum-ratings filter.
corr_toystory.sort_values(by='Correlation', ascending=False).head(n=7)

In [None]:
# Keep only titles with more than 100 ratings so that correlations based on a
# handful of users do not dominate. The count column added by the join is
# named 'num of ratings'; the previous 'ratings' key does not exist in this frame.
corr_toystory[corr_toystory['num of ratings'] > 100].sort_values('Correlation', ascending=False).head()

In [None]:
# Movies similar to Sabrina: build the correlation frame, drop titles with no
# computable correlation, and attach each title's rating count.
corr_sabrina = (
    similar_to_sabrina
    .to_frame(name='Correlation')
    .dropna()
    .join(ratings['num of ratings'])
)

# Filter on the joined 'num of ratings' column (the previous 'ratings' key does
# not exist in this frame) so only titles with more than 100 ratings are ranked.
corr_sabrina[corr_sabrina['num of ratings'] > 100].sort_values('Correlation', ascending=False).head()