In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two CSV inputs. The paths are relative, so both files must sit in the
# kernel's working directory.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
# Preview the first rows of the movie table to check its column layout.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table to check its column layout.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [5]:
# (rows, columns) of each frame; printed explicitly so both show up in one cell.
print(movies.shape)
print(ratings.shape)

(10329, 3)
(105339, 4)


In [6]:
# Missing-value count per column of the movie table.
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [7]:
# Missing-value count per column of the ratings table.
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [8]:
# Number of rows in the movie table that repeat an earlier row in every column.
movies.duplicated().sum()

0

In [9]:
# Number of rows in the ratings table that repeat an earlier row in every column.
ratings.duplicated().sum()

0

Popularity-Based Recommender System

In [10]:
# Show the ratings frame (the rendered output is truncated to its first and last rows).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [11]:
# Show the movie frame (the rendered output is truncated to its first and last rows).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [12]:
# Attach each movie's metadata (title, genres) to every rating through the shared
# `movieId` key. merge() defaults to an inner join, so a rating whose movieId is
# absent from `movies` is dropped.
#
# The result is assigned back to `ratings`. Without the drop() below, running this
# cell a second time would merge an already-merged frame and pandas would emit
# suffixed `title_x`/`title_y` (and `genres_x`/`genres_y`) columns, breaking every
# later cell that reads `title`. Removing any previously merged movie columns first
# makes the cell safe to re-run; on a fresh kernel nothing is dropped and the result
# is identical to a plain merge.
ratings = (
    ratings
    .drop(columns=movies.columns.difference(['movieId']), errors='ignore')
    .merge(movies, on='movieId')
)

In [13]:
# Inspect the ratings frame after the merge; it now carries the movie columns too.
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed)
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [14]:
# Force the rating column to a numeric dtype. With errors='coerce', an entry that
# cannot be parsed as a number becomes NaN instead of raising.
numeric_rating = pd.to_numeric(ratings['rating'], errors='coerce')
ratings['rating'] = numeric_rating

In [15]:
# Number of ratings each title received, as a two-column frame (title, num_rating).
# Selecting 'rating' before aggregating avoids counting every other column only to
# discard the results, and the chained rename replaces the in-place mutation.
# Rows are grouped by title, so distinct movieIds that share a title are pooled.
num_rating_df = (
    ratings.groupby('title')['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'num_rating'})
)
num_rating_df

Unnamed: 0,title,num_rating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),1
3,'Til There Was You (1997),3
4,"'burbs, The (1989)",20
...,...,...
10318,loudQUIETloud: A Film About the Pixies (2006),1
10319,xXx (2002),24
10320,xXx: State of the Union (2005),7
10321,¡Three Amigos! (1986),40


In [16]:
# Mean rating per title, as a two-column frame (title, avg_rating).
avg_rating_df = (
    ratings.groupby('title')['rating']
    .mean(numeric_only=True)
    .rename('avg_rating')
    .reset_index()
)
avg_rating_df

Unnamed: 0,title,avg_rating
0,'71 (2014),3.500000
1,'Hellboy': The Seeds of Creation (2004),3.000000
2,'Round Midnight (1986),2.500000
3,'Til There Was You (1997),4.000000
4,"'burbs, The (1989)",3.125000
...,...,...
10318,loudQUIETloud: A Film About the Pixies (2006),4.500000
10319,xXx (2002),2.958333
10320,xXx: State of the Union (2005),2.071429
10321,¡Three Amigos! (1986),3.012500


In [17]:
# Put the per-title rating count and mean rating side by side, matched on title.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='title')
popular_df

Unnamed: 0,title,num_rating,avg_rating
0,'71 (2014),1,3.500000
1,'Hellboy': The Seeds of Creation (2004),1,3.000000
2,'Round Midnight (1986),1,2.500000
3,'Til There Was You (1997),3,4.000000
4,"'burbs, The (1989)",20,3.125000
...,...,...,...
10318,loudQUIETloud: A Film About the Pixies (2006),1,4.500000
10319,xXx (2002),24,2.958333
10320,xXx: State of the Union (2005),7,2.071429
10321,¡Three Amigos! (1986),40,3.012500


In [18]:
# Keep only titles with at least 250 ratings, then order them best-rated first.
has_enough_ratings = popular_df['num_rating'] >= 250
popular_df = popular_df.loc[has_enough_ratings].sort_values(by='avg_rating', ascending=False)

In [19]:
# Inspect the filtered, sorted popularity table.
popular_df 

Unnamed: 0,title,num_rating,avg_rating
8136,"Shawshank Redemption, The (1994)",308,4.454545
5877,"Matrix, The (1999)",261,4.264368
8228,"Silence of the Lambs, The (1991)",290,4.194828
8585,Star Wars: Episode IV - A New Hope (1977),273,4.188645
7323,Pulp Fiction (1994),325,4.16
3349,Forrest Gump (1994),311,4.138264
8991,Terminator 2: Judgment Day (1991),253,3.960474
4934,Jurassic Park (1993),294,3.659864


In [20]:
# Add each popular title's genres and fix the column order of the final table.
#
# `popular_df` is overwritten here, so a `genres` column left behind by an earlier
# run of this cell is dropped first. Without that, re-running the cell would merge
# two `genres` columns into `genres_x`/`genres_y` and the column selection at the
# end would raise KeyError. On a fresh kernel the drop is a no-op.
#
# drop_duplicates keeps a single row per title in case `movies` lists the same
# title more than once.
popular_df = (
    popular_df
    .drop(columns=['genres'], errors='ignore')
    .merge(movies, on='title')
    .drop_duplicates('title')
    [['title', 'genres', 'num_rating', 'avg_rating']]
)

In [21]:
# Inspect the final popularity table (title, genres, num_rating, avg_rating).
popular_df

Unnamed: 0,title,genres,num_rating,avg_rating
0,"Shawshank Redemption, The (1994)",Crime|Drama,308,4.454545
1,"Matrix, The (1999)",Action|Sci-Fi|Thriller,261,4.264368
2,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,290,4.194828
3,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,273,4.188645
4,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,325,4.16
5,Forrest Gump (1994),Comedy|Drama|Romance|War,311,4.138264
6,Terminator 2: Judgment Day (1991),Action|Sci-Fi,253,3.960474
7,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,294,3.659864


Collaborative-Filtering-Based Recommender System

In [22]:
# Show the merged ratings frame, the input to the collaborative-filtering steps.
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed)
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [23]:
# Select the users who have rated more than 200 movies.
# Only the 'rating' column is counted (the original counted every column and then
# kept one), and the boolean mask gets a descriptive name instead of `x`.
ratings_per_user = ratings.groupby('userId')['rating'].count()
is_active_user = ratings_per_user > 200
# Indexing the mask by itself keeps only the True entries; their index is the userIds.
users = is_active_user[is_active_user].index

In [24]:
# Show the ratings frame again; it still holds every user's rows at this point.
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed)
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [25]:
# Restrict the ratings to the rows written by the users listed in `users`.
is_selected_user = ratings['userId'].isin(users)
filtered_ratings = ratings.loc[is_selected_user]

In [26]:
# Inspect the ratings that remain after the user filter.
filtered_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama
7,60,16,4.0,1162941904,Casino (1995),Crime|Drama
8,62,16,4.5,1267576909,Casino (1995),Crime|Drama
9,88,16,3.5,1180879072,Casino (1995),Crime|Drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed)
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [27]:
# Select the titles that received at least 50 ratings from the retained users.
# Only the 'rating' column is counted (the original counted every column and then
# kept one), and the boolean mask gets a descriptive name instead of `y`.
ratings_per_title = filtered_ratings.groupby('title')['rating'].count()
has_min_ratings = ratings_per_title >= 50
# Indexing the mask by itself keeps only the True entries; their index is the titles.
famous_movies = has_min_ratings[has_min_ratings].index

In [28]:
# Inspect the selected titles (an Index named 'title').
famous_movies

Index(['2001: A Space Odyssey (1968)', 'Abyss, The (1989)',
       'Ace Ventura: Pet Detective (1994)', 'Air Force One (1997)',
       'Airplane! (1980)', 'Aladdin (1992)', 'Alien (1979)', 'Aliens (1986)',
       'Amadeus (1984)',
       'Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)',
       ...
       'Unforgiven (1992)', 'Untouchables, The (1987)',
       'Usual Suspects, The (1995)', 'Waterworld (1995)',
       'When Harry Met Sally... (1989)', 'Who Framed Roger Rabbit? (1988)',
       'Willy Wonka & the Chocolate Factory (1971)',
       'Wizard of Oz, The (1939)', 'X-Men (2000)', 'X2: X-Men United (2003)'],
      dtype='object', name='title', length=191)

In [29]:
# Keep only the ratings whose title is in `famous_movies`.
# NOTE: the name `final_tatings` (sic, "ratings") is left unchanged because the
# pivot-table cell reads the frame under exactly this name.
is_famous_title = filtered_ratings['title'].isin(famous_movies)
final_tatings = filtered_ratings.loc[is_famous_title]

In [30]:
# Title-by-user matrix of ratings: one row per title, one column per userId.
# A (title, user) pair with no rating is NaN; pivot_table's default aggregation
# (mean) applies if a pair occurs more than once.
pt = pd.pivot_table(final_tatings, index='title', columns='userId', values='rating')

In [31]:
# Replace missing ratings with 0 so every title row is a complete numeric vector.
pt = pt.fillna(0)

In [32]:
# Inspect the filled title-by-user matrix.
pt

userId,22,24,29,32,38,44,54,60,62,63,...,607,615,622,627,628,650,659,665,666,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,4.0,4.0,2.5,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,3.0
"Abyss, The (1989)",0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.5,0.0,0.0,0.0,3.5,4.0,3.0
Ace Ventura: Pet Detective (1994),0.0,0.0,3.0,0.0,0.0,0.0,2.5,0.0,0.5,3.5,...,3.0,0.0,0.0,3.0,0.0,5.0,0.0,3.0,0.0,0.0
Air Force One (1997),0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,4.0,...,3.0,0.0,0.0,2.5,0.0,5.0,4.0,0.0,0.0,2.0
Airplane! (1980),0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.5,3.5,...,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Who Framed Roger Rabbit? (1988),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.5,...,5.0,4.0,0.0,4.0,3.5,5.0,0.0,0.0,3.0,2.0
Willy Wonka & the Chocolate Factory (1971),0.0,0.0,0.0,4.0,3.5,0.0,0.0,4.0,1.5,0.0,...,3.0,0.0,0.0,0.0,4.0,5.0,4.5,4.0,0.0,0.0
"Wizard of Oz, The (1939)",2.0,0.0,0.0,5.0,3.5,0.0,0.0,0.0,5.0,0.0,...,3.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,5.0
X-Men (2000),1.5,0.0,0.0,0.0,3.5,0.0,3.0,0.0,3.0,3.5,...,5.0,0.0,0.0,4.0,3.0,0.0,3.5,0.0,4.0,3.0


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Pairwise cosine similarity between the rows of `pt` (one row per title), which
# gives a square title-by-title matrix in the same row order as `pt.index`.
similarity_scores = cosine_similarity(pt)

In [35]:
# Sanity check: the matrix should be square, with one row/column per title in `pt`.
similarity_scores.shape

(191, 191)

In [36]:
def recommend(movie_name, top_n=10):
    """Print the titles most similar to ``movie_name``, most similar first.

    Similarity is read from the module-level ``similarity_scores`` matrix, whose
    rows and columns are expected to follow the order of ``pt.index``.

    Parameters
    ----------
    movie_name : str
        A title exactly as it appears in ``pt.index``.
    top_n : int, default 10
        Maximum number of titles to print.

    Returns
    -------
    None
        The titles are printed, one per line.

    Raises
    ------
    IndexError
        If ``movie_name`` is not present in ``pt.index``. The exception type is
        the one the original positional lookup raised; only the message is new.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Row position of the requested title inside the pivot table.
    matches = np.flatnonzero(pt.index == movie_name)
    if matches.size == 0:
        raise IndexError(f"{movie_name!r} is not a title in the pivot table `pt`")
    index = matches[0]

    # Exclude the queried title by position instead of assuming it sorts first.
    # Slicing off the first sorted entry goes wrong whenever another title ties
    # with it at the top score: the sort is stable, so a lower-indexed twin would
    # come first, the queried title would be recommended to itself, and a genuine
    # match would be dropped.
    candidates = (
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    )
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    for position, _score in ranked[:top_n]:
        print(pt.index[position])


In [37]:
# Example call: prints the titles most similar to X-Men (2000).
recommend('X-Men (2000)') 

Gladiator (2000)
Spider-Man (2002)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Matrix, The (1999)
Ghostbusters (a.k.a. Ghost Busters) (1984)
Fight Club (1999)
Terminator 2: Judgment Day (1991)
Fifth Element, The (1997)
Minority Report (2002)
Men in Black (a.k.a. MIB) (1997)


In [38]:
import pickle

In [39]:
# Persist the popularity table. The `with` block closes the file as soon as the
# dump finishes; a bare open() passed straight to pickle.dump is never closed
# explicitly, which leaks the handle and can leave the write unflushed.
with open('popular.pkl', 'wb') as popular_file:
    pickle.dump(popular_df, popular_file)

In [40]:
# Persist the title-by-user matrix. The `with` block closes the file as soon as
# the dump finishes; a bare open() passed straight to pickle.dump is never closed
# explicitly, which leaks the handle and can leave the write unflushed.
with open('pt.pkl', 'wb') as pt_file:
    pickle.dump(pt, pt_file)

In [41]:
# Persist the movie table. The `with` block closes the file as soon as the dump
# finishes; a bare open() passed straight to pickle.dump is never closed
# explicitly, which leaks the handle and can leave the write unflushed.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)

In [42]:
# Persist the similarity matrix. The `with` block closes the file as soon as the
# dump finishes; a bare open() passed straight to pickle.dump is never closed
# explicitly, which leaks the handle and can leave the write unflushed.
with open('similarity_scores.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_scores, similarity_file)