In [1]:

import pandas as pd
pd.set_option('display.max_columns', 20)

In [2]:
import os
dataset_path = os.path.abspath('../Dataset/data/')
rating = pd.read_csv(dataset_path + '/ratings.csv')
movie = pd.read_csv(dataset_path + '/movies.csv')
df_ = movie.merge(rating, how="left", on="movieId")
df = df_.copy()
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,890492500.0


In [3]:
df.shape

(25003471, 6)

In [4]:
# number of unique titles in ratings 
comment_counts = pd.DataFrame(df["title"].value_counts())
comment_counts

Unnamed: 0,title
Forrest Gump (1994),81491
"Shawshank Redemption, The (1994)",81482
Pulp Fiction (1994),79672
"Silence of the Lambs, The (1991)",74127
"Matrix, The (1999)",72674
...,...
The Midnight Story (1957),1
Guilty or Innocent: The Sam Sheppard Murder Case (1975),1
"It's Me, Hilary: The Man Who Drew Eloise (2015)",1
Ride Out for Revenge (1957),1


In [5]:
# movies rarely rated 
rare_movies = comment_counts[comment_counts["title"] <= 1000].index

In [6]:
# exclusion of movies rarely rated 
common_movies = df[~df["title"].isin(rare_movies)]
common_movies.shape
# check number of common movies
common_movies["title"].nunique()

3790

In [7]:
# creating pivot table consisting of so called common movies             
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
#user_movie_df.shape
user_movie_df.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,2.0,...,,4.0,3.5,,,,,3.5,,
4.0,,,,,,,,,,,...,,,4.0,4.5,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,5.0,,,...,,,,,,,,,,
10.0,,,,,,3.0,,,2.0,,...,,,,,,,,,,


In [8]:
# movies are in columns checked
user_movie_df.columns

Index([''burbs, The (1989)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '10 Cloverfield Lane (2016)', '10 Things I Hate About You (1999)',
       '10,000 BC (2008)', '101 Dalmatians (1996)',
       '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
       '102 Dalmatians (2000)',
       ...
       'Zodiac (2007)', 'Zombieland (2009)', 'Zoolander (2001)',
       'Zootopia (2016)', 'Zulu (1964)', '[REC] (2007)', 'eXistenZ (1999)',
       'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)'],
      dtype='object', name='title', length=3790)

In [9]:
# picking up a random user for user based recommendation
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)

In [10]:
#selecting the movies the the random picked user watched 
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120549.0,,,,,,,,,,,...,,,,,,,,,,


In [11]:
#moving them to a list 
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist() 
movies_watched
#verification
#user_movie_df.loc[user_movie_df.index == random_user, user_movie_df.columns == "Ace Ventura: Pet Detective (1994)"]
#len(movies_watched)

['6th Day, The (2000)',
 'About Schmidt (2002)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'Beautiful Mind, A (2001)',
 'Black Hawk Down (2001)',
 'Bourne Identity, The (2002)',
 'Braveheart (1995)',
 'Catch Me If You Can (2002)',
 'Con Air (1997)',
 'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)',
 'E.T. the Extra-Terrestrial (1982)',
 'Enigma (2001)',
 'Erin Brockovich (2000)',
 'Finding Forrester (2000)',
 'Forrest Gump (1994)',
 'Hamlet (2000)',
 'Jumanji (1995)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Memento (2000)',
 'Minority Report (2002)',
 'Monsters, Inc. (2001)',
 'Moulin Rouge (2001)',
 'O Brother, Where Art Thou? (2000)',
 "Ocean's Eleven (2001)",
 'One Hour Photo (2002)',
 'Pitch Black (2000)',
 'Princess Bride, The (1987)',
 'Quills (2000)',
 'Remember the Titans (2000)',
 'Requiem for a Dream (2000)',
 'Reservoir Dogs (1992)',
 'Runaway Bride (1999)',
 'Saint, The (1997)',
 'Save the Last Dance (2001)',
 'Saving Private Ryan (1998

In [12]:
# selecting the movies that random user watched which also includes other users
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df.head()

title,"6th Day, The (2000)",About Schmidt (2002),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)","Beautiful Mind, A (2001)",Black Hawk Down (2001),"Bourne Identity, The (2002)",Braveheart (1995),Catch Me If You Can (2002),Con Air (1997),"Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)",...,Shrek (2001),Snatch (2000),Spider-Man (2002),Star Wars: Episode VI - Return of the Jedi (1983),The Count of Monte Cristo (2002),Thirteen Days (2000),Traffic (2000),We Were Soldiers (2002),What Dreams May Come (1998),White Christmas (1954)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,4.5,,,,,,,,...,,,,,,,,,,
2.0,,,,5.0,2.0,5.0,5.0,,,,...,4.5,,5.0,5.0,,,,,,
3.0,3.5,4.0,,4.0,,5.0,,4.0,,4.0,...,4.0,4.0,4.0,4.0,,,4.0,,,
4.0,,,,,,3.5,,,,,...,,,,3.0,,,,,,
5.0,,,,,,,,,,,...,,,,5.0,,,,,,


In [13]:
# number of movies watched by users to find the similar pattern with random user
user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
#number of movies watched by users
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count.head(10)
# excluding the user who watched less than 20 movies to get similar pattern with random user
# user_movie_count[user_movie_count["movie_count"] > 20].sort_values("movie_count", ascending=False)
# users who watched same amount of movies with random user
# user_movie_count[user_movie_count["movie_count"] == 33].count() # just 17

Unnamed: 0,userId,movie_count
0,1.0,4
1,2.0,16
2,3.0,25
3,4.0,8
4,5.0,4
5,6.0,2
6,7.0,0
7,8.0,5
8,9.0,5
9,10.0,4


In [14]:
# selecting the users who watched more than %60 of movies the the random user watched to get better results
perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
len(users_same_movies)

2812

In [15]:
# creating dataframe consisting of movies watched by random user and other users who watched them
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies.index)],
                      random_user_df[movies_watched]])
final_df

title,"6th Day, The (2000)",About Schmidt (2002),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)","Beautiful Mind, A (2001)",Black Hawk Down (2001),"Bourne Identity, The (2002)",Braveheart (1995),Catch Me If You Can (2002),Con Air (1997),"Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)",...,Shrek (2001),Snatch (2000),Spider-Man (2002),Star Wars: Episode VI - Return of the Jedi (1983),The Count of Monte Cristo (2002),Thirteen Days (2000),Traffic (2000),We Were Soldiers (2002),What Dreams May Come (1998),White Christmas (1954)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170.0,,,,,,,,,,,...,,,,,,,,,,
225.0,,,,,,,,,,,...,,,,,,,,,,
425.0,,,,,,,,,,,...,,4.5,,,,,,,,
430.0,,,,,,,4.0,,,,...,,,,,,,,,,
547.0,,,5.0,,,,,,,,...,,4.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162268.0,,,,4.0,,3.5,,3.5,,3.5,...,,3.5,,,,,,,,
162384.0,,,,3.5,,,,,,,...,,,,,,,,,,
162492.0,,,,,,,,,,,...,,5.0,,4.0,,,,,,
162513.0,,,,3.5,,,4.5,,,4.0,...,,,4.5,4.5,,,,,,


In [16]:
# finding correlations between users
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df

Unnamed: 0,user_id_1,user_id_2,corr
0,12933.0,84655.0,-1.0
1,72357.0,161672.0,-1.0
2,50265.0,9404.0,-1.0
3,42145.0,71810.0,-1.0
4,112585.0,121040.0,-1.0
...,...,...,...
218364,141350.0,97449.0,1.0
218365,35279.0,85699.0,1.0
218366,9810.0,21724.0,1.0
218367,94151.0,17029.0,1.0


In [17]:
# selecting users at least %65 correlated with random user
top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65)][
    ["user_id_2", "corr"]].reset_index(drop=True)
top_users = top_users.sort_values(by='corr', ascending=False)
top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
top_users

Unnamed: 0,userId,corr
20,156477.0,0.967617
19,132648.0,0.925057
18,114331.0,0.904875
17,33682.0,0.885615
16,53610.0,0.850963
15,46447.0,0.799953
14,61106.0,0.798625
13,19099.0,0.781575
12,89897.0,0.762373
11,87702.0,0.730297


In [18]:
# rating scores of similar users with random user
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating
0,156477.0,0.967617,1,4.0
1,156477.0,0.967617,17,3.5
2,156477.0,0.967617,34,4.0
3,156477.0,0.967617,104,3.0
4,156477.0,0.967617,208,4.0


In [19]:
# considering rating and correlation together: weighted average
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
# getting the movie IDs and weighted ratings
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df.head()

Unnamed: 0,movieId,weighted_rating
0,1,3.10437
1,2,2.117014
2,3,2.676578
3,5,2.162059
4,6,2.788043


In [20]:
# 5 movies to recommend (user-based)
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 4].sort_values("weighted_rating", ascending=False)
movies_to_be_recommend = movies_to_be_recommend.merge(movie[["movieId", "title"]])["title"]
movies_to_be_recommend.head(5)

0                                 Incredibles 2 (2018)
1                            Mask of Zorro, The (1998)
2                   Ponyo (Gake no ue no Ponyo) (2008)
3    Kiki's Delivery Service (Majo no takkyûbin) (1...
4                           Pride and Prejudice (1995)
Name: title, dtype: object

In [21]:
# getting the movie ID of recently watched and highly graded movie by random picked user 
movie_id = rating[(rating["userId"] == random_user) & (rating["rating"] ==  5.0)].\
sort_values(by = "timestamp", ascending = False)["movieId"][0:6].values[0]

In [22]:
# 5 movies to recommend (item-based)
movie_name = movie[movie["movieId"]== movie_id]["title"]
movie_name = user_movie_df[movie_name]
movies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
movies_from_item_based[1:6].index

Index([''burbs, The (1989)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '10 Cloverfield Lane (2016)'],
      dtype='object', name='title')

In [23]:
movie


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [24]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [33]:

len(pd.unique(rating['userId']))
rare_movies = comment_counts[comment_counts["title"] <= len(pd.unique(rating['userId']))*0.01].index
# exclusion of movies rarely rated 
common_movies = df[~df["title"].isin(rare_movies)]
# check number of common movies
common_movies["title"].nunique()


          movieId                                     title  \
0               1                          Toy Story (1995)   
1               1                          Toy Story (1995)   
2               1                          Toy Story (1995)   
3               1                          Toy Story (1995)   
4               1                          Toy Story (1995)   
...           ...                                       ...   
24964330   195159  Spider-Man: Into the Spider-Verse (2018)   
24964331   195159  Spider-Man: Into the Spider-Verse (2018)   
24964332   195159  Spider-Man: Into the Spider-Verse (2018)   
24964333   195159  Spider-Man: Into the Spider-Verse (2018)   
24999893   204982                                  9 (2009)   

                                               genres    userId  rating  \
0         Adventure|Animation|Children|Comedy|Fantasy       2.0     3.5   
1         Adventure|Animation|Children|Comedy|Fantasy       3.0     4.0   
2         Adventur

In [34]:
common_movies

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1.141416e+09
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1.439472e+09
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1.573944e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.586259e+08
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,8.904925e+08
...,...,...,...,...,...,...
24964330,195159,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,162307.0,5.0,1.553978e+09
24964331,195159,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,162335.0,5.0,1.569199e+09
24964332,195159,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,162366.0,5.0,1.558242e+09
24964333,195159,Spider-Man: Into the Spider-Verse (2018),Action|Adventure|Animation|Sci-Fi,162516.0,3.5,1.571016e+09
