In [1]:
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPU is available! Devices: {gpus}")
else:
    print("GPU is not available. Running on CPU.")

GPU is available! Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import numpy as np
import pandas as pd


In [3]:
movie = pd.read_csv("movies.csv")
rating = pd.read_csv("ratings.csv")

df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,1225735000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,835816000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.0,974518000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,3.0,1430666000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,12.0,5.0,862500700.0


In [4]:
df.shape

(33835460, 6)

In [5]:
df["title"].nunique()

86330

In [6]:
df["title"].value_counts().head()

title
Shawshank Redemption, The (1994)    122296
Forrest Gump (1994)                 113581
Pulp Fiction (1994)                 108756
Matrix, The (1999)                  107056
Silence of the Lambs, The (1991)    101802
Name: count, dtype: int64

In [8]:
comment_counts = pd.DataFrame(df["title"].value_counts())
rare_movies = comment_counts[comment_counts["count"] <= 1000].index

common_movies = df[~df["title"].isin(rare_movies)]
common_movies.shape

(30351956, 6)

In [9]:
common_movies["title"].nunique()

4461

In [10]:
# Let's create the User Movie Df:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

# There are 4461 movies that 330136 users have voted for. 
user_movie_df.shape

(330136, 4461)

In [11]:
user_movie_df.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zombieland: Double Tap (2019),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,3.5,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,5.0,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# item-based movie recommendation example:
movie_name = "Matrix, The (1999)"
movie_name = user_movie_df[movie_name]
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head(10)

title
Matrix, The (1999)                                       1.000000
Matrix Reloaded, The (2003)                              0.537265
Matrix Revolutions, The (2003)                           0.477337
Terminator 2: Judgment Day (1991)                        0.356952
Minority Report (2002)                                   0.355914
Inception (2010)                                         0.345382
Animatrix, The (2003)                                    0.345160
Blade (1998)                                             0.333631
Fight Club (1999)                                        0.330083
Lord of the Rings: The Return of the King, The (2003)    0.328174
dtype: float64

In [16]:
# Let's determine the movies that the user watched.

# Let's choose random user:
# random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)
random_user = 30101


In [17]:
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zombieland: Double Tap (2019),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30101.0,,,,,,3.0,,,,,...,,,,,,,,,,


In [18]:
# Let's choose non-NaN. Movies watched by all 30101:
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
movies_watched

['10 Things I Hate About You (1999)',
 'Ace Ventura: Pet Detective (1994)',
 'Ace Ventura: When Nature Calls (1995)',
 'American Pie (1999)',
 'Apollo 13 (1995)',
 'Austin Powers: International Man of Mystery (1997)',
 'Austin Powers: The Spy Who Shagged Me (1999)',
 'Back to the Future Part II (1989)',
 'Back to the Future Part III (1990)',
 'Batman (1989)',
 'Bean (1997)',
 'Beauty and the Beast (1991)',
 "Bill & Ted's Bogus Journey (1991)",
 "Bill & Ted's Excellent Adventure (1989)",
 'Blair Witch Project, The (1999)',
 'Bloodsport (1988)',
 'Blow (2001)',
 'Blues Brothers, The (1980)',
 'Breakfast Club, The (1985)',
 'Buffy the Vampire Slayer (1992)',
 'Cable Guy, The (1996)',
 'Chicken Run (2000)',
 'Coming to America (1988)',
 'Copycat (1995)',
 'Crow, The (1994)',
 'Deuce Bigalow: Male Gigolo (1999)',
 'Die Hard (1988)',
 'Dodgeball: A True Underdog Story (2004)',
 'Dragon: The Bruce Lee Story (1993)',
 'Eddie Murphy Raw (1987)',
 'Edward Scissorhands (1990)',
 'Face/Off (1997)'

In [20]:
#let's verify:
user_movie_df.loc[user_movie_df.index == random_user, user_movie_df.columns == "X-Men (2000)"]
# gave this movie a 3.0 rating.

title,X-Men (2000)
userId,Unnamed: 1_level_1
30101.0,3.5


In [21]:
len(movies_watched)

99

In [22]:
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df.head()
movies_watched_df.shape

(330136, 99)

In [23]:
# information on how many movies each user watched in total:
user_movie_count = movies_watched_df.T.notnull().sum()

user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId","movie_count"]
user_movie_count.head()

Unnamed: 0,userId,movie_count
0,1.0,3
1,2.0,13
2,3.0,3
3,4.0,3
4,5.0,3


In [25]:
# 1 user has watched 99 movies
user_movie_count[user_movie_count["movie_count"] == 99].count()

userId         1
movie_count    1
dtype: int64

In [28]:
# 60% of movies watched by 30101
perc = len(movies_watched) * 60 / 100
perc

59.4

In [29]:
# People who have watched more than 60% movies together with 30101 users:
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies.count()

2318

In [30]:
# Let's combine the data of user #30101 and similar users:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      random_user_df[movies_watched]])

final_df.shape
final_df.T.corr()

userId,198.0,227.0,487.0,867.0,897.0,974.0,1205.0,1677.0,1838.0,2082.0,...,329845.0,329908.0,329975.0,330181.0,330196.0,330371.0,330507.0,330517.0,330771.0,30101.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
198.0,1.000000,0.015741,0.388848,0.338642,0.512957,0.349906,0.299904,0.294651,0.321619,0.254491,...,0.224644,0.520598,0.237854,0.444396,0.428105,0.379121,0.222259,0.390559,0.396236,0.301604
227.0,0.015741,1.000000,0.404218,0.247099,0.209317,0.149274,0.104254,-0.078207,0.158074,0.213231,...,-0.179027,0.406663,0.222760,0.151822,-0.049356,0.394625,0.520084,0.107510,0.236658,0.412785
487.0,0.388848,0.404218,1.000000,0.114393,0.424141,0.434370,0.209315,0.385988,0.380359,0.491788,...,0.263260,0.277526,0.413898,0.347666,0.178434,0.362420,0.416459,0.251255,0.386018,0.340163
867.0,0.338642,0.247099,0.114393,1.000000,0.273384,0.276249,0.489496,0.211199,0.197404,0.034342,...,0.208007,0.291730,0.162723,0.300126,0.223589,0.238627,0.379317,0.097654,0.262040,0.461783
897.0,0.512957,0.209317,0.424141,0.273384,1.000000,0.378009,0.195790,0.391058,0.412957,0.622816,...,0.310504,0.422781,0.429758,0.396272,0.298144,0.373584,0.093136,0.128992,0.395192,0.152262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330371.0,0.379121,0.394625,0.362420,0.238627,0.373584,0.113426,-0.083687,0.265319,0.253246,0.332183,...,-0.162318,0.230054,0.232816,0.223687,0.181180,1.000000,0.117427,0.104471,0.431374,0.042770
330507.0,0.222259,0.520084,0.416459,0.379317,0.093136,0.329150,0.472550,0.300720,0.232115,0.287677,...,0.027938,0.421646,0.579890,0.254761,0.125370,0.117427,1.000000,0.294650,0.497505,0.671570
330517.0,0.390559,0.107510,0.251255,0.097654,0.128992,0.025881,0.034965,0.012491,0.215586,0.084541,...,0.387918,0.215284,-0.055440,0.396520,0.175641,0.104471,0.294650,1.000000,0.252825,0.343801
330771.0,0.396236,0.236658,0.386018,0.262040,0.395192,0.454740,0.119653,0.231366,0.323107,0.222044,...,0.068621,0.600194,0.398723,0.451791,0.250467,0.431374,0.497505,0.252825,1.000000,0.309121


In [33]:
# This aggregates duplicate users by taking the mean of their ratings
final_df = final_df.groupby(final_df.index).mean()

corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df.head()

Unnamed: 0,user_id_1,user_id_2,corr
0,139677.0,96889.0,-0.601759
1,246907.0,282711.0,-0.574665
2,198124.0,190452.0,-0.570361
3,195378.0,289219.0,-0.558236
4,9861.0,298874.0,-0.55463


In [35]:
top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.60)][
    ["user_id_2", "corr"]].reset_index(drop=True)

top_users = top_users.sort_values(by='corr', ascending=False)
top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
top_users.head()

Unnamed: 0,userId,corr
15,227057.0,0.715252
14,330507.0,0.67157
13,180691.0,0.667064
12,82826.0,0.659197
11,229463.0,0.640539


In [36]:
rating = pd.read_csv("ratings.csv")
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')

top_users_ratings = top_users_ratings[top_users_ratings["userId"] != random_user]
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating
0,227057.0,0.715252,1,3.0
1,227057.0,0.715252,5,2.5
2,227057.0,0.715252,6,4.5
3,227057.0,0.715252,10,4.0
4,227057.0,0.715252,11,3.0


In [37]:
# Calculate the Weighted Average Recommendation Score and keep the first 5 movies.

#Let's do a single score with the most similar by corr * rating:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})

recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df.head()

Unnamed: 0,movieId,weighted_rating
0,1,2.416318
1,2,1.761942
2,3,1.882853
3,4,1.804218
4,5,1.532192


In [40]:
# weighted rating greater than 3:
recommendation_df[recommendation_df["weighted_rating"] > 3.0]

# Movies 30101 will like:
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.0].sort_values("weighted_rating", ascending=False)

movies_to_be_recommend.merge(movie[["movieId", "title"]])

#Let's see the top 5 movies:
movies_to_be_recommend.merge(movie[["movieId", "title"]])[:5]

Unnamed: 0,movieId,weighted_rating,title
0,1950,3.357849,In the Heat of the Night (1967)
1,3787,3.357849,Shower (Xizao) (1999)
2,3700,3.357849,"Brother from Another Planet, The (1984)"
3,2859,3.357849,Stop Making Sense (1984)
4,3241,3.357849,"Cup, The (Phörpa) (1999)"


In [43]:
# Make an item-based suggestion based on the name of the movie that the user has watched with the highest score.

# ▪ 5 suggestions user-based
# ▪ 5 suggestions item-based

movie = pd.read_csv("movies.csv")
rating = pd.read_csv("ratings.csv")

# The last highly-rated movie by user 108170:

user = 150101
movie_id = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)].sort_values(by="timestamp", ascending=False)["movieId"][0:1].values[0]
movie_id

58559

In [44]:
# ▪ 5 suggestions user-based
movies_to_be_recommend.merge(movie[["movieId", "title"]])[:5]['title'].to_list()

['In the Heat of the Night (1967)',
 'Shower (Xizao) (1999)',
 'Brother from Another Planet, The (1984)',
 'Stop Making Sense (1984)',
 'Cup, The (Phörpa) (1999)']

In [45]:
# ▪ 5 suggestions item-based
movie_name = movie[movie['movieId'] == movie_id]['title'].values[0]
movie_name = user_movie_df[movie_name]
moveis_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
moveis_from_item_based[1:6].index.to_list()

['Dark Knight Rises, The (2012)',
 'Batman Begins (2005)',
 'Inception (2010)',
 'Iron Man (2008)',
 'Spider-Man (2002)']