# Practice PS06: Recommendations engines (interactions-based)

Author: <font color="blue">Marcel Fernández</font>

E-mail: <font color="blue">marcel.fernandez02@estudiant.upf.edu</font>

Date: <font color="blue">The current date here</font>

# 1. The Movies dataset

# 1.1. Load the input files

In [74]:
# Leave this code as-is
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from math import*
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import linear_kernel

In [75]:
# Leave this code as-is
FILENAME_MOVIES = "movies-2000s.csv"
FILENAME_RATINGS = "ratings-2000s.csv"
FILENAME_TAGS = "tags-2000s.csv"

In [76]:
# Leave this code as-is

movies = pd.read_csv(FILENAME_MOVIES, 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movie_id', 'title', 'genres'])
display(movies.head(5))

ratings_raw = pd.read_csv(FILENAME_RATINGS, 
                    sep=',', 
                    encoding='latin-1',
                    engine='python',
                    names=['user_id', 'movie_id', 'rating'])
display(ratings_raw.head(5))

Unnamed: 0,movie_id,title,genres
0,2769,"Yards, The (2000)",Crime|Drama
1,3177,Next Friday (2000),Comedy
2,3190,Supernova (2000),Adventure|Sci-Fi|Thriller
3,3225,Down to You (2000),Comedy|Romance
4,3228,Wirey Spindell (2000),Comedy


Unnamed: 0,user_id,movie_id,rating
0,4,1,3.0
1,4,260,3.5
2,4,296,4.0
3,4,541,4.5
4,4,589,4.0


# 1.2. Merge the data into a single dataframe

In [77]:
ratings = pd.merge(ratings_raw, movies, how='inner', on=['movie_id'])

In [78]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,title,genres
0,4,3624,2.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western
1,152,3624,3.0,Shanghai Noon (2000),Action|Adventure|Comedy|Western
2,171,3624,3.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western
3,276,3624,4.0,Shanghai Noon (2000),Action|Adventure|Comedy|Western
4,494,3624,3.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western


In [79]:
def find_movies(keyword, movies_df):
    found_movies = movies_df[movies_df['title'].str.contains(keyword, case=False)]
    
    if not found_movies.empty:
        for index, row in found_movies.iterrows():
            print(f"movie_id: {row['movie_id']}, title: {row['title']}")
    else:
        print(f"No movies found with the keyword '{keyword}'")

In [80]:
# LEAVE AS-IS

# For testing, this should print:
# movie_id:  4993, title: Lord of the Rings: The Fellowship of the Ring, The (2001)
# movie_id:  5952, title: Lord of the Rings: The Two Towers, The (2002)
# movie_id:  7153, title: Lord of the Rings: The Return of the King, The (2003)
find_movies("Lord of the Rings", movies)

movie_id: 4993, title: Lord of the Rings: The Fellowship of the Ring, The (2001)
movie_id: 5952, title: Lord of the Rings: The Two Towers, The (2002)
movie_id: 7153, title: Lord of the Rings: The Return of the King, The (2003)


In [81]:
# LEAVE AS-IS

def get_title(movie_id, movies):
    return movies[movies['movie_id'] == movie_id].title.iloc[0]

In [82]:
# LEAVE AS-IS

# For testing, should print "Lord of the Rings: The Return of the King, The (2003)")
print(get_title(7153, movies))

Lord of the Rings: The Return of the King, The (2003)


## 1.3. Count unique registers

In [83]:
# Count the number of unique users and unique movies in the ratings DataFrame
unique_users = ratings['user_id'].nunique()
unique_movies = ratings['movie_id'].nunique()

# Get the total number of movies from the movies DataFrame
total_movies = movies['movie_id'].nunique()

# Print the results
print(f"Number of users who have rated a movie: {unique_users}")
print(f"Number of movies that have been rated: {unique_movies}")
print(f"Total number of movies: {total_movies}")

Number of users who have rated a movie: 12676
Number of movies that have been rated: 2049
Total number of movies: 33168


# 2. Item-based Collaborative Filtering

## 2.1. Data pre-processing

In [84]:
rated_movies = ratings.drop(columns=['genres'])
rated_movies.head(10)


Unnamed: 0,user_id,movie_id,rating,title
0,4,3624,2.5,Shanghai Noon (2000)
1,152,3624,3.0,Shanghai Noon (2000)
2,171,3624,3.5,Shanghai Noon (2000)
3,276,3624,4.0,Shanghai Noon (2000)
4,494,3624,3.5,Shanghai Noon (2000)
5,1148,3624,2.5,Shanghai Noon (2000)
6,1967,3624,2.0,Shanghai Noon (2000)
7,2189,3624,4.0,Shanghai Noon (2000)
8,2287,3624,4.0,Shanghai Noon (2000)
9,2360,3624,4.0,Shanghai Noon (2000)


In [85]:
# Group movies by 'movie_id' and 'title' and calculate mean and count of ratings
ratings_summary = rated_movies.groupby(['movie_id', 'title']).agg({'rating': ['mean', 'count']})
# Now we rename
ratings_summary.columns = ['ratings_mean', 'ratings_count']

print(ratings_summary.head(5))


                                ratings_mean  ratings_count
movie_id title                                             
2769     Yards, The (2000)          3.122549            102
3177     Next Friday (2000)         2.824000            125
3190     Supernova (2000)           2.395683            139
3225     Down to You (2000)         2.577273            110
3228     Wirey Spindell (2000)      2.500000              2


In [86]:
ratings_summary[ratings_summary.ratings_count >= 2500].sort_values(by = 'ratings_mean', ascending = False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratings_mean,ratings_count
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1
4226,Memento (2000),4.158512,4476
4973,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001)",4.097234,3687
4993,"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.09253,5944
7153,"Lord of the Rings: The Return of the King, The (2003)",4.08396,5449
5952,"Lord of the Rings: The Two Towers, The (2002)",4.083869,5449


In [87]:
ratings_summary[ratings_summary.ratings_count >= 3].sort_values(by = 'ratings_mean', ascending = False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,ratings_mean,ratings_count
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1
5082,"Rumor of Angels, A (2000)",4.666667,6
27764,2LDK (2003),4.5,3
31954,Beautiful City (Shah-re ziba) (2004),4.4,5
5224,Promises (2001),4.388889,18
6775,Life and Debt (2001),4.333333,3


> We can see that the values of ratings_mean when we check **ratings_summary.ratings_count >= 2500 are lower than when we check ratings_summary.ratings_count >= 3** , this is because in the case of doing the average with fewer values it may result in a higher ratings_mean, since with few values it is easier to maintain this larger mean, so when we see the movies that have more than 2500 votes, the top rating is lower because it is more difficult to maintain because you take into account many more opinions.

## 2.2. Compute the user-movie matrix

In [88]:
user_movie = rated_movies.pivot_table(index='user_id', columns='movie_id', values='rating')
user_movie.head(5)

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,
62,,,,,,,,4.5,,,...,,,,,,,,,,3.5
63,,,,,,,,,,,...,,,,,,,,,,
95,,,,,,,,3.5,,,...,,,,,,,,,,


### COMMENT

> The "user_movie" matrix has many "NaN" values because not all users have rated all movies. In a recommender system, users provide ratings only for a subset of the available movies, and the majority of movies are unrated by a particular user. This is a common characteristic in collaborative filtering-based recommender systems.

> The sparsity of the matrix, where a large portion of the entries is missing or undefined, is known as the "sparsity problem" in recommender systems. The sparsity problem arises because users typically rate only a small fraction of the total items in the system. This makes it challenging to predict user preferences for unrated items accurately. The "NaN" values in the matrix represent these missing ratings, indicating that the corresponding user has not provided a rating for the respective movie.

> To address the sparsity problem, various techniques can be employed in recommender systems, such as matrix factorization, neighborhood-based methods, or hybrid approaches that combine multiple recommendation strategies. These techniques aim to make accurate predictions for missing values based on the available ratings and improve the overall performance of the recommender system

# 2.3. Explore some correlations in the user-movie matrix

In [89]:
# Step 1: Locate movie_id for the three movies
id_pivot = rated_movies[rated_movies['title'] == 'Lord of the Rings: The Fellowship of the Ring, The (2001)']['movie_id'].values[0]
id_m1 = rated_movies[rated_movies['title'] == 'Finding Nemo (2003)']['movie_id'].values[0]
id_m2 = rated_movies[rated_movies['title'] == 'Talk to Her (Hable con Ella) (2002)']['movie_id'].values[0]

# Step 2: Obtain ratings for each movie
s1 = user_movie[id_pivot].dropna()
s2 = user_movie[id_m1].dropna()
s3 = user_movie[id_m2].dropna()

# Step 3: Consolidate the Series into a single dataframe
ratings3 = pd.concat([s1, s2, s3], axis=1)

# Step 4: Drop rows containing NaN
ratings3 = ratings3.dropna()

# Step 5: Display the first 10 rows
print(ratings3.head(10))


         4993  6377  5878
user_id                  
859       3.0   4.0   5.0
1229      4.0   4.0   4.5
1281      3.0   2.5   3.0
1722      5.0   4.5   4.0
2004      4.5   3.0   3.5
4590      4.0   4.0   2.0
5052      2.0   4.0   4.0
5144      5.0   5.0   5.0
6497      3.5   3.5   3.5
8369      3.0   4.0   4.5


In [90]:
comb12 = round(ratings3[id_pivot].corr(ratings3[id_m1]),2)
comb13 = round(ratings3[id_pivot].corr(ratings3[id_m2]),2)
comb23 = round(ratings3[id_m1].corr(ratings3[id_m2]),2)

print("Similarity between 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Finding Nemo (2003)'",comb12)
print("Similarity between 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Talk to Her (Hable con Ella) (2002)'",comb13)
print("Similarity between 'Finding Nemo (2003)' and 'Talk to Her (Hable con Ella) (2002)':",comb23)

Similarity between 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Finding Nemo (2003)' 0.38
Similarity between 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Talk to Her (Hable con Ella) (2002)' 0.16
Similarity between 'Finding Nemo (2003)' and 'Talk to Her (Hable con Ella) (2002)': 0.2


#### Commentary:

The correlation values you obtained indicate the degree of similarity between the ratings of different movie pairs. Here's a brief commentary on the correlations:

> 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Finding Nemo (2003)' (0.38):

>  - A positive correlation of 0.38 suggests a moderate positive relationship between the ratings of these two movies. Users who liked 'Lord of the Rings' might also have a tendency to like 'Finding Nemo,' and vice versa.

> 'Lord of the Rings: The Fellowship of the Ring, The (2001)' and 'Talk to Her (Hable con Ella) (2002)' (0.16):

> - The positive correlation of 0.16 indicates a weaker positive relationship compared to the first pair. There is still a positive tendency, but it is not as strong. Users who enjoyed 'Lord of the Rings' may have a slight inclination to enjoy 'Talk to Her,' but the correlation is not as pronounced.

> 'Finding Nemo (2003)' and 'Talk to Her (Hable con Ella) (2002)' (0.2):

> - Similar to the second pair, there is a positive correlation of 0.2, indicating a weak positive relationship. Users who liked 'Finding Nemo' might have a slight tendency to like 'Talk to Her,' but the correlation is not very strong.

To sum up, the correlation values provide insights into the relationships between the ratings of different movie pairs. A higher positive correlation suggests a stronger tendency for users to rate both movies similarly, while a lower correlation indicates a weaker relationship. These correlations can be useful in understanding user preferences and building recommender systems based on similar user tastes

In [91]:
pivot_movie_ratings = user_movie[id_pivot]
similar_to_pivot_all = user_movie.corrwith(pivot_movie_ratings)
similar_to_pivot = similar_to_pivot_all.dropna()

print(similar_to_pivot)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movie_id
2769    -0.127515
3177     0.093221
3190     0.041206
3225     0.126600
3239     0.338378
           ...   
33154    0.318255
33158    0.228214
33162    0.285377
33164    0.037130
33166    0.197344
Length: 1868, dtype: float64


In [92]:
similar_to_pivot_df = pd.DataFrame(similar_to_pivot_all, columns=['corr'])
corr_with_pivot = ratings_summary.copy()

corr_with_pivot['corr'] = similar_to_pivot_df['corr'].values


corr_with_pivot_filtered = corr_with_pivot[corr_with_pivot['ratings_count'] > 500]
corr_with_pivot_filtered.sort_values('corr', ascending=False).head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,ratings_mean,ratings_count,corr
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4993,"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.09253,5944,1.0
5952,"Lord of the Rings: The Two Towers, The (2002)",4.083869,5449,0.892103
7153,"Lord of the Rings: The Return of the King, The (2003)",4.08396,5449,0.892073
6539,Pirates of the Caribbean: The Curse of the Black Pearl (2003),3.779241,3950,0.377599
8368,Harry Potter and the Prisoner of Azkaban (2004),3.809971,2397,0.340934
3578,Gladiator (2000),3.95105,4811,0.337667
3793,X-Men (2000),3.556436,3535,0.329686
4896,Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),3.678509,2843,0.31918
3624,Shanghai Noon (2000),3.297443,1017,0.307471
31658,Howl's Moving Castle (Hauru no ugoku shiro) (2004),4.064417,1141,0.303898


> - Setting the condition on ratings_count to a much larger value will likely result in a more selective list, including only movies with a very high number of ratings. This could lead to a list dominated by blockbuster films or widely popular titles.

> - Conversely, setting the condition to a much smaller value would likely include a broader range of movies, including those with fewer ratings. This might lead to more niche or less mainstream films appearing on the list, and correlations might be more volatile due to the smaller sample size.







# 2.4. Implement the item-based recommendations

In [93]:
item_similarity = user_movie.corr()
item_similarity.head(10)

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,0.115068,0.033721,-0.232268,,-0.5,0.197011,0.199514,0.250873,,...,0.37998,0.87831,,,,0.248126,0.1806095,-0.08557,-0.408248,0.105671
3177,0.115068,1.0,0.30382,0.559533,,,0.331191,0.167918,1.0,,...,0.546119,0.735767,-1.0,,,-0.221382,0.3174747,0.014735,0.661989,0.185654
3190,0.033721,0.30382,1.0,0.636361,,-0.014315,0.146042,0.394293,-0.290397,,...,0.246183,0.632026,,,,0.378181,0.1709261,0.022444,-0.07336,-0.054114
3225,-0.232268,0.559533,0.636361,1.0,,0.578414,0.347716,0.263671,-0.250313,,...,-0.300376,0.318377,,,,0.480173,0.7503063,0.536828,0.753141,0.098748
3228,,,,,1.0,,,,,,...,,,,,,,,,,
3239,-0.5,,-0.014315,0.578414,,1.0,0.180846,1.0,,,...,,,,,,1.0,,1.0,0.636285,0.8882
3273,0.197011,0.331191,0.146042,0.347716,,0.180846,1.0,0.105735,0.154371,,...,0.006774,0.409968,1.0,,,0.088405,0.07516779,0.143492,0.466705,0.084202
3275,0.199514,0.167918,0.394293,0.263671,,1.0,0.105735,1.0,0.485071,,...,-0.011426,0.279624,,,,0.075827,0.2994603,0.187713,0.285584,0.225317
3276,0.250873,1.0,-0.290397,-0.250313,,,0.154371,0.485071,1.0,,...,,0.29277,,,,0.0,-6.885311000000001e-17,-0.45553,0.5,-0.138013
3279,,,,,,,,,,1.0,...,,,,,,,,,,


In [94]:
item_similarity_min_ratings = user_movie.corr(min_periods = 100)
item_similarity.head(10)

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,0.115068,0.033721,-0.232268,,-0.5,0.197011,0.199514,0.250873,,...,0.37998,0.87831,,,,0.248126,0.1806095,-0.08557,-0.408248,0.105671
3177,0.115068,1.0,0.30382,0.559533,,,0.331191,0.167918,1.0,,...,0.546119,0.735767,-1.0,,,-0.221382,0.3174747,0.014735,0.661989,0.185654
3190,0.033721,0.30382,1.0,0.636361,,-0.014315,0.146042,0.394293,-0.290397,,...,0.246183,0.632026,,,,0.378181,0.1709261,0.022444,-0.07336,-0.054114
3225,-0.232268,0.559533,0.636361,1.0,,0.578414,0.347716,0.263671,-0.250313,,...,-0.300376,0.318377,,,,0.480173,0.7503063,0.536828,0.753141,0.098748
3228,,,,,1.0,,,,,,...,,,,,,,,,,
3239,-0.5,,-0.014315,0.578414,,1.0,0.180846,1.0,,,...,,,,,,1.0,,1.0,0.636285,0.8882
3273,0.197011,0.331191,0.146042,0.347716,,0.180846,1.0,0.105735,0.154371,,...,0.006774,0.409968,1.0,,,0.088405,0.07516779,0.143492,0.466705,0.084202
3275,0.199514,0.167918,0.394293,0.263671,,1.0,0.105735,1.0,0.485071,,...,-0.011426,0.279624,,,,0.075827,0.2994603,0.187713,0.285584,0.225317
3276,0.250873,1.0,-0.290397,-0.250313,,,0.154371,0.485071,1.0,,...,,0.29277,,,,0.0,-6.885311000000001e-17,-0.45553,0.5,-0.138013
3279,,,,,,,,,,1.0,...,,,,,,,,,,


In [95]:
movies_ids1 = [5349, 3793, 6534]
movies_ids2 = [6870, 5995, 3555]
user_id_super = 0
user_id_drama = 0

def find_user_id(movie_ids, threshold):
    for i in range(len(user_movie)):
        rates = [user_movie.iloc[i][movie_id] for movie_id in movie_ids]

        if all(rate > threshold for rate in rates):
            return user_movie.iloc[i].name

    return 0  # Return 0 if no user is found

user_id_super = find_user_id(movies_ids1, 4.5)
user_id_drama = find_user_id(movies_ids2, 4.5)

In [96]:
# Print the user IDs
print("-"*70)
print("One example of user IDs who liked superhero movies  :", user_id_super)

print("-"*70)
print("One example of User IDs who liked drama movies      :", user_id_drama)
print("-"*70)

----------------------------------------------------------------------
One example of user IDs who liked superhero movies  : 127342
----------------------------------------------------------------------
One example of User IDs who liked drama movies      : 34336
----------------------------------------------------------------------


In [97]:
# Leave this code as-is

# Gets a list of watched movies for a user_id
def get_watched_movies(user_id, user_movie):
    return list(user_movie.loc[user_id].dropna().sort_values(ascending=False).index)
    
# Gets the rating a user_id has given to a movie_id
def get_rating(user_id, movie_id, user_movie):
    return user_movie[movie_id][user_id]

# Print watched movies
def print_watched_movies(user_id, user_movie, movies):
    for movie_id in get_watched_movies(user_id, user_movie):
        print("%d %.1f %s " %
          (movie_id, get_rating(user_id, movie_id, user_movie), get_title(movie_id, movies)))


In [98]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_super, user_movie, movies)

5502 5.0 Signs (2002) 
5445 5.0 Minority Report (2002) 
6156 5.0 Shanghai Knights (2003) 
5952 5.0 Lord of the Rings: The Two Towers, The (2002) 
5944 5.0 Star Trek: Nemesis (2002) 
5816 5.0 Harry Potter and the Chamber of Secrets (2002) 
5618 5.0 Spirited Away (Sen to Chihiro no kamikakushi) (2001) 
5524 5.0 Blue Crush (2002) 
5480 5.0 Stuart Little 2 (2002) 
5459 5.0 Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002) 
5420 5.0 Windtalkers (2002) 
4388 5.0 Scary Movie 2 (2001) 
5389 5.0 Spirit: Stallion of the Cimarron (2002) 
5349 5.0 Spider-Man (2002) 
5218 5.0 Ice Age (2002) 
5064 5.0 The Count of Monte Cristo (2002) 
4993 5.0 Lord of the Rings: The Fellowship of the Ring, The (2001) 
4973 5.0 Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001) 
4896 5.0 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) 
4886 5.0 Monsters, Inc. (2001) 
6186 5.0 Gods and Generals (2003) 
6333 5.0 X2: X-Men United (2003) 
6377 5.0 Finding Nemo (2003) 
6

In [99]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_drama, user_movie, movies)

3967 5.0 Billy Elliot (2000) 
4014 5.0 Chocolat (2000) 
4034 5.0 Traffic (2000) 
5995 5.0 Pianist, The (2002) 
7147 5.0 Big Fish (2003) 
4995 5.0 Beautiful Mind, A (2001) 
3555 5.0 U-571 (2000) 
6870 5.0 Mystic River (2003) 
5991 5.0 Chicago (2002) 
8464 5.0 Super Size Me (2004) 
5669 5.0 Bowling for Columbine (2002) 
8622 5.0 Fahrenheit 9/11 (2004) 
30707 5.0 Million Dollar Baby (2004) 
6953 4.5 21 Grams (2003) 
5015 4.5 Monster's Ball (2001) 
5464 4.5 Road to Perdition (2002) 
3510 4.5 Frequency (2000) 
5989 4.5 Catch Me If You Can (2002) 
4022 4.0 Cast Away (2000) 
5010 4.0 Black Hawk Down (2001) 
5299 4.0 My Big Fat Greek Wedding (2002) 
3897 4.0 Almost Famous (2000) 
3755 4.0 Perfect Storm, The (2000) 
4308 4.0 Moulin Rouge (2001) 
4447 3.5 Legally Blonde (2001) 
4246 3.5 Bridget Jones's Diary (2001) 
4975 3.5 Vanilla Sky (2001) 
4019 3.5 Finding Forrester (2000) 
5377 3.5 About a Boy (2002) 
3948 3.5 Meet the Parents (2000) 
5956 3.0 Gangs of New York (2002) 
6281 3.0 Phone Booth

In [100]:
def get_movies_relevance(user_id, user_movie, item_similarity_matrix):
    
    # Create an empty series
    movies_relevance = pd.Series()
    
    # Iterate through the movies the user has watched
    for watched_movie, rating_given in user_movie.loc[user_id].dropna().items():
        
        # Obtain the vector containing the similarities of watched_movie
        # with all other movies in item_similarity_matrix
        similarities = item_similarity_matrix[watched_movie]

        # Multiply this vector by the given rating
        weighted_similarities = rating_given * similarities
        
        # Append these terms to movies_relevance
        movies_relevance = movies_relevance.add(weighted_similarities, fill_value=0)
    
    # Convert to a dataframe
    movies_relevance_df = pd.DataFrame({'relevance': movies_relevance.values, 'movie_id': movies_relevance.index})
    
    return movies_relevance_df

In [101]:
movies_rel_super = get_movies_relevance(user_id_super, user_movie, item_similarity)
merge_super = pd.merge(movies, movies_rel_super, how='inner', left_on='movie_id', right_on='movie_id')
display(merge_super.sort_values(by='relevance', ascending=False).head(5))

  movies_relevance = pd.Series()


Unnamed: 0,movie_id,title,genres,relevance
1930,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,394.160783
1662,27426,"Accidental Spy, The (Dak miu mai shing) (2001)",Action|Comedy|Thriller,353.463397
1136,6761,Tibet: Cry of the Snow Lion (2002),Documentary,350.723308
1375,7477,Eye See You (D-Tox) (2002),Horror|Thriller,320.251163
1005,6375,Gigantic (A Tale of Two Johns) (2002),Documentary,300.482169


In [102]:
movies_rel_super = get_movies_relevance(user_id_drama, user_movie, item_similarity)
merge_super = pd.merge(movies, movies_rel_super, how='inner', left_on='movie_id', right_on='movie_id')
display(merge_super.sort_values(by='relevance', ascending=False).head(5))

  movies_relevance = pd.Series()


Unnamed: 0,movie_id,title,genres,relevance
1376,7521,Mercy (2000),Crime|Mystery|Thriller,160.0
351,4449,Adanggaman (2000),Drama,154.388241
1357,7443,This So-Called Disaster (2003),Documentary,146.447098
1930,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,135.966211
1823,27835,"Agronomist, The (2003)",Documentary,133.5


#### Comment
List 1:

"Bunker, The (2001)"

>Drama|Horror|Mystery|Thriller|War
Highly relevant (394.16). Offers a complex and intense narrative.

"Accidental Spy, The (2001)"

>Action|Comedy|Thriller
Highly relevant (353.46). Provides an engaging mix of genres.

"Tibet: Cry of the Snow Lion (2002)"

>Documentary
Highly relevant (350.72). Appeals to those interested in real-world issues.

"Eye See You (D-Tox) (2002)"

>Horror|Thriller
Highly relevant (320.25). Intense and gripping choice.

"Gigantic (A Tale of Two Johns) (2002)"

>Documentary
Relevant (300.48). Attractive for those interested in non-fiction narratives.

List 2:

"Mercy (2000)"

>Crime|Mystery|Thriller
Relevant (160.0). Engaging for suspense and intensity.

"Adanggaman (2000)"

>Drama
Relevant (154.39). A focused and character-driven narrative.

"This So-Called Disaster (2003)"

>Documentary
Relevant (146.45). Appeals to drama enthusiasts interested in behind-the-scenes.

"Bunker, The (2001)"

>Drama|Horror|Mystery|Thriller|War
Relevant (135.97). Diverse genre elements for drama enthusiasts.

"Agronomist, The (2003)"

>Documentary
Relevant (133.5). Appeals to those interested in real-life narratives.

In both lists, the movies demonstrate high relevance for drama enthusiasts, offering a mix of genres that cater to various tastes within the broader drama category.







In [103]:
def get_recommended_movies(user_id, user_movie, item_similarity_matrix):
    # Get movies relevance using the previously defined function
    movies_relevance_df = get_movies_relevance(user_id, user_movie, item_similarity_matrix)

    # Set the dataframe index to 'movie_id'
    movies_relevance_df.set_index('movie_id', inplace=True)

    # Obtain the list of movie_ids of watched movies
    watched_movies = get_watched_movies(user_id, user_movie)

    # Drop watched movies from the relevant movies dataframe
    recommended_movies_df = movies_relevance_df.drop(watched_movies, errors='ignore')

    return recommended_movies_df

In [104]:
# Get recommended movies for user_id_super and user_id_drama
recommended_movies_super = get_recommended_movies(user_id_super, user_movie, item_similarity)
recommended_movies_drama = get_recommended_movies(user_id_drama, user_movie, item_similarity)

# Convert 'relevance' column to numeric dtype
recommended_movies_super['relevance'] = pd.to_numeric(recommended_movies_super['relevance'], errors='coerce')
recommended_movies_drama['relevance'] = pd.to_numeric(recommended_movies_drama['relevance'], errors='coerce')

# Get the top 10 recommended movies for each user
top_10_recommended_super = recommended_movies_super.nlargest(10, 'relevance')
top_10_recommended_drama = recommended_movies_drama.nlargest(10, 'relevance')

print("-"*25)
print("Top 10 recommended movies for user_id_super:")
print(top_10_recommended_super)
print("-"*25)
print("Top 10 recommended movies for user_id_drama:")
print(top_10_recommended_drama)
print("-"*25)

-------------------------
Top 10 recommended movies for user_id_super:
           relevance
movie_id            
31636     394.160783
27426     353.463397
7477      320.251163
6375      300.482169
6651      283.913191
8835      282.371780
5170      277.711932
6544      277.051465
6648      274.349794
32914     271.345034
-------------------------
Top 10 recommended movies for user_id_drama:
           relevance
movie_id            
7521      160.000000
4449      154.388241
7443      146.447098
31636     135.966211
27835     133.500000
5806      132.174524
6688      131.791755
4150      130.500000
6336      130.500000
6544      130.122867
-------------------------


  movies_relevance = pd.Series()
  movies_relevance = pd.Series()



#### Commentary:

- The recommended movies seem relevant based on their high relevance scores, suggesting a strong alignment with the users' preferences.
- After removing the watched movies, the relevance scores remain relatively high, indicating that the recommendation system is still effective in suggesting movies that match the users' tastes.
- The remaining items' relevance scores are comparable to the previous lists that included all relevant movies, emphasizing the system's ability to provide meaningful recommendations even with a reduced set of available movies.
- The diversity in genres within the top recommendations indicates that the system considers a broad range of user preferences, providing a mix of drama, action, thriller, and documentary films based on individual tastes.


### EXTRA

In [41]:
pip install scikit-surprise  # i needed to do this for the use of this library


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-win_amd64.whl size=1095077 sha256=3beacc57c7476b1d081071844bd0b944c2de92934bc160fcc1d50199f7ecb8fe
  Stored in directory: c:\users\usuario\appdata\local\pip\cache\wheels\c6\3a\46\9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3
Note: you may need to restart the kernel to use updated packages.


In [105]:
from collections import defaultdict
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

def get_top_n(predictions, n=10, user_id=None):
    user_predictions = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_predictions[uid].append((iid, est))

    for uid, ratings in user_predictions.items():  # Sort the recommendations
        ratings.sort(key=lambda x: x[1], reverse=True)
        user_predictions[uid] = ratings[:n]

    if user_id is not None: # --> Get the top N recommendations 
        return dict(user_predictions[user_id])
    else:
        return user_predictions


In [106]:
# Create a Surprise Dataset
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

algorithm = SVD()         # We apply the SVD!
algorithm.fit(trainset)   # Now we train the data

predictions = algorithm.test(testset)   # Make the predictions

# Print recommendations
for user_id in [user_id_super, user_id_drama]:
    top_n = get_top_n(predictions, n=10, user_id=user_id)
    
    print(f"\nTop 10 Recommendations for User {user_id}:\n")
    
    for movie_id, est_rating in top_n.items():
        print(f"  Movie ID: {movie_id}")
        print(f"  Estimated Rating: {est_rating:.2f}")
        print("-"*30)



Top 10 Recommendations for User 127342:

  Movie ID: 4973
  Estimated Rating: 5.00
------------------------------
  Movie ID: 4306
  Estimated Rating: 5.00
------------------------------
  Movie ID: 8636
  Estimated Rating: 4.92
------------------------------
  Movie ID: 5444
  Estimated Rating: 4.91
------------------------------
  Movie ID: 5328
  Estimated Rating: 4.79
------------------------------
  Movie ID: 7153
  Estimated Rating: 4.79
------------------------------
  Movie ID: 5064
  Estimated Rating: 4.78
------------------------------
  Movie ID: 3408
  Estimated Rating: 4.68
------------------------------
  Movie ID: 7143
  Estimated Rating: 4.66
------------------------------
  Movie ID: 5670
  Estimated Rating: 4.62
------------------------------

Top 10 Recommendations for User 34336:

  Movie ID: 6870
  Estimated Rating: 3.87
------------------------------
  Movie ID: 3967
  Estimated Rating: 3.86
------------------------------
  Movie ID: 4019
  Estimated Rating: 3.84

As we can see, User 1041 has lower values of rating recommendations. This is because this user is more difficult to relate to others; therefore, the coefficient will be lower. We can observe a significant difference compared to User 859, as we appreciate higher values than the user mentioned previously.

<font size="+1" color="#003300">We hereby declare that, except for the code provided by the course instructors, all of my code, report, and figures were produced by myself.</font>