Date: 12/9/2024

In [None]:
# Connect to drive to access data
# Colab-only cell: mounts Google Drive at /content/drive.
# NOTE(review): the data-loading cells further down use relative paths such as
# 'ml-1m/ratings.dat' and 'Rmat.csv', not paths under /content/drive --
# confirm the working directory points at the data before running them.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Imports
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML

# System I: Recommendation based on Popularity

Popularity -- Keep only movies whose average rating (AvgRating) is above 4.3, then rank them by the number of user ratings (RatingsNum), highest first.

In [None]:
# The ml-1m .dat files are '::'-delimited and have no header row, so column
# names are supplied here; a multi-character separator needs the Python engine.
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)

movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    engine='python',
    encoding="ISO-8859-1",
    header=None,
    names=['MovieID', 'Title', 'Genres'],
)

In [None]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
movies.shape

(3883, 3)

In [None]:
# Attach each rating's movie metadata with a left join on MovieID (every
# rating row is kept), then keep only MovieID, Rating and Title.
merged_df = (
    ratings
    .merge(movies, on='MovieID', how='left')
    .drop(columns=['Genres', 'Timestamp', 'UserID'])
)

In [None]:
merged_df

Unnamed: 0,MovieID,Rating,Title
0,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,661,3,James and the Giant Peach (1996)
2,914,3,My Fair Lady (1964)
3,3408,4,Erin Brockovich (2000)
4,2355,5,"Bug's Life, A (1998)"
...,...,...,...
1000204,1091,1,Weekend at Bernie's (1989)
1000205,1094,5,"Crying Game, The (1992)"
1000206,562,5,Welcome to the Dollhouse (1995)
1000207,1096,4,Sophie's Choice (1982)


In [None]:
# One row per (MovieID, Title): how many ratings it received and their mean
aggregated_df = (
    merged_df
    .groupby(['MovieID', 'Title'])
    .agg(
        RatingsNum=pd.NamedAgg(column='Rating', aggfunc='size'),
        AvgRating=pd.NamedAgg(column='Rating', aggfunc='mean'),
    )
    .reset_index()
)

In [None]:
aggregated_df

Unnamed: 0,MovieID,Title,RatingsNum,AvgRating
0,1,Toy Story (1995),2077,4.146846
1,2,Jumanji (1995),701,3.201141
2,3,Grumpier Old Men (1995),478,3.016736
3,4,Waiting to Exhale (1995),170,2.729412
4,5,Father of the Bride Part II (1995),296,3.006757
...,...,...,...,...
3701,3948,Meet the Parents (2000),862,3.635731
3702,3949,Requiem for a Dream (2000),304,4.115132
3703,3950,Tigerland (2000),54,3.666667
3704,3951,Two Family House (2000),40,3.900000


In [None]:
# Sort by RatingsNum in descending order
# NOTE(review): no `kind` is passed, so the relative order of movies with an
# equal RatingsNum is whatever pandas' default sort algorithm yields --
# confirm whether tie order matters downstream.
sorted_df = aggregated_df.sort_values(by='RatingsNum', ascending=False)
# Bare expression: renders the sorted frame as this cell's output.
sorted_df

Unnamed: 0,MovieID,Title,RatingsNum,AvgRating
2651,2858,American Beauty (1999),3428,4.317386
253,260,Star Wars: Episode IV - A New Hope (1977),2991,4.453694
1106,1196,Star Wars: Episode V - The Empire Strikes Back...,2990,4.292977
1120,1210,Star Wars: Episode VI - Return of the Jedi (1983),2883,4.022893
466,480,Jurassic Park (1993),2672,3.763847
...,...,...,...,...
2031,2213,Waltzes from Vienna (1933),1,1.000000
2032,2214,Number Seventeen (1932),1,3.000000
2034,2217,Elstree Calling (1930),1,1.000000
2035,2218,Juno and Paycock (1930),1,2.000000


In [None]:
# Keep only movies whose AvgRating is strictly above 4.3; rows stay in the
# RatingsNum-descending order already established in sorted_df.
filtered_df = sorted_df.loc[sorted_df['AvgRating'].gt(4.3)]

# The first ten rows are therefore the ten most-rated movies that pass the filter
top_10_movies = filtered_df.iloc[:10]

In [None]:
# Persist the full filtered popularity list (not only the top 10), without the
# row index; handle_edge_case reloads 'popular_movies.csv' as its fallback source.
filtered_df.to_csv("popular_movies.csv", index=False)

In [None]:
top_10_movies

Unnamed: 0,MovieID,Title,RatingsNum,AvgRating
2651,2858,American Beauty (1999),3428,4.317386
253,260,Star Wars: Episode IV - A New Hope (1977),2991,4.453694
1848,2028,Saving Private Ryan (1998),2653,4.337354
2374,2571,"Matrix, The (1999)",2590,4.31583
579,593,"Silence of the Lambs, The (1991)",2578,4.351823
1108,1198,Raiders of the Lost Ark (1981),2514,4.477725
2557,2762,"Sixth Sense, The (1999)",2459,4.406263
1107,1197,"Princess Bride, The (1987)",2318,4.30371
513,527,Schindler's List (1993),2304,4.510417
309,318,"Shawshank Redemption, The (1994)",2227,4.554558


In [None]:
# Display top 10 movies
def image_formatter(image_path):
  """Return an HTML <img> tag (width 100, height 150) for a movie poster.

  `image_path` is interpolated into the poster URL as the file stem; in this
  notebook the function is applied to MovieID values.
  """
  poster_url = f"https://liangfgithub.github.io/MovieImages/{image_path}.jpg?raw=true"
  return '<img src="' + poster_url + '" width="100" height="150">'

# assign() returns a new frame carrying the extra Image column (one <img> tag
# per MovieID), so the frame previously bound to top_10_movies is not modified.
top_10_movies = top_10_movies.assign(
    Image=top_10_movies["MovieID"].apply(image_formatter)
)

# escape=False leaves the <img> markup unescaped in the generated table;
# RatingsNum is not among the displayed columns.
html_table = top_10_movies.to_html(
    columns=["MovieID", "Image", "Title", "AvgRating"],
    index=False,
    escape=False,
)

display(HTML(html_table))

MovieID,Image,Title,AvgRating
2858,,American Beauty (1999),4.317386
260,,Star Wars: Episode IV - A New Hope (1977),4.453694
2028,,Saving Private Ryan (1998),4.337354
2571,,"Matrix, The (1999)",4.31583
593,,"Silence of the Lambs, The (1991)",4.351823
1198,,Raiders of the Lost Ark (1981),4.477725
2762,,"Sixth Sense, The (1999)",4.406263
1197,,"Princess Bride, The (1987)",4.30371
527,,Schindler's List (1993),4.510417
318,,"Shawshank Redemption, The (1994)",4.554558


# System II: Recommendation based on IBCF

In [None]:
# Load the rating matrix. No index_col is given; the recorded outputs below
# show labels u1, u10, ... as the row index, m1, m10, ... as the columns, and
# a shape of (6040, 3706).
rmat = pd.read_csv('Rmat.csv', sep=',')

In [None]:
rmat.shape

(6040, 3706)

In [None]:
rmat.head()

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,


### 1. Normalize the rating matrix by centering each row

In [None]:
# Show floats with 7 decimal places in DataFrame output from here on
pd.set_option('display.float_format', '{:.7f}'.format)

# Centre each row: subtract the mean of its non-NaN entries. NaN entries
# remain NaN because NaN minus anything is NaN.
matrix = rmat.to_numpy()
row_means = np.nanmean(matrix, axis=1)
normalized_matrix = matrix - row_means.reshape(-1, 1)
normalized_rmat = pd.DataFrame(
    data=normalized_matrix,
    index=rmat.index,
    columns=rmat.columns,
)

In [None]:
normalized_rmat

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.8113208,,,,,,,,,,...,,,,,,,,,,
u10,0.8852868,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.8690476,,,,,,,,,,...,,,,,,,,,,
u1001,0.3474801,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.0641892,,,,,,,,,,...,,,,,,,,,,-0.9358108
u997,0.0666667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


### 2. Compute the (transformed) Cosine similarity among the 3,706 movies

In [None]:
# Function to compute transformed cosine similarity
def calc_similarity(vector1, vector2):
  """Transformed cosine similarity, 0.5 + 0.5 * cos, over shared finite entries.

  Only positions where both vectors are finite are used. Returns NaN when
  fewer than 3 such positions exist, or when either restricted vector has
  zero norm; otherwise returns 0.5 + 0.5 * cosine of the restricted vectors.
  """
  both_present = np.isfinite(vector1) & np.isfinite(vector2)

  # Fewer than 3 shared entries: the pair is ignored
  if both_present.sum() < 3:
    return np.nan

  a = vector1[both_present]
  b = vector2[both_present]

  a_norm = np.sqrt(np.sum(a**2))
  b_norm = np.sqrt(np.sum(b**2))

  # Cosine is undefined when one side is the zero vector
  if a_norm == 0 or b_norm == 0:
    return np.nan

  return 0.5 + 0.5 * (np.dot(a, b) / (a_norm * b_norm))

In [None]:
# Compute the similarity matrix
movie_ids = normalized_rmat.columns
n_movies = len(movie_ids)

# One contiguous row per movie, so every pairwise call works on plain NumPy
# arrays instead of re-extracting DataFrame columns inside the double loop.
ratings_by_movie = np.ascontiguousarray(normalized_rmat.to_numpy(dtype=float).T)

# Accumulate into a NumPy buffer and wrap it in a DataFrame once at the end:
# per-cell DataFrame .loc assignment is far slower over the roughly
# n_movies**2 / 2 pairs. This also avoids writing through DataFrame.values.
# Cells that are never written, including the whole diagonal, stay NaN.
similarity_values = np.full((n_movies, n_movies), np.nan)

for i in range(n_movies):
  ratings_i = ratings_by_movie[i]
  for j in range(i + 1, n_movies):  # upper triangle only; mirrored below
    similarity = calc_similarity(ratings_i, ratings_by_movie[j])
    similarity_values[i, j] = similarity
    similarity_values[j, i] = similarity  # Ensure symmetry

similarity_matrix = pd.DataFrame(similarity_values,
                                 index=movie_ids,
                                 columns=movie_ids)

In [None]:
similarity_matrix

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,0.5121055,0.3919999,0.7296371,0.4052488,0.3443622,0.1934793,0.2920968,0.2757620,0.4342140,...,0.5256346,0.1678860,0.4382444,0.2044081,0.5517557,0.6834218,0.2906526,0.5140432,0.3837718,0.4145054
m10,0.5121055,,0.5474583,0.4904717,,0.6109830,0.4237425,0.4606591,0.6576989,0.5495395,...,0.2617006,0.4658628,0.4480788,0.3857350,,0.4544643,0.5475044,0.6687327,0.4482895,0.6008116
m100,0.3919999,0.5474583,,0.4829650,,0.8365839,0.6295382,0.5682818,0.8118070,0.4885245,...,0.4107531,0.6426157,0.4936404,0.1936714,0.8028437,0.3067432,0.6293738,0.2695757,0.4789227,0.6128149
m1000,0.7296371,0.4904717,0.4829650,,,0.1807649,,,,0.7052228,...,,,0.2073925,0.9015211,,0.2260270,0.6684361,,0.7253362,0.6805737
m1002,0.4052488,,,,,,,,,,...,,,,,,0.7227661,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,0.6834218,0.4544643,0.3067432,0.2260270,0.7227661,0.2517379,0.2271863,0.1402858,0.2490622,0.2743969,...,0.4011803,0.1486861,0.4705176,0.1928585,0.5397140,,0.2155611,0.4490137,0.3078245,0.3985167
m996,0.2906526,0.5475044,0.6293738,0.6684361,,0.7908892,0.7119653,0.6911337,0.8060751,0.6216948,...,0.6181369,0.7796494,0.4780710,0.7975184,,0.2155611,,0.0771135,0.5563784,0.6225577
m997,0.5140432,0.6687327,0.2695757,,,0.3660229,0.9327237,0.9492277,0.2144257,0.2100087,...,0.2157111,0.8661206,0.4162218,,0.4120181,0.4490137,0.0771135,,0.6426355,0.4606457
m998,0.3837718,0.4482895,0.4789227,0.7253362,,0.4450076,0.8437724,0.6048153,0.3545709,0.5041464,...,,0.6983913,0.6629043,0.8523279,,0.3078245,0.5563784,0.6426355,,0.6427270


In [None]:
# Save the similarity matrix to a CSV file
# similarity_matrix.to_csv('similarity_matrix.csv', index=True)

### 3.1. For each row, sort the non-NA similarity measures and keep the top 30, setting the rest to NA

In [None]:
def retain_top_n(matrix, n=30):
    """Set to NaN everything except the n largest values in each row.

    `matrix` is modified in place and is also returned; pass a copy if the
    original frame must be kept.
    """
    all_columns = matrix.columns
    for row_label, row_values in matrix.iterrows():
        keep_labels = row_values.nlargest(n).index
        drop_mask = ~all_columns.isin(keep_labels)
        matrix.loc[row_label, drop_mask] = np.nan
    return matrix

# retain_top_n modifies its argument in place and returns it, so prune a copy
# and leave similarity_matrix intact; afterwards S_copy and S_top_30 refer to
# the same pruned frame.
S_copy = similarity_matrix.copy()
S_top_30 = retain_top_n(S_copy, n=30)
# Bare expression: renders the pruned matrix as this cell's output.
S_top_30

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,,,,,
m996,,,,,,,,,,,...,,,,,,,,,,
m997,,,,,,,,,,,...,,,,,,,,,,
m998,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Save the top 30 similarity matrix to a CSV file
# index=True writes the row labels as the first column; a later cell reads
# 'S_top_30.csv' back and restores that column as the index.
S_top_30.to_csv("S_top_30.csv", index=True)

### 3.2. Display the pairwise similarity values from the S matrix (you obtained at Step 2)

In [None]:
# Show the Step 2 similarity values among six fixed movies, 7 decimals each
pd.set_option('display.float_format', '{:.7f}'.format)
movies_to_display = ["m1", "m10", "m100", "m1510", "m260", "m3212"]
pairwise_similarities = similarity_matrix.loc[movies_to_display, movies_to_display]
pairwise_similarities

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.5121055,0.3919999,,0.7411482,
m10,0.5121055,,0.5474583,,0.5343338,
m100,0.3919999,0.5474583,,,0.3296943,
m1510,,,,,,
m260,0.7411482,0.5343338,0.3296943,,,
m3212,,,,,,


### 4. Create a function named myIBCF
For the webapp, we were able to use the entire top_30 similarity matrix and we did not run into memory issues while testing the deployment on render.com.

In [None]:
# Helper function to handle edge case: If fewer than 10 predictions are non-NA,
# select the remaining movies based on the popularity defined in System 1.

def handle_edge_case(top_predictions, w, n, non_na_size, popular_csv='popular_movies.csv'):
  """Pad a short IBCF result with popular movies from System I.

  Args:
    top_predictions: Series of the non-NA predicted ratings found so far,
      indexed by movie label ("m" + MovieID).
    w: the user's rating Series indexed by movie label; NaN means unrated.
    n: total number of recommendations wanted.
    non_na_size: number of entries already present in `top_predictions`.
    popular_csv: path of the popularity table; it must contain a `MovieID`
      column, and its row order is used as the fallback ranking.

  Returns:
    `top_predictions` followed by up to `n - non_na_size` fallback movies
    whose value is the string "Recommended". Fewer are appended if the
    popularity table runs out of eligible movies.
  """
  remainder = max(n - non_na_size, 0)

  popular_movies = pd.read_csv(popular_csv)
  popular_ids = 'm' + popular_movies['MovieID'].astype(str)

  # A fallback must be neither rated by the user nor already recommended;
  # without the second check the same movie could appear twice in the result.
  excluded = set(w.dropna().index) | set(top_predictions.index)
  candidates = [movie for movie in popular_ids if movie not in excluded]
  chosen = candidates[:remainder]

  # Size the data from `chosen`, not `remainder`: the candidate list may be
  # shorter than requested and a data/index length mismatch raises ValueError.
  additional_movies = pd.Series(data=["Recommended"] * len(chosen),
                                index=chosen,
                                dtype=object)

  # Add additional movies to predictions
  return pd.concat([top_predictions, additional_movies])

In [None]:
def myIBCF(w, S, n=10):
  """Recommend up to `n` unrated movies for one user via item-based CF.

  Args:
    w: Series of the user's ratings indexed by movie label; NaN = unrated.
    S: DataFrame of movie-to-movie similarities; its row and column labels
      must include every label in w.index. NaN = no similarity available.
    n: number of recommendations to return.

  Returns:
    A Series named "predictions" holding the n largest predicted ratings.
    If fewer than `n` movies receive a prediction, the result of
    handle_edge_case(...) is returned instead.

  For an unrated movie i the prediction is
    sum_l S[i, l] * w[l] / sum_l |S[i, l]|
  taken over the rated movies l whose S[i, l] is not NaN.
  """
  # Loop-invariant: the user's rated movies and the matching columns of S
  rated_movies = w.dropna()
  S_rated = S.loc[:, rated_movies.index]

  # Rated movies are never predicted, so they simply keep the initial NaN
  predictions = pd.Series(np.nan, index=w.index, dtype=float)

  for movie_id in w.index[w.isna()]:
      relevant_similarities = S_rated.loc[movie_id]

      # .sum() skips NaN, so neighbours with no similarity drop out of both
      # the numerator and the denominator
      weighted_sum = (relevant_similarities * rated_movies).sum()
      similarity_sum = np.abs(relevant_similarities).sum()

      # Leave NaN when no rated movie has a similarity to this one
      if similarity_sum > 0:
          predictions[movie_id] = weighted_sum / similarity_sum

  # Select the top N predictions
  top_predictions = predictions.nlargest(n)
  top_predictions.name = "predictions"

  # EDGE CASE: fewer than n non-NA predictions -> fill the remaining slots
  # from the popular movies list defined by System 1
  non_na_size = int(top_predictions.notna().sum())

  if non_na_size < n:
    return handle_edge_case(top_predictions[top_predictions.notna()], w, n, non_na_size)

  return top_predictions

In [None]:
# Load reduced top 30 similarity matrix
# The row labels come back as an ordinary column named 'Unnamed: 0';
# promote that column to the index.
top_30 = pd.read_csv('S_top_30.csv').set_index('Unnamed: 0')

top_30.head()

Unnamed: 0_level_0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,


### 5. Test your function

In [None]:
# 1. Actual user: u1181
# Copy that user's row out of the rating matrix and ask myIBCF for its
# default n=10 recommendations using the reloaded top-30 similarity matrix.
user_1181 = rmat.loc["u1181"].copy()
user_1181_recs = myIBCF(user_1181, top_30)
# Bare expression: renders the recommendations as this cell's output.
user_1181_recs

Unnamed: 0,predictions
m3732,5.0
m749,4.5265592
m3899,4.526066
m1039,4.0
m1235,4.0
m1253,4.0
m1734,4.0
m1914,4.0
m2082,4.0
m2361,4.0


In [None]:
# 2. Hypothetical new user
# Start from an all-NaN (nothing rated) profile over the same movie labels as
# user_1181, then rate exactly two movies.
new_user = pd.Series(np.nan, index=user_1181.index, name="rating")

# Set specific values for indices "m1613" and "m1755"
new_user.loc["m1613"] = 5
new_user.loc["m1755"] = 4

In [None]:
new_user_recs = myIBCF(new_user, top_30)
new_user_recs

Unnamed: 0,predictions
m1017,5.0
m2805,5.0
m3269,5.0
m691,5.0
m74,5.0
m765,5.0
m1100,5.0
m1468,5.0
m1541,5.0
m158,5.0
