In [128]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats
# Visualization
import seaborn as sns
# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [129]:
ratings_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("../ml-100k/u.data", sep="\t", names=ratings_cols, encoding="latin-1",)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [130]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [131]:
print('The dataset has', ratings['user_id'].nunique(), 'unique users') # Number of users
print('The dataset has', ratings['movie_id'].nunique(), 'unique movies') # Number of movies
print('The dataset has', ratings['rating'].nunique(), 'unique ratings') # Number of ratings
print('The unique ratings are', sorted(ratings['rating'].unique())) # List of unique ratings

The dataset has 943 unique users
The dataset has 1682 unique movies
The dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]


In [132]:
movies_cols = [ "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL", 
            "unknown", "Action", "Adventure", "Animation",
            "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
            "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
            "Thriller", "War", "Western"]
movies = pd.read_csv("../ml-100k/u.item", sep="|", names=movies_cols, encoding="latin-1",)
movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [133]:
df = pd.merge(ratings, movies, on='movie_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             100000 non-null  int64  
 1   movie_id            100000 non-null  int64  
 2   rating              100000 non-null  int64  
 3   timestamp           100000 non-null  int64  
 4   movie_title         100000 non-null  object 
 5   release_date        99991 non-null   object 
 6   video_release_date  0 non-null       float64
 7   IMDb_URL            99987 non-null   object 
 8   unknown             100000 non-null  int64  
 9   Action              100000 non-null  int64  
 10  Adventure           100000 non-null  int64  
 11  Animation           100000 non-null  int64  
 12  Children's          100000 non-null  int64  
 13  Comedy              100000 non-null  int64  
 14  Crime               100000 non-null  int64  
 15  Documentary         100000 non-null

In [134]:
df

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962),01-Jan-1962,,http://us.imdb.com/M/title-exact?Mamma%20Roma%...,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,655,1640,3,888474646,"Eighth Day, The (1996)",01-Nov-1996,,"http://us.imdb.com/Title?Huiti%E8me+jour,+Le+(...",0,0,...,0,0,0,0,0,0,0,0,0,0
99997,655,1637,3,888984255,Girls Town (1996),23-Aug-1996,,http://us.imdb.com/M/title-exact?Girls%20Town%...,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1...",02-Feb-1996,,http://us.imdb.com/M/title-exact?Saimt%20el%20...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
# IGNORE --------------------------------------------------------------------------------------------------------------------------------------
# filter the movies and keep only those with over 50 ratings for the analysis
agg_ratings = df.groupby('movie_title').agg(mean_rating = ('rating', 'mean'), number_of_ratings = ('rating', 'count')).reset_index()
print(agg_ratings)
agg_ratings_GT100 = agg_ratings[agg_ratings['number_of_ratings']>50]
agg_ratings_GT100.info()

                                movie_title  mean_rating  number_of_ratings
0                 'Til There Was You (1997)     2.333333                  9
1                              1-900 (1994)     2.600000                  5
2                     101 Dalmatians (1996)     2.908257                109
3                       12 Angry Men (1957)     4.344000                125
4                                187 (1997)     3.024390                 41
...                                     ...          ...                ...
1659                   Young Guns II (1990)     2.772727                 44
1660  Young Poisoner's Handbook, The (1995)     3.341463                 41
1661                Zeus and Roxanne (1997)     2.166667                  6
1662                                unknown     3.444444                  9
1663     Á köldum klaka (Cold Fever) (1994)     3.000000                  1

[1664 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 

In [136]:
agg_ratings_GT100.head()

Unnamed: 0,movie_title,mean_rating,number_of_ratings
2,101 Dalmatians (1996),2.908257,109
3,12 Angry Men (1957),4.344,125
5,2 Days in the Valley (1996),3.225806,93
6,"20,000 Leagues Under the Sea (1954)",3.5,72
7,2001: A Space Odyssey (1968),3.969112,259


In [137]:
agg_ratings_GT100.sort_values(by='number_of_ratings', ascending=False).head()

Unnamed: 0,movie_title,mean_rating,number_of_ratings
1398,Star Wars (1977),4.358491,583
333,Contact (1997),3.803536,509
498,Fargo (1996),4.155512,508
1234,Return of the Jedi (1983),4.00789,507
860,Liar Liar (1997),3.156701,485


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             100000 non-null  int64  
 1   movie_id            100000 non-null  int64  
 2   rating              100000 non-null  int64  
 3   timestamp           100000 non-null  int64  
 4   movie_title         100000 non-null  object 
 5   release_date        99991 non-null   object 
 6   video_release_date  0 non-null       float64
 7   IMDb_URL            99987 non-null   object 
 8   unknown             100000 non-null  int64  
 9   Action              100000 non-null  int64  
 10  Adventure           100000 non-null  int64  
 11  Animation           100000 non-null  int64  
 12  Children's          100000 non-null  int64  
 13  Comedy              100000 non-null  int64  
 14  Crime               100000 non-null  int64  
 15  Documentary         100000 non-null

In [139]:
df_GT100 = pd.merge(df, agg_ratings_GT100[['movie_title']], on='movie_title', how='inner')
df_GT100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83619 entries, 0 to 83618
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             83619 non-null  int64  
 1   movie_id            83619 non-null  int64  
 2   rating              83619 non-null  int64  
 3   timestamp           83619 non-null  int64  
 4   movie_title         83619 non-null  object 
 5   release_date        83619 non-null  object 
 6   video_release_date  0 non-null      float64
 7   IMDb_URL            83619 non-null  object 
 8   unknown             83619 non-null  int64  
 9   Action              83619 non-null  int64  
 10  Adventure           83619 non-null  int64  
 11  Animation           83619 non-null  int64  
 12  Children's          83619 non-null  int64  
 13  Comedy              83619 non-null  int64  
 14  Crime               83619 non-null  int64  
 15  Documentary         83619 non-null  int64  
 16  Dram

In [140]:
print('The ratings dataset has', df_GT100['user_id'].nunique(), 'unique users') # Number of users
print('The ratings dataset has', df_GT100['movie_id'].nunique(), 'unique movies') # Number of movies
print('The ratings dataset has', df_GT100['rating'].nunique(), 'unique ratings') # Number of ratings
print('The unique ratings are', sorted(df_GT100['rating'].unique())) # List of unique ratings

# END IGNORE --------------------------------------------------------------------------------------------------------------------------------------

The ratings dataset has 943 unique users
The ratings dataset has 604 unique movies
The ratings dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]


In [141]:
matrix = df_GT100.pivot_table(index='movie_title', columns='user_id', values='rating')
matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),2.0,,,,2.0,,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),5.0,,,,,4.0,4.0,,,5.0,...,,,,,,,,,,
2 Days in the Valley (1996),,,,,,,,,,,...,,,4.0,,,,,,,2.0
"20,000 Leagues Under the Sea (1954)",3.0,,,,,,5.0,,,,...,,,,,,,,,,
2001: A Space Odyssey (1968),4.0,,,,4.0,5.0,5.0,,,5.0,...,4.0,,,,,,,,3.0,


In [142]:
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 'rows')
matrix_norm.head()
# this is basically like (x-xbar)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),-0.908257,,,,-0.908257,,,,,,...,-0.908257,,,-0.908257,1.091743,,,,,
12 Angry Men (1957),0.656,,,,,-0.344,-0.344,,,0.656,...,,,,,,,,,,
2 Days in the Valley (1996),,,,,,,,,,,...,,,0.774194,,,,,,,-1.225806
"20,000 Leagues Under the Sea (1954)",-0.5,,,,,,1.5,,,,...,,,,,,,,,,
2001: A Space Odyssey (1968),0.030888,,,,0.030888,1.030888,1.030888,,,1.030888,...,0.030888,,,,,,,,-0.969112,


In [143]:
user_similarity = matrix_norm.corr()
user_similarity.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.059995,-0.130228,-0.024854,0.136413,0.115656,0.013388,0.446327,-0.840889,-0.392623,...,-0.094935,-0.55731,0.126189,-0.2408,0.074506,0.110449,0.04519,-0.013261,-0.518892,-0.281295
2,0.059995,1.0,-0.022677,-0.3262,-0.132564,0.101443,0.428124,0.19397,-0.439612,0.195774,...,0.078402,-0.30165,-0.010533,0.447909,-0.118931,-0.136792,-0.310935,-0.443221,0.118639,-0.024551
3,-0.130228,-0.022677,1.0,-0.235727,,-0.030368,-0.172556,-0.303092,-1.0,0.411608,...,0.347846,-1.0,0.037067,-0.368064,-0.109197,1.0,-0.120325,0.894564,-0.141099,1.0
4,-0.024854,-0.3262,-0.235727,1.0,1.0,-0.67857,-0.638721,0.501839,1.0,-0.747513,...,0.996942,,-0.203454,-0.829935,0.200605,,0.705994,0.977947,0.686583,-0.546167
5,0.136413,-0.132564,,1.0,1.0,-0.021158,-0.054497,0.132757,0.063712,-0.031703,...,0.079837,-0.595695,-0.050222,-0.089929,0.221115,0.534356,-0.079468,0.183068,0.079783,0.15479


In [144]:
user_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
user_similarity_cosine

array([[ 1.        , -0.01280713,  0.03409722, ..., -0.0053008 ,
         0.04159865,  0.03465869],
       [-0.01280713,  1.        ,  0.020511  , ...,  0.08390492,
         0.03170214,  0.02837992],
       [ 0.03409722,  0.020511  ,  1.        , ..., -0.01170388,
         0.00435578,  0.04621845],
       ...,
       [-0.0053008 ,  0.08390492, -0.01170388, ...,  1.        ,
         0.04303836,  0.00185024],
       [ 0.04159865,  0.03170214,  0.00435578, ...,  0.04303836,
         1.        ,  0.19846637],
       [ 0.03465869,  0.02837992,  0.04621845, ...,  0.00185024,
         0.19846637,  1.        ]])

In [145]:
# Pick a user ID
picked_userid = 1
# Remove picked user ID from the candidate list
user_similarity.drop(index=picked_userid, inplace=True)
# Take a look at the data
user_similarity.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.059995,1.0,-0.022677,-0.3262,-0.132564,0.101443,0.428124,0.19397,-0.439612,0.195774,...,0.078402,-0.30165,-0.010533,0.447909,-0.118931,-0.136792,-0.310935,-0.443221,0.118639,-0.024551
3,-0.130228,-0.022677,1.0,-0.235727,,-0.030368,-0.172556,-0.303092,-1.0,0.411608,...,0.347846,-1.0,0.037067,-0.368064,-0.109197,1.0,-0.120325,0.894564,-0.141099,1.0
4,-0.024854,-0.3262,-0.235727,1.0,1.0,-0.67857,-0.638721,0.501839,1.0,-0.747513,...,0.996942,,-0.203454,-0.829935,0.200605,,0.705994,0.977947,0.686583,-0.546167
5,0.136413,-0.132564,,1.0,1.0,-0.021158,-0.054497,0.132757,0.063712,-0.031703,...,0.079837,-0.595695,-0.050222,-0.089929,0.221115,0.534356,-0.079468,0.183068,0.079783,0.15479
6,0.115656,0.101443,-0.030368,-0.67857,-0.021158,1.0,0.050865,0.406131,-0.307431,0.008428,...,-0.130563,-0.379526,0.053459,0.043996,-0.076073,-0.131677,-0.263438,0.10658,-0.249217,-0.109225


In [146]:
# Number of similar users
n = 10
# User similarity threashold
user_similarity_threshold = 0.3
# Get top n similar users
similar_users = user_similarity[user_similarity[picked_userid]>user_similarity_threshold][picked_userid].sort_values(ascending=False)[:n]
# Print out top n similar users
print(f'The similar users for user {picked_userid} are', similar_users)

The similar users for user 1 are user_id
811    1.0
351    1.0
418    1.0
511    1.0
547    1.0
356    1.0
531    1.0
273    1.0
39     1.0
309    1.0
Name: 1, dtype: float64


In [147]:
def getList(dict):
    list = []
    for key in dict.keys():
        list.append(key)
         
    return list

In [148]:
dic = similar_users.to_dict()
lst = getList(dic)

In [149]:
picked_userid_watched = pd.DataFrame(matrix_norm[picked_userid].dropna(axis=0, how='all').sort_values(ascending=False)).reset_index().rename(columns={picked_userid:'rating'})
picked_userid_watched

Unnamed: 0,movie_title,rating
0,Mars Attacks! (1996),2.152074
1,Kids in the Hall: Brain Candy (1996),1.923077
2,Dolores Claiborne (1994),1.658228
3,Mighty Aphrodite (1995),1.581522
4,Mystery Science Theater 3000: The Movie (1996),1.569231
...,...,...
226,"Nightmare on Elm Street, A (1984)",-2.171171
227,Homeward Bound: The Incredible Journey (1993),-2.213115
228,Breakfast at Tiffany's (1961),-2.726316
229,"Sound of Music, The (1965)",-2.765766


In [153]:
# Movies that similar users watched. Remove movies that none of the similar users have watched
similar_user_movies = pd.DataFrame(matrix_norm[lst[0]].dropna(axis=0, how='all').sort_values(ascending=False)).reset_index().rename(columns={lst[0]:'rating'})
similar_user_movies

Unnamed: 0,movie_title,rating
0,Volcano (1997),2.191781
1,Dante's Peak (1997),2.066667
2,Scream 2 (1997),1.783019
3,In & Out (1997),1.695652
4,"Beautician and the Beast, The (1997)",1.686047
5,Seven Years in Tibet (1997),1.541935
6,Fly Away Home (1996),1.445513
7,Air Force One (1997),1.36891
8,"English Patient, The (1996)",1.343035
9,Flubber (1997),1.245283


In [None]:
# Remove the watched movie from the movie list
similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
# Take a look at the data
similar_user_movies

In [None]:
item_score = {}
# Loop through items
for i in similar_user_movies.columns:
    # Get the ratings for movie i
    movie_rating = similar_user_movies[i]
    # Create a variable to store the score
    total = 0
    # Create a variable to store the number of scores
    count = 0
    # Loop through similar users
    for u in similar_users.index:
        # If the movie has rating
        if pd.isna(movie_rating[u]) == False:
            # Score is the sum of user similarity score multiply by the movie rating
            score = similar_users[u] * movie_rating[u]
            # Add the score to the total score for the movie so far
            total += score
            # Add 1 to the count
            count += 1
    # Get the average score for the item
    item_score[i] = total / count
# Convert dictionary to pandas dataframe
item_score = pd.DataFrame(item_score.items(), columns=['movie', 'movie_score'])

# Sort the movies by score
ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)
# Select top m movies
m = 10
ranked_item_score.head(m)


In [None]:
# Average rating for the picked user
avg_rating = matrix[matrix.index == picked_userid].T.mean()[picked_userid]
# Print the average movie rating for user 1
print(f'The average movie rating for user {picked_userid} is {avg_rating:.2f}')
# Calcuate the predicted rating
ranked_item_score['predicted_rating'] = ranked_item_score['movie_score'] + avg_rating
# Take a look at the data
ranked_item_score.head(m)