In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## 1. Load small MovieLens-dataset

In [2]:
# Read the ratings file with movieId as index, move it back into the columns,
# and discard the timestamp column, which is not needed for the recommender.
user_rating_init = (
    pd.read_csv("./data/ml-latest-small/ratings.csv", index_col=1)
    .reset_index()
    .drop(columns=['timestamp'])
)
user_rating_init

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,3,1,4.0
2,6,1,4.0
3,47,1,5.0
4,50,1,5.0
...,...,...,...
100831,166534,610,4.0
100832,168248,610,5.0
100833,168250,610,5.0
100834,168252,610,5.0


## 2. Edit data

### 2.1 Add movie titles

In [3]:
# Reshape the long ratings table into a wide user x movie matrix:
# one row per userId, one column per movieId, each cell holding the rating
# (NaN where the user has not rated the movie).
user_item_init = user_rating_init.pivot(index='userId', columns='movieId', values='rating')
user_item_init

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [4]:
# Load the movie titles and genres. The file is read with the title as index,
# which is then swapped for movieId so the frame can be joined on movieId.
movie_genre = (
    pd.read_csv("./data/ml-latest-small/movies.csv", index_col=1)
    .reset_index()
    .set_index('movieId')
)

In [5]:
# Join titles onto the (transposed) rating matrix via movieId, then substitute
# the movieId labels with the movie titles and flip back to users-as-rows.
user_item = (
    movie_genre
    .merge(user_item_init.T, left_index=True, right_index=True)
    .drop(columns='genres')        # genres are not used by the recommender
    .reset_index(drop=True)        # discard movieId; the title identifies the movie from here on
    .set_index('title')
    .T
    .rename_axis(index='userId')
)
user_item

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### 2.2 Calculate matrix sparsity

In [6]:
# Sparsity = share of user/movie cells that carry no rating
n_missing_ratings = user_item.isna().sum().sum()
n_missing_ratings / user_item.size

0.9830003169443864

### 2.3 delete all movies with less than 20 votes

In [7]:
# Boolean mask over the movie columns: True where a movie has at least 20 ratings
mask_at_least_20_evals = user_item.notna().sum() >= 20
# Keep only the movie columns that satisfy the mask. The mask is applied by
# position rather than by dropping labels: a label-based drop removes *every*
# column carrying a given title, so a well-rated movie would be lost whenever a
# rarely rated movie happens to share its title.
user_item = user_item.loc[:, mask_at_least_20_evals.to_numpy()]
user_item

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,2.5,,2.5,,4.0,...,,,,,,,,,,
607,4.0,,,,,,,3.0,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,4.0,,4.5,,...,,,,,,,,,,
609,3.0,,,,,,4.0,,,,...,,,,,,,,,,


### 2.4 Fill the NaN by average of movie

In [8]:
# Average rating of every movie (NaN cells are skipped by mean())
movie_means = user_item.mean()
# Replace each missing rating with the average rating of that movie
user_item_f = user_item.fillna(movie_means)
user_item_f

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00000,3.431818,4.000000,3.071429,4.000000,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
2,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
3,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
4,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
5,4.00000,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50000,3.431818,3.259615,3.071429,3.946078,2.500000,3.496212,2.500000,3.926829,4.000000,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
607,4.00000,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.000000,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
608,2.50000,2.000000,2.000000,3.071429,3.946078,3.185185,4.000000,3.671429,4.500000,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
609,3.00000,3.431818,3.259615,3.071429,3.946078,3.185185,4.000000,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28


## 3. Create a User-User-Matrix with Cosine-Similarity

### 3.1. create user-user-matrix manually

In [9]:
def cosine_sim(vec_x, vec_y):
    '''Calculate the cosine similarity of vector x and vector y.

    Parameters
    ----------
    vec_x, vec_y : 1-D array-likes of numbers with the same length.

    Returns
    -------
    float
        dot(x, y) / (|x| * |y|). If either vector has zero length (norm 0) the
        similarity is undefined; 0.0 is returned in that case instead of NaN,
        which matches sklearn.metrics.pairwise.cosine_similarity.

    Raises
    ------
    AssertionError
        If the two vectors differ in length.
    '''
    assert len(vec_x) == len(vec_y)
    num = np.dot(vec_x, vec_y)
    # dot product of a vector with itself = the sum of all squared vector-elements
    denom = np.sqrt(np.dot(vec_x, vec_x)) * np.sqrt(np.dot(vec_y, vec_y))
    if denom == 0:
        # at least one zero vector: avoid 0/0 (NaN plus a RuntimeWarning)
        return 0.0
    return num / denom

In [10]:
def create_user_user_matrix(df):
    '''Create a user-user-matrix of cosine similarities.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user; every row is treated as that user's numeric vector.

    Returns
    -------
    pandas.DataFrame
        Square frame with df.index as both index and columns; the cell
        (a, b) holds cosine_sim of the rows of user a and user b.
    '''
    users = df.index
    # Pull the rows out once as a plain array instead of doing a label lookup
    # (df.loc[...]) twice for every pair of users inside the loop.
    rows = df.to_numpy()
    n_users = len(users)
    sims = np.empty((n_users, n_users))
    for i in range(n_users):
        # cosine similarity is symmetric, so only the upper triangle is
        # computed and each value is mirrored into the lower triangle
        for j in range(i, n_users):
            sims[i, j] = sims[j, i] = cosine_sim(rows[i], rows[j])
    user_user_matrix = pd.DataFrame(data=sims, columns=users, index=users)
    return user_user_matrix

In [11]:
# Build the user-user similarity matrix with the hand-written implementation,
# using the mean-filled rating matrix as input
user_user_cos = create_user_user_matrix(df=user_item_f)
user_user_cos

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.994206,0.989798,0.987675,0.993643,0.990087,0.989413,0.993621,0.994149,0.989263,...,0.993340,0.991866,0.976639,0.992999,0.991463,0.988512,0.990445,0.977803,0.994362,0.988388
2,0.994206,1.000000,0.994932,0.992378,0.998747,0.995278,0.994542,0.998526,0.999149,0.994342,...,0.998478,0.996876,0.983352,0.998277,0.997103,0.994172,0.995410,0.984143,0.999473,0.993805
3,0.989798,0.994932,1.000000,0.987847,0.994074,0.991245,0.990120,0.994039,0.994803,0.989916,...,0.993958,0.992439,0.979459,0.993931,0.992715,0.989013,0.991144,0.979801,0.995051,0.989796
4,0.987675,0.992378,0.987847,1.000000,0.991389,0.988909,0.988778,0.991908,0.992265,0.989138,...,0.991237,0.990616,0.976278,0.991404,0.990710,0.986998,0.988850,0.976331,0.992621,0.986249
5,0.993643,0.998747,0.994074,0.991389,1.000000,0.994638,0.993855,0.997625,0.998476,0.993514,...,0.997882,0.996065,0.982864,0.997518,0.996203,0.993459,0.994785,0.983225,0.998893,0.992800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.988512,0.994172,0.989013,0.986998,0.993459,0.989494,0.988986,0.993359,0.993918,0.988491,...,0.993485,0.991477,0.978825,0.993333,0.991823,1.000000,0.989557,0.979159,0.994398,0.987909
607,0.990445,0.995410,0.991144,0.988850,0.994785,0.992167,0.991132,0.994709,0.995244,0.990582,...,0.994994,0.993229,0.979893,0.994334,0.993123,0.989557,1.000000,0.980376,0.995461,0.989609
608,0.977803,0.984143,0.979801,0.976331,0.983225,0.978175,0.978339,0.983644,0.983880,0.977983,...,0.983896,0.981886,0.968146,0.982829,0.981889,0.979159,0.980376,1.000000,0.984611,0.978510
609,0.994362,0.999473,0.995051,0.992621,0.998893,0.995375,0.994607,0.998788,0.999291,0.994515,...,0.998710,0.996893,0.983689,0.998551,0.997338,0.994398,0.995461,0.984611,1.000000,0.993551


### 3.2. create user-user-matrix with scikit learn

In [12]:
# Same similarity matrix, computed by scikit-learn; the raw array it returns is
# wrapped in a DataFrame labelled with the user ids on both axes.
user_ids = user_item_f.index
user_user_sklearn = pd.DataFrame(
    data=cosine_similarity(user_item_f),
    index=user_ids,
    columns=user_ids,
)
user_user_sklearn

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.994206,0.989798,0.987675,0.993643,0.990087,0.989413,0.993621,0.994149,0.989263,...,0.993340,0.991866,0.976639,0.992999,0.991463,0.988512,0.990445,0.977803,0.994362,0.988388
2,0.994206,1.000000,0.994932,0.992378,0.998747,0.995278,0.994542,0.998526,0.999149,0.994342,...,0.998478,0.996876,0.983352,0.998277,0.997103,0.994172,0.995410,0.984143,0.999473,0.993805
3,0.989798,0.994932,1.000000,0.987847,0.994074,0.991245,0.990120,0.994039,0.994803,0.989916,...,0.993958,0.992439,0.979459,0.993931,0.992715,0.989013,0.991144,0.979801,0.995051,0.989796
4,0.987675,0.992378,0.987847,1.000000,0.991389,0.988909,0.988778,0.991908,0.992265,0.989138,...,0.991237,0.990616,0.976278,0.991404,0.990710,0.986998,0.988850,0.976331,0.992621,0.986249
5,0.993643,0.998747,0.994074,0.991389,1.000000,0.994638,0.993855,0.997625,0.998476,0.993514,...,0.997882,0.996065,0.982864,0.997518,0.996203,0.993459,0.994785,0.983225,0.998893,0.992800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.988512,0.994172,0.989013,0.986998,0.993459,0.989494,0.988986,0.993359,0.993918,0.988491,...,0.993485,0.991477,0.978825,0.993333,0.991823,1.000000,0.989557,0.979159,0.994398,0.987909
607,0.990445,0.995410,0.991144,0.988850,0.994785,0.992167,0.991132,0.994709,0.995244,0.990582,...,0.994994,0.993229,0.979893,0.994334,0.993123,0.989557,1.000000,0.980376,0.995461,0.989609
608,0.977803,0.984143,0.979801,0.976331,0.983225,0.978175,0.978339,0.983644,0.983880,0.977983,...,0.983896,0.981886,0.968146,0.982829,0.981889,0.979159,0.980376,1.000000,0.984611,0.978510
609,0.994362,0.999473,0.995051,0.992621,0.998893,0.995375,0.994607,0.998788,0.999291,0.994515,...,0.998710,0.996893,0.983689,0.998551,0.997338,0.994398,0.995461,0.984611,1.000000,0.993551


## 4. Recommend movies to users

### 4.1. create a dict of unseen movies

In [13]:
# melt the dataframe to long format (one row per user/movie pair).
# Note: the column is named 'movieId' but holds movie titles, because the
# columns of user_item are titles.
init_long = pd.melt(user_item.reset_index(),id_vars='userId',value_name='ratings',var_name='movieId')
# Create a dict with a list of unseen movies (value) for each user (key).
# The lists are read straight from the wide matrix, one row mask per user,
# instead of re-filtering the whole long table once per user; the movies keep
# the column order of user_item.
unseen_mask = user_item.isna().to_numpy()
unseen_dict = {user : list(user_item.columns[row_mask])
                   for user, row_mask in zip(user_item.index, unseen_mask)}
# unseen_dict

### 4.2. create a dictionary of most similar users

In [14]:
# For each user (key): all *other* users, ordered from most to least similar
# by cosine similarity (value). The user's own entry is masked out first.
top_us_dict = dict(
    (
        user,
        list(
            user_user_cos[user]
            .loc[user_user_cos.index != user]
            .sort_values(ascending=False)
            .index
        ),
    )
    for user in user_user_cos.index
)
# top_us_dict

### 4.3. Recommender based on top 5 users

In [15]:
def recommend_movie(user, n_neighbors=5):
    '''Score every movie the user has not seen and rank them.

    For each unseen movie, the n_neighbors most similar users who have rated
    that movie are selected, and the score is the similarity-weighted average
    of their ratings.

    Reads the notebook-level objects unseen_dict, top_us_dict, user_user_cos
    and user_item.

    Parameters
    ----------
    user : key of unseen_dict / top_us_dict and label in user_user_cos.
    n_neighbors : int, default 5
        Maximum number of similar users whose ratings enter a movie's score.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'movie' with a single 'score' column, sorted by score in
        descending order. A movie that none of the other users has rated gets
        a NaN score (sorted last).

    Raises
    ------
    ValueError
        If n_neighbors is smaller than 1.
    '''
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    # per-user lookups do not depend on the movie, so fetch them once
    ranked_neighbors = top_us_dict[user]
    similarities = user_user_cos[user]
    # small constant added per neighbor to keep the denominator away from zero
    epsilon = 0.0000001

    recommendations = []
    # iterate over all movies the user hasn't seen and calculate its score with cosine similarity
    for movie in unseen_dict[user]:
        # ratings of all users who have actually rated the movie
        movie_ratings = user_item[movie].dropna()
        raters = set(movie_ratings.index)
        # walk the neighbors from most to least similar and keep the first
        # n_neighbors of them who rated this movie
        nearest_raters = []
        for candidate in ranked_neighbors:
            if len(nearest_raters) == n_neighbors:
                break
            if candidate in raters:
                nearest_raters.append(candidate)
        num = 0
        den = 0
        # similarity-weighted average of the neighbors' ratings
        for neighbor in nearest_raters:
            sim = similarities.loc[neighbor]
            num += sim * movie_ratings.loc[neighbor]
            den += sim + epsilon
        # without any rating neighbor the score is undefined (would be 0/0)
        score = num / den if nearest_raters else float('nan')
        recommendations.append([movie, score])
    # create a dataframe out of the recommendations and sort it by score
    result = pd.DataFrame(data=recommendations, columns=['movie', 'score'])
    result = result.sort_values(by='score', ascending=False).set_index(['movie'])

    return result

In [16]:
# Ranked movie recommendations for the user with userId 1
recommendations = recommend_movie(user=1)
recommendations

Unnamed: 0_level_0,score
movie,Unnamed: 1_level_1
Cinema Paradiso (Nuovo cinema Paradiso) (1989),4.900014
Life Is Beautiful (La Vita è bella) (1997),4.899987
To Kill a Mockingbird (1962),4.799993
"Third Man, The (1949)",4.799980
Brazil (1985),4.700034
...,...
Coneheads (1993),1.799988
RoboCop 2 (1990),1.700618
Babe: Pig in the City (1998),1.700160
"Mask of Zorro, The (1998)",1.600232
