In [42]:
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics import mean_absolute_error
import numpy as np

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

### restructure the movie ratings dataset

In [3]:
piv_ratings = pd.pivot(data=ratings, index='userId', columns='movieId', values='rating')


### Impute missing values

In [4]:
imputer = SimpleImputer(strategy='constant',fill_value=0)
imp_ratings = pd.DataFrame(imputer.fit_transform(piv_ratings), index=piv_ratings.index, columns=piv_ratings.columns)


### Create a similarity matrix

In [5]:
similarity = pd.DataFrame(cosine_similarity(imp_ratings), index=piv_ratings.index, columns=piv_ratings.index)
similarity

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


### Build a recommender function

In [117]:
def find_movie_names(ids):
    '''Takes a movieId or a list of movieIds and returns a list with its/their corresponding movie titles.'''
    movie_titles = list()
    if isinstance(ids,list):
        for el in ids:
            movie_titles.append(find_single_movie_name(el))
    elif (isinstance(ids,str) and ids.isnumeric()) or isinstance(ids, int) or isinstance(ids, float):
        movie_titles.append(find_single_movie_name(ids))
    else:
        print(f'Data type of input ({ids}) not valid.')
    return movie_titles

def find_single_movie_name(id):
    '''Takes a single movieId and returns the corresponding title.'''
    return movies.query(f'movieId=={id}').title.values[0]

def recommender(user_id,n):
    '''Recommends n movies to watch for a user. 
    
    Takes a user_id and the number (n) of movies to recommend. 
    Recommendations are based on already rated movies and the similarity to users. 
    Movies that the user has already rated are excluded from the recommendations.
     '''
    #calculate weights and remove user themselves
    weights = similarity.query(f'userId!={user_id}')[user_id] / sum(similarity.query(f'userId!={user_id}')[user_id])
    weighted_ratings = imp_ratings.multiply(weights, axis='index').sum(axis=0)
    
    # remove movies that the user has already watched
    already_seen = [col for col in imp_ratings.columns if imp_ratings.loc[user_id,col]!=0]
    weighted_ratings = weighted_ratings.drop(index = already_seen)

    # get the titles of the n movies with the highest predicted ratings
    recom_n_movieId = weighted_ratings.sort_values(ascending=False).head(n).index
    recom_n_movieTitles = find_movie_names(list(recom_n_movieId))
    return recom_n_movieTitles

recommender(6,5)

['Toy Story (1995)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Mission: Impossible (1996)',
 'Matrix, The (1999)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)']

In [110]:
col=imp_ratings.columns[0]
imp_ratings.loc[id,col]!=0

True

In [112]:
len([col for col in imp_ratings.columns if imp_ratings.loc[id,col]!=0])

2698

In [44]:
imp_ratings.query('userId==414')

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
414,4.0,3.0,4.0,0.0,2.0,3.0,3.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
id = 414
array=imp_ratings.iloc[id-1,:].to_numpy().nonzero()[0]
array


array([   0,    1,    2, ..., 9638, 9677, 9692], dtype=int64)

In [94]:
imp_ratings.query(f'userId=={id} & ')

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19,4.0,3.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
217,4.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,4.0,3.0,4.0,0.0,2.0,3.0,3.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
555,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599,3.0,2.5,1.5,0.0,0.0,4.5,2.5,0.0,1.5,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
