# Sistema Colaborativo de Recomendación

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [2]:
# Load the movie, rating and tag tables (movies and tags are latin1-encoded).
movies = pd.read_csv('dataset_2/movies.csv', encoding='latin1')
ratings = pd.read_csv('dataset_2/ratings.csv')
tags = pd.read_csv('dataset_2/tags.csv', encoding='latin1')

# One row per user holding that user's mean rating (columns: userId, rating).
mean = ratings.groupby('userId', as_index=False)['rating'].mean()

# Attach the user mean to every rating row. Both inputs have a 'rating'
# column, so the merge names them rating_x (raw rating) and rating_y (user mean).
rating_avg = ratings.merge(mean, on='userId')
# Mean-centred rating: how far each rating sits from its user's average.
rating_avg = rating_avg.assign(
    adg_rating=rating_avg['rating_x'] - rating_avg['rating_y']
)
rating_avg.head(3)

Unnamed: 0,userId,movieId,rating_x,timestamp,rating_y,adg_rating
0,12882,1,4.0,1147195252,4.061321,-0.061321
1,12882,32,3.5,1147195307,4.061321,-0.561321
2,12882,47,5.0,1147195343,4.061321,0.938679


In [3]:
# (rows, columns) of the merged ratings table.
rating_avg.shape

(264505, 6)

In [4]:
# Column dtypes, non-null counts and memory footprint of the merged ratings table.
rating_avg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264505 entries, 0 to 264504
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   userId      264505 non-null  int64  
 1   movieId     264505 non-null  int64  
 2   rating_x    264505 non-null  float64
 3   timestamp   264505 non-null  int64  
 4   rating_y    264505 non-null  float64
 5   adg_rating  264505 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 14.1 MB


In [5]:
# Column labels of the movies table.
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [6]:
# Column labels of the ratings table.
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [7]:
# Column labels of the tags table.
tags.columns

Index(['movieId', 'userId', 'tag', 'timestamp'], dtype='object')

## Similitud de coseno

In [8]:
# User x movie matrix of mean-centred ratings; (user, movie) pairs without a
# rating show up as NaN.
final = rating_avg.pivot_table(
    index='userId',
    columns='movieId',
    values='adg_rating',
)

In [9]:
# Peek at the first three users of the (still NaN-sparse) user x movie matrix.
final.head(3)

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,-0.829457,,,,,,-1.329457,,-0.829457,,...,,,,,,,,,,
320,,,,,,,,,,,...,,,,,,,,,,
359,1.314526,,,,,1.314526,,,0.314526,0.314526,...,,,,,,,,,,


In [10]:
# (number of users, number of movies) in the pivoted matrix.
final.shape

(862, 2500)

In [11]:
# Remove the NaNs, in two alternative ways.

# Variant 1: fill every gap with the average of that movie (column-wise mean).
final_movie = final.fillna(value=final.mean(axis=0))

# Variant 2: fill every gap with the average of that user (row-wise mean).
# The values are already centred on each user's mean, so this fill value is
# ~0 up to floating-point rounding.
final_user = final.apply(lambda user_row: user_row.fillna(user_row.mean()), axis=1)

In [12]:
# Show the movie-average-filled matrix; a bare last expression gets the
# notebook's rich table rendering instead of wrapped plain text.
final_movie.head(3)

movieId    1         2         3         4         5         6         7       \
userId                                                                          
316     -0.829457 -0.436518 -0.468109 -0.770223 -0.615331  0.320415 -1.329457   
320      0.200220 -0.436518 -0.468109 -0.770223 -0.615331  0.320415 -0.203889   
359      1.314526 -0.436518 -0.468109 -0.770223 -0.615331  1.314526 -0.203889   

movieId    9         10        11      ...    106487    106489    106782  \
userId                                 ...                                 
316     -0.690175 -0.829457 -0.094277  ...  0.105075  0.006629  0.262314   
320     -0.690175 -0.150642 -0.094277  ...  0.105075  0.006629  0.262314   
359     -0.690175  0.314526  0.314526  ...  0.105075  0.006629  0.262314   

movieId   106920    109374    109487   111362    111759    112556    112852  
userId                                                                       
316      0.23735  0.429868  0.306567  0.22511  0.234458  

In [13]:
# Show the user-average-filled matrix; a bare last expression gets the
# notebook's rich table rendering instead of wrapped plain text.
final_user.head(3)

movieId        1             2             3             4             5       \
userId                                                                          
316     -8.294574e-01 -1.445872e-16 -1.445872e-16 -1.445872e-16 -1.445872e-16   
320     -5.013910e-17 -5.013910e-17 -5.013910e-17 -5.013910e-17 -5.013910e-17   
359      1.314526e+00  9.809414e-17  9.809414e-17  9.809414e-17  9.809414e-17   

movieId        6             7             9             10            11      \
userId                                                                          
316     -1.445872e-16 -1.329457e+00 -1.445872e-16 -8.294574e-01 -1.445872e-16   
320     -5.013910e-17 -5.013910e-17 -5.013910e-17 -5.013910e-17 -5.013910e-17   
359      1.314526e+00  9.809414e-17  9.809414e-17  3.145258e-01  3.145258e-01   

movieId  ...        106487        106489        106782        106920  \
userId   ...                                                           
316      ... -1.445872e-16 -1.445872e-16 -1.

## Calcular la similitud de coseno entre los usuarios

In [14]:
# Pairwise cosine similarity between users, computed on the
# movie-average-filled matrix (one row per user).
cosine = cosine_similarity(final_movie)
# Zero the diagonal so a user's similarity with themselves does not dominate
# any later neighbour ranking.
np.fill_diagonal(cosine, 0)

# Label both axes from final_movie, the matrix the similarities were computed
# on, so this cell does not depend on any other frame for its labels.
similarity_with_movie = pd.DataFrame(cosine,
                                     index=final_movie.index,
                                     columns=final_movie.index)
similarity_with_movie.head(4)

userId,316,320,359,370,910,975,1015,1387,1447,1588,...,137118,137209,137227,137446,137559,137609,137805,138072,138176,138200
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.921169,0.665659,0.673486,0.694247,0.894969,0.80578,0.851492,0.945224,0.705491,...,0.827564,0.895641,0.87929,0.916856,0.912146,0.922262,0.587738,0.671783,0.949138,0.74022
320,0.921169,0.0,0.687225,0.691158,0.699527,0.91602,0.816931,0.874283,0.970234,0.724147,...,0.861798,0.909376,0.907009,0.938964,0.929049,0.943265,0.612746,0.695382,0.973853,0.768459
359,0.665659,0.687225,0.0,0.534369,0.523475,0.655225,0.602806,0.629143,0.705042,0.542504,...,0.62182,0.65432,0.655839,0.679696,0.6839,0.686193,0.418283,0.489595,0.70737,0.534065
370,0.673486,0.691158,0.534369,0.0,0.54756,0.67181,0.618456,0.628825,0.712683,0.548592,...,0.636688,0.673489,0.651209,0.688647,0.689265,0.692595,0.405881,0.497332,0.714011,0.546637


In [15]:
# Same user-user cosine similarity, this time on the user-average-filled matrix.
b = cosine_similarity(final_user)
np.fill_diagonal(b, 0)  # blank out each user's similarity with themselves

similarity_with_user = pd.DataFrame(b,
                                    index=final_user.index,
                                    columns=final_user.index)
similarity_with_user.head(3)

userId,316,320,359,370,910,975,1015,1387,1447,1588,...,137118,137209,137227,137446,137559,137609,137805,138072,138176,138200
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.060063,0.072075,0.043266,0.039305,0.045616,0.035341,0.038068,-0.01248514,0.050183,...,0.052632,0.104864,0.011358,0.029674,0.092552,0.017876,0.051371,0.077377,0.026924,-0.022727
320,0.060063,0.0,0.063054,0.027315,0.006811,0.07562,0.01191,0.042509,-2.126227e-31,0.067389,...,0.115325,0.06513,0.071996,0.097554,0.064769,-0.006251,0.077256,0.098845,0.038752,0.056639
359,0.072075,0.063054,0.0,0.135836,0.076131,0.036757,0.046418,0.066544,0.04287659,0.109726,...,0.120191,0.020672,0.032166,0.039599,0.108502,0.026371,0.075492,0.102698,0.099307,0.003147


In [16]:
def get_user_similar_movies(user1, user2):
    """Return the movies rated by both users, joined with the movie metadata.

    Rows of ``rating_avg`` belonging to ``user1`` and ``user2`` are inner-joined
    on ``movieId``; columns coming from ``user1`` get the ``_x`` suffix and those
    from ``user2`` the ``_y`` suffix (e.g. ``rating_x_x`` / ``rating_x_y``).
    The result is then joined with ``movies`` on ``movieId``.
    """
    ratings_user1 = rating_avg[rating_avg.userId == user1]
    ratings_user2 = rating_avg[rating_avg.userId == user2]
    shared = ratings_user1.merge(ratings_user2, on='movieId', how='inner')
    return shared.merge(movies, on='movieId')

In [17]:
# Movies rated by both user 370 and user 86309: keep only each user's raw
# rating and the movie title.
a = get_user_similar_movies(370, 86309).loc[:, ['rating_x_x', 'rating_x_y', 'title']]
a.head(3)

Unnamed: 0,rating_x_x,rating_x_y,title
0,5.0,5.0,"Matrix, The (1999)"
1,5.0,4.5,"Lord of the Rings: The Fellowship of the Ring,..."
2,5.0,4.0,"Lord of the Rings: The Two Towers, The (2002)"


In [18]:
# First five movies the two users have in common.
a.head()

Unnamed: 0,rating_x_x,rating_x_y,title
0,5.0,5.0,"Matrix, The (1999)"
1,5.0,4.5,"Lord of the Rings: The Fellowship of the Ring,..."
2,5.0,4.0,"Lord of the Rings: The Two Towers, The (2002)"
3,4.5,4.0,"Lord of the Rings: The Return of the King, The..."
4,1.5,1.0,Serenity (2005)


## Neighborhood for User (k)

In [20]:
def find_n_neighbours(df,n):
    """Return, for every row of ``df``, the labels of its ``n`` highest-valued columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Score matrix; each row is ranked independently across its columns.
    n : int
        Number of column labels to keep per row.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; columns ``top1`` .. ``top{n}`` hold the column
        labels ordered from the highest to the lowest value in that row.

    Notes
    -----
    ``n`` larger than the number of columns makes the per-row Series
    construction fail with a length mismatch (ValueError in pandas).
    """
    # Built once instead of once per row.
    rank_labels = ['top{}'.format(i) for i in range(1, n + 1)]

    def top_n_labels(row):
        # Labels of the n largest entries of this row, best first.
        best = row.sort_values(ascending=False).iloc[:n].index
        return pd.Series(best, index=rank_labels)

    return df.apply(top_n_labels, axis=1)

In [21]:
# Top 30 neighbours for each user, ranked on the user-average-filled similarity.
sim_user_30_u = find_n_neighbours(similarity_with_user,30)
# Bare last expression: rendered as a rich table instead of wrapped text.
sim_user_30_u.head()

          top1    top2    top3   top4    top5    top6    top7    top8   top9  \
userId                                                                         
316     113673  117918    9050  12882   38187  102668   98880   43829  13215   
320      12288  113673   28159  79846  134627  112948  120729   97163   2945   
359     102118   96482  102532  50898    2702   60016   23428  120782  57937   
370      46645   42245   40768  23428  123707   60016   45120  113645  97195   
910      87042  131620   67352  40768   31321   48821   26222   63295   5611   

         top10  ...   top21   top22   top23   top24  top25   top26   top27  \
userId          ...                                                          
316      78501  ...   88608  120782   74472   53834  88928   42245   58265   
320       4931  ...   39271   94883  127683  101137  54989  134521   80946   
359      42096  ...  117258    7723  120729   61305  40768  117918   86768   
370     102118  ...    5611   20530    2702   381

In [22]:
# Top 30 neighbours for each user, ranked on the movie-average-filled similarity.
sim_user_30_m = find_n_neighbours(similarity_with_movie,30)
# Bare last expression: rendered as a rich table instead of wrapped text.
sim_user_30_m.head()

          top1    top2    top3   top4    top5    top6    top7    top8    top9  \
userId                                                                          
316     138176  100240   96936  51460   88932    1447  104732  125012    5268   
320     138176   96936  121403   1447   51460  125012   88932   42944    5268   
359     138176    1447    5268  96936  100240   21401   88932   13927  104732   
370      86309   44194  138176  24802  129869   96936    1447  104529   94333   
910      96936  107991  138176  27142   51460  125012   88932  100240   72633   

         top10  ...   top21   top22   top23   top24   top25   top26   top27  \
userId          ...                                                           
316     121403  ...  121987   72633   21401  114335   22338  118304  124981   
320     104529  ...  121987  102549  118304   86309   94333  124981   93203   
359      72633  ...   12930  121987  114335  125012   51460  118304   57474   
370      88932  ...  124981   27142  

In [23]:
def user_item_score(user,item):
    """Predict the rating ``user`` would give to ``item``.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the mean-centred scores that the user's neighbours (the user's
    row in ``sim_user_30_m``) have for ``item`` in ``final_movie``. Weights are
    read from ``similarity_with_movie``.

    Parameters
    ----------
    user
        userId; looked up in ``sim_user_30_m``, ``similarity_with_movie`` and ``mean``.
    item
        movieId; must be a column label of ``final_movie``.

    Returns
    -------
    float
        ``avg_user + sum(score * similarity) / sum(similarity)``. When the
        similarities sum to zero the user's mean rating is returned unchanged.

    Raises
    ------
    KeyError
        If ``item`` is not a column of ``final_movie``.
    IndexError
        If ``user`` has no row in ``mean``.
    """
    # ravel() keeps the neighbour ids one-dimensional even when the user has a
    # single neighbour; squeeze() would collapse that case to a scalar, which
    # isin() does not accept.
    neighbour_ids = sim_user_30_m[sim_user_30_m.index == user].values.ravel().tolist()

    # Mean-centred scores of the neighbours for this item, without missing values.
    item_scores = final_movie.loc[:, item]
    neighbour_scores = item_scores[item_scores.index.isin(neighbour_ids)].dropna()

    avg_user = mean.loc[mean['userId'] == user, 'rating'].values[0]

    # Indexing with the Index itself always yields a Series aligned with
    # neighbour_scores, whatever the number of neighbours.
    weights = similarity_with_movie.loc[user, neighbour_scores.index]

    nume = (neighbour_scores * weights).sum()
    deno = weights.sum()

    # No usable neighbour signal: fall back to the user's own average rather
    # than dividing by zero.
    if deno == 0:
        return avg_user

    return avg_user + (nume / deno)
    
    

In [25]:
# Predicted rating of user 320 for movie 7371.
Score = user_item_score(320, 7371)
print("score (u,i) es", Score) # i = the movie

score (u,i) es 4.255766437391595


In [27]:
# Comma-separated string of the movie ids each user has rated, indexed by userId.
# The ids are stringified on the fly so that rating_avg keeps its integer
# movieId column: casting it in place would make this cell non-idempotent and
# would break merges against movies.movieId and re-runs of the pivot.
Movie_user = rating_avg.groupby(by = 'userId')['movieId'].apply(lambda x: ','.join(x.astype(str)))

def User_item_score1(user):
    """Recommend up to five movie titles for ``user``.

    Candidate movies are those rated by the user's neighbours (the user's row in
    ``sim_user_30_m``) that the user has not rated. Each candidate is scored
    with ``user_item_score`` and the five best-scoring ones are returned.

    Parameters
    ----------
    user
        userId; must be known to ``final``, ``sim_user_30_m`` and ``mean``.

    Returns
    -------
    list of str
        Titles from ``movies`` for the top-scoring candidates, best first.
        Empty when the neighbours contribute no unseen movie.
    """
    # Movies the user already rated: columns of the ratings pivot that are
    # not NaN in the user's row.
    Movie_seen_by_user = final.columns[final[final.index == user].notna().any()].tolist()

    # ravel() keeps the ids one-dimensional even with a single neighbour.
    neighbour_ids = sim_user_30_m[sim_user_30_m.index == user].values.ravel().tolist()
    neighbour_movies = Movie_user[Movie_user.index.isin(neighbour_ids)]

    # Union of every movie id (as a string) rated by at least one neighbour.
    Movie_seen_by_similar_users = {
        movie_id
        for id_string in neighbour_movies.values
        for movie_id in id_string.split(',')
        if movie_id
    }
    unseen = Movie_seen_by_similar_users - set(map(str, Movie_seen_by_user))
    # Sorted so the candidate order does not depend on set iteration order.
    Movies_under_consideration = sorted(map(int, unseen))

    if not Movies_under_consideration:
        return []

    # Same prediction formula as the single-item scorer; reuse it rather than
    # repeating its body here.
    score = [user_item_score(user, item) for item in Movies_under_consideration]

    data                 = pd.DataFrame({'movieId': Movies_under_consideration, 'score': score})
    top_5_recommendation = data.sort_values(by = 'score', ascending = False).head(5)

    Movie_Name = top_5_recommendation.merge(movies, how = 'inner', on = 'movieId')
    return Movie_Name.title.values.tolist()

# snake_case alias so the function can be called under either spelling.
user_item_score1 = User_item_score1

In [None]:
# Top recommendations for user 370. The function is defined with a capital
# "U" (User_item_score1); calling it in lower case raises NameError.
predicted_movies = User_item_score1(370)
print(" ")
print(" Las recomendaciones para el usuario de ID 370 son: ")
print(" ")

for movie_title in predicted_movies:
    print(movie_title)