# Collaborative Filtering



In [1]:
import pandas as pd
import numpy as np

- O dataset MovieLens é disponibilizado pelo GroupLens, um laboratório de pesquisa da Universidade de Minnesota.
    - Mais de 26.000.000 de avaliações, com 45.000 filmes e 270.000 usuários. 
    - Vamos considerar 100.000 avaliações, 1.000 usuários e 1.700 filmes.

In [2]:
# Load the user table; the file is pipe-delimited and column names are
# supplied explicitly through `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv(
    '../../data/movielens/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# Load the item (movie) table: identifying columns first, followed by one
# 0/1 flag column per genre.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv(
    '../../data/movielens/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Keep only the movie id and title columns.
movies = movies.loc[:, ['movie_id', 'title']]

In [5]:
# Load the ratings table (tab-delimited): one row per (user, movie) rating.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(
    '../../data/movielens/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# The timestamp is not used by any model below, so drop it.
ratings = ratings.drop(columns='timestamp')

In [7]:
#configuração de treino e teste
from sklearn.model_selection import train_test_split

# X is a copy of the full ratings frame; y is the user_id column, used only as
# the stratification target.
X = ratings.copy()
y = ratings['user_id']

# Hold out 25% of the ratings, stratified along user_id, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [8]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error helper
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Both arguments are passed unchanged to `mean_squared_error`; the square
    root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [9]:
# Baseline model
def baseline(user_id, movie_id):
    """Constant predictor: return 3.0 for every (user, movie) pair.

    Both arguments are ignored; they exist so the function has the same
    signature as the other models passed to `score`.
    """
    neutral_rating = 3.0
    return neutral_rating

In [10]:
# Compute the RMSE a model obtains on the test set
def score(cf_model):
    """Return the RMSE of `cf_model` over the rows of `X_test`.

    Args:
        cf_model: callable taking (user_id, movie_id) and returning a
            predicted rating.

    Returns:
        The value of `rmse(true_ratings, predicted_ratings)`.
    """
    # Call the model once per (user, movie) row, in test-set order.
    predicted = []
    for user, movie in zip(X_test['user_id'], X_test['movie_id']):
        predicted.append(cf_model(user, movie))
    y_pred = np.array(predicted)

    # Ratings the users actually gave in the test set.
    y_true = np.array(X_test['rating'])

    return rmse(y_true, y_pred)

In [11]:
# RMSE of the constant 3.0 baseline on the test set.
score(baseline)

1.2470926188539486

## User Based Collaborative Filtering

## Memory based

### Ratings Matrix

In [12]:
# User x movie ratings matrix built from the training split; pairs with no
# rating appear as NaN.
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


### Mean

In [13]:
# Predict with the plain mean of a movie's ratings
def cf_user_mean(user_id, movie_id):
    """Predict a rating as the mean of all training ratings for `movie_id`.

    `user_id` is not used. Movies that have no column in `r_matrix` get the
    default rating 3.0.
    """
    # Unknown movie: fall back to the default.
    if movie_id not in r_matrix:
        return 3.0

    # Column mean; pandas skips the NaN (unrated) entries.
    return r_matrix[movie_id].mean()

In [14]:
# RMSE of the per-movie mean predictor on the test set.
score(cf_user_mean)

1.0234701463131335

### Weighted Mean

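<img src="../../imgs/weighted_mean.png" alt="Fórmula da média ponderada das avaliações">

$$\hat{r}_{u,m} = \frac{\sum_{v} \mathrm{sim}(u, v)\, r_{v,m}}{\sum_{v} \mathrm{sim}(u, v)}$$

onde a soma percorre os usuários $v$ que avaliaram o filme $m$.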

In [15]:
# Copy of the ratings matrix with missing ratings filled with 0
# (fillna already returns a new frame, so r_matrix itself is left untouched).
r_matrix_dummy = r_matrix.fillna(0)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the zero-filled matrix.
cosine_sim = cosine_similarity(
    r_matrix_dummy,
    r_matrix_dummy,
)

In [17]:
# Wrap the similarity array in a DataFrame labelled by user_id on both axes,
# so similarities can be looked up by user id.
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    index=r_matrix.index,
    columns=r_matrix.index,
)

cosine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,...,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,...,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,...,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,...,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,...,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574
6,0.312419,0.152789,0.062539,0.045543,0.202843,1.0,0.375963,0.131795,0.110944,0.400758,...,0.287549,0.080312,0.162988,0.182856,0.114262,0.09209,0.261859,0.097606,0.206104,0.187637
7,0.308729,0.086705,0.039767,0.078812,0.299619,0.375963,1.0,0.211282,0.107795,0.328923,...,0.290002,0.07417,0.094619,0.084235,0.11562,0.100625,0.233843,0.039199,0.224227,0.296332
8,0.224269,0.078864,0.089474,0.095354,0.163724,0.131795,0.211282,1.0,0.03704,0.183375,...,0.165008,0.066843,0.058766,0.068759,0.087159,0.129381,0.188662,0.121223,0.08391,0.273238
9,0.026017,0.06894,0.078162,0.059498,0.038474,0.110944,0.107795,0.03704,1.0,0.155435,...,0.011708,0.0,0.10171,0.034568,0.045002,0.052699,0.107486,0.055766,0.070065,0.088281
10,0.286411,0.092399,0.03767,0.053879,0.153021,0.400758,0.328923,0.183375,0.155435,1.0,...,0.278558,0.04931,0.153506,0.065471,0.060088,0.033686,0.197107,0.085402,0.118945,0.162538


In [18]:
# Predict with the similarity-weighted mean of a movie's ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the cosine-similarity-weighted mean of the ratings
    other users gave to `movie_id`.

    Args:
        user_id: label of the target user in `cosine_sim`.
        movie_id: column label of the movie in `r_matrix`.

    Returns:
        - 3.0 when the movie has no column in `r_matrix` or no ratings at all;
        - the unweighted mean of the movie's ratings when the user has no
          column in `cosine_sim`, or when the similarities to every rater sum
          to zero (the weighted mean would otherwise be 0/0 = NaN);
        - otherwise sum(sim * rating) / sum(sim) over the users who rated it.
    """
    # Unknown movie: default rating.
    if movie_id not in r_matrix:
        return 3.0

    # Ratings given to this movie by every user who rated it (NaN = unrated).
    m_ratings = r_matrix[movie_id].dropna()
    if m_ratings.empty:
        return 3.0

    # Without a similarity column there is nothing to weight by.
    if user_id not in cosine_sim:
        return m_ratings.mean()

    # Similarities between the target user and exactly those raters, in the
    # same order as m_ratings so the positional dot product lines up.
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    total_sim = sim_scores.sum()
    # `not > 0` also catches a NaN total, not only an exact zero.
    if not total_sim > 0:
        return m_ratings.mean()

    return np.dot(sim_scores, m_ratings) / total_sim

In [19]:
# RMSE of the similarity-weighted mean predictor on the test set.
score(cf_user_wmean)

1.0174483808407588

### Demographics

In [20]:
# Attach each user's demographic columns to the training ratings. No key is
# given, so pandas joins on the columns the two frames share.
merged_df = X_train.merge(users)

merged_df.head(3)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,889,684,2,24,M,technician,78704
1,889,279,2,24,M,technician,78704
2,889,29,3,24,M,technician,78704


In [21]:
# Mean training rating per (movie_id, sex) pair; the result is a Series with a
# two-level index.
gender_mean = (
    merged_df
    .groupby(['movie_id', 'sex'])['rating']
    .mean()
)

In [22]:
# Index the user table by user_id so that users.loc[user_id] works.
# Guarded so that re-running this cell is a no-op: once user_id is the index it
# is no longer a column, and an unguarded set_index would raise KeyError.
if 'user_id' in users.columns:
    users = users.set_index('user_id')

In [23]:
def cf_gender(user_id, movie_id):
    """Predict a rating as the mean rating `movie_id` received from training
    users of the same sex as `user_id`.

    Returns 3.0 when the movie has no column in `r_matrix`, or when
    `gender_mean` has no entry for that movie and sex.
    """
    # Unknown movie: default rating.
    if movie_id not in r_matrix:
        return 3.0

    # Sex of the target user (users is indexed by user_id).
    gender = users.loc[user_id, 'sex']

    # Mean ratings for this movie, indexed by sex.
    movie_means = gender_mean[movie_id]
    if gender in movie_means:
        return movie_means[gender]

    # No training rating for this movie from users of that sex.
    return 3.0

In [24]:
# RMSE of the per-gender mean predictor on the test set.
score(cf_gender)

1.0330308800874282

## Model Based Approaches

In [None]:
!pip install surprise

In [25]:
#https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html
from surprise import Reader, Dataset, KNNBasic,SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate

In [26]:
# The Reader tells Surprise how to parse the ratings; the default rating scale
# is spelled out here.
reader = Reader(rating_scale=(1, 5))

# Columns are passed in the order user, item, rating.
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

In [27]:
# Benchmark every candidate algorithm with 3-fold cross-validation.
benchmark = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Run the cross-validation, measuring RMSE.
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every metric over the folds and tag the row with the class name.
    # Rows are plain dicts: Series.append was removed in pandas 2.0, and a list
    # of dicts keeps the numeric columns numeric in the final frame.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# One row per algorithm, best (lowest) test RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.927911,127.589592,6.039167
KNNBaseline,0.936439,0.453593,6.346548
SVD,0.94509,3.941414,0.343456
BaselineOnly,0.946817,0.196653,0.256686
SlopeOne,0.950234,0.575013,4.763751
KNNWithMeans,0.957043,0.32896,5.520427
KNNWithZScore,0.957209,0.389325,5.885609
NMF,0.973821,4.319245,0.310797
CoClustering,0.977405,1.500507,0.248993
KNNBasic,0.988168,0.267684,5.185513


In [28]:
# https://surprise.readthedocs.io/en/stable/FAQ.html
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the n highest-estimated items for each user.

    Args:
        predictions: iterable of 5-tuples
            (user id, item id, true rating, estimated rating, details).
        n (int): maximum number of items to keep per user.

    Returns:
        A defaultdict(list) mapping each user id to a list of
        (item id, estimated rating) tuples, sorted by estimate in descending
        order and truncated to at most n entries.
    """
    # Group the (item, estimate) pairs by user.
    top_n = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        top_n[uid].append((iid, est))

    # Rank each user's items by estimate and keep the first n. Only existing
    # keys are reassigned, so iterating over the dict here is safe.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

In [29]:
# Fit SVD++ on every available rating.
trainset = data.build_full_trainset()
algo = SVDpp()
algo.fit(trainset)

# NOTE(review): the test set is built from the training set itself, so these
# look like in-sample predictions for pairs the users already rated rather than
# unseen items - confirm this is intended.
testset = trainset.build_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the ids of the top-ranked items for each user.
for uid, user_ratings in top_n.items():
    print(uid, [pair[0] for pair in user_ratings])

196 [286, 285, 8, 655, 663, 251, 306, 1007, 242, 692]
186 [79, 117, 98, 71, 300, 257, 742, 159, 226, 568]
22 [173, 172, 127, 50, 174, 187, 144, 258, 168, 89]
244 [179, 9, 318, 475, 42, 56, 357, 151, 50, 191]
166 [313, 300, 258, 315, 328, 323, 322, 751, 294, 748]
298 [318, 496, 174, 483, 603, 22, 498, 357, 178, 98]
115 [475, 56, 654, 23, 100, 657, 89, 12, 922, 357]
253 [318, 496, 64, 22, 50, 427, 98, 483, 79, 1]
305 [408, 169, 178, 48, 474, 134, 199, 479, 357, 483]
6 [127, 474, 483, 427, 134, 124, 285, 9, 479, 511]
62 [50, 12, 56, 173, 172, 114, 174, 168, 134, 127]
286 [367, 1101, 251, 707, 316, 176, 1039, 272, 147, 301]
200 [318, 79, 174, 191, 483, 265, 195, 98, 50, 28]
210 [50, 98, 174, 357, 483, 302, 657, 127, 134, 172]
224 [22, 313, 318, 15, 69, 300, 215, 28, 333, 282]
303 [172, 357, 50, 100, 56, 98, 474, 127, 174, 187]
122 [190, 191, 175, 513, 86, 127, 187, 511, 46, 427]
194 [127, 64, 12, 173, 174, 187, 318, 178, 50, 511]
291 [168, 173, 50, 172, 64, 56, 48, 98, 174, 12]
234 [170, 4

In [31]:
#https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb
def get_Iu(uid):
    """Return the number of items rated by user `uid` in `trainset`.

    Returns 0 when a ValueError is raised during the lookup (user not present
    in the training data).
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        rated_items = trainset.ur[inner_uid]
        return len(rated_items)
    except ValueError:  # user not present in the training data
        return 0
    
def get_Ui(iid):
    """Return the number of users who rated item `iid` in `trainset`.

    Returns 0 when a ValueError is raised during the lookup (item not present
    in the training data).
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        raters = trainset.ir[inner_iid]
        return len(raters)
    except ValueError:  # item not present in the training data
        return 0
    
# One row per prediction: user id, item id, true rating, estimate, details.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

# Number of items rated by the user / number of users who rated the item.
df['Iu'] = df['uid'].apply(get_Iu)
df['Ui'] = df['iid'].apply(get_Ui)

# Absolute prediction error.
df['err'] = (df['est'] - df['rui']).abs()

# Ten smallest and ten largest errors.
best_predictions = df.sort_values(by='err').iloc[:10]
worst_predictions = df.sort_values(by='err').iloc[-10:]

In [32]:
# Display the ten predictions with the smallest absolute error.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
31983,312,515,5.0,5.0,{'was_impossible': False},223,201,0.0
65282,592,174,5.0,5.0,{'was_impossible': False},360,420,0.0
65266,592,64,5.0,5.0,{'was_impossible': False},360,283,0.0
32594,314,318,5.0,5.0,{'was_impossible': False},245,298,0.0
32605,314,15,5.0,5.0,{'was_impossible': False},245,293,0.0
65260,592,169,5.0,5.0,{'was_impossible': False},360,118,0.0
32679,136,318,5.0,5.0,{'was_impossible': False},35,298,0.0
65257,592,179,5.0,5.0,{'was_impossible': False},360,221,0.0
65255,592,168,5.0,5.0,{'was_impossible': False},360,316,0.0
65250,592,272,5.0,5.0,{'was_impossible': False},360,198,0.0


In [33]:
# Display the ten predictions with the largest absolute error.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
76807,675,223,1.0,4.136775,{'was_impossible': False},34,136,3.136775
17417,174,168,1.0,4.136896,{'was_impossible': False},177,316,3.136896
46434,416,307,1.0,4.206665,{'was_impossible': False},493,188,3.206665
6184,97,83,1.0,4.217035,{'was_impossible': False},63,176,3.217035
22959,295,183,1.0,4.255656,{'was_impossible': False},196,291,3.255656
40973,366,234,1.0,4.292032,{'was_impossible': False},33,280,3.292032
63697,567,100,1.0,4.31134,{'was_impossible': False},155,508,3.31134
54863,481,318,1.0,4.41889,{'was_impossible': False},56,298,3.41889
19858,130,245,1.0,4.432153,{'was_impossible': False},353,240,3.432153
8482,246,56,1.0,4.475569,{'was_impossible': False},195,394,3.475569


In [36]:
import matplotlib.pyplot as plt
%matplotlib notebook

# Distribution of the ratings received by movie 223.
# NOTE(review): 223 presumably was picked because it appears among the worst
# predictions shown above - confirm.
fig, ax = plt.subplots()
ratings.loc[ratings['movie_id'] == 223, 'rating'].hist(ax=ax)

# Draw the grid behind the bars. This must be set on the axes themselves:
# changing the rc default after the axes exist does not affect them.
ax.set_axisbelow(True)

ax.set_xlabel('Avaliação')
ax.set_ylabel('Número de avaliações')
ax.set_title('Número de avaliações que filme recebeu')
plt.show()


<IPython.core.display.Javascript object>