# Aula 04 - Exemplos

In [1]:
import numpy as np
import pandas as pd

## Fazendo download da base

In [2]:
# import wget
# !python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
# !tar -xvzf ml-20m-compact.tar.gz

## Ler e preparar dados (vide notebooks anteriores)


In [3]:
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']])
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

map_title = {}
for _, row in df.iterrows():
    map_title[row.movieId] = row.title

  ratings = pd.read_csv('./dataset/ratings_sample.csv')


## Avaliação no cenário de predição de notas (rating prediction)

### Cross-Validation

In [4]:
from caserec.utils.cross_validation import CrossValidation
from caserec.recommenders.rating_prediction.itemknn import ItemKNN

df.to_csv('ratings.dat', index=False, header=False, sep='\t')

recommender = ItemKNN()
CrossValidation(input_file='ratings.dat', recommender=recommender, dir_folds='./', header=1, k_folds=5).compute()

[Case Recommender: Cross Validation]

Database:: ratings.dat 
Recommender Algorithm:: ItemKNN Algorithm | K Folds: 5

Eval:: MAE: 0.79632 RMSE: 1.05884 
Eval:: MAE: 0.790976 RMSE: 1.052007 
Eval:: MAE: 0.797181 RMSE: 1.058714 
Eval:: MAE: 0.804463 RMSE: 1.067889 
Eval:: MAE: 0.800824 RMSE: 1.064751 
Mean:: MAE: 0.797953 RMSE: 1.060440 
STD:: MAE: 0.004529 RMSE: 0.005490 


### Hold-Out

In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=.2, random_state=2)
train.to_csv('train.dat', index=False, header=False, sep='\t')
test.to_csv('test.dat', index=False, header=False, sep='\t')


ItemKNN('train.dat', 'test.dat', 'rp_iknn.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 2.832554 sec
prediction_time:: 0.379059 sec
Eval:: MAE: 0.663504 RMSE: 0.872175 


### Explorando as predições

In [6]:
preds = pd.read_csv('./rp_iknn.dat', sep='\t', names=['userId', 'movieId', 'rating'])
preds_user = preds.loc[(preds.userId==0), 'rating'].tolist()
print(preds_user)

[4.150225, 3.832249]


In [7]:
ratings_user = test.loc[(test.userId==0), 'rating'].tolist()
print(ratings_user)

[5.0, 5.0]


In [8]:
from math import sqrt

def rmse_user(preds, ratings):
    if len(preds) != len(ratings):
        return -1
    sum = 0
    for i in range(len(preds)):
        sum += pow(preds[i]-ratings[i], 2)
    return sqrt(sum/len(preds))

print(rmse_user(preds_user, ratings_user))

1.0212149501025727


## Avaliação no cenário de recomendação de itens (item recommendation)

In [10]:
from caserec.recommenders.item_recommendation.bprmf import BprMF

BprMF('train.dat', 'test.dat', 'ir_bprmf.dat', factors=3).compute()

[Case Recommender: Item Recommendation > BPRMF]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 95.315761 sec
prediction_time:: 2.212678 sec




AttributeError: `np.asfarray` was removed in the NumPy 2.0 release. Use `np.asarray` with a proper dtype instead.

In [None]:
from caserec.recommenders.item_recommendation.itemknn import ItemKNN

ItemKNN('train.dat', 'test.dat', 'ir_itemknn.dat').compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 1.059788 sec
prediction_time:: 37.244149 sec


Eval:: PREC@1: 0.419023 PREC@3: 0.307975 PREC@5: 0.254384 PREC@10: 0.187261 RECALL@1: 0.134438 RECALL@3: 0.281617 RECALL@5: 0.378529 RECALL@10: 0.546924 MAP@1: 0.419023 MAP@3: 0.513599 MAP@5: 0.516788 MAP@10: 0.487732 NDCG@1: 0.419023 NDCG@3: 0.603503 NDCG@5: 0.620406 NDCG@10: 0.613585 


### Explorando as recomendações

In [None]:
recs = pd.read_csv('./ir_bprmf.dat', sep='\t', names=['userId', 'movieId', 'score'])
recs_user = recs.loc[(recs.userId==1), 'movieId'].tolist()
print(recs_user)

[12, 22, 10, 4, 17, 21, 30, 33, 13, 28]


In [None]:
ground_truth = test.loc[(test.userId==1), 'movieId'].tolist()
print(ground_truth)

[6, 106, 21, 30, 12]


In [None]:
intersec = list(set(recs_user) & set(ground_truth))
print('Precisão: ' + str(len(intersec)/len(recs_user)))
print('Revocação: ' + str(len(intersec)/len(ground_truth)))

Precisão: 0.3
Revocação: 0.6
