# Рекомендательная система на библиотеке surprise (Домашнее задание)


In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import Dataset
from surprise import Reader

In [3]:
from surprise import KNNBaseline

In [4]:
from surprise.model_selection import train_test_split

In [5]:
def Precision_at_n(df_ratings, n=3, threshold=5):
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    t=df_ratings.sort_values(by='rating', ascending=False)[0:n]
    ret_n=set(t['itemID'])
    return len(rel_n & ret_n)/n

In [6]:
def Avg_Precision_at_n(df_ratings, n=10,threshold=5):
    avg_p=0
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    
    for i in range(1,n+1):
        t=df_ratings.sort_values(by='rating', ascending=False)[0:i]
        ret_n=set(t['itemID'])
        t2=df_ratings.sort_values(by='rating', ascending=False)[i-1:i]
        ret_n2=set(t2['itemID'])
        #print(ret_n2,ret_n)
        if len((ret_n2 & rel_n))>0:
            avg_p=avg_p+len(rel_n & ret_n)/n
    return avg_p/n

In [7]:
def Recall_at_n(df_ratings, n=3,threshold=5):
    rel_n = set(df_ratings[df_ratings.real_rating>=threshold]['itemID'])
    t=df_ratings.sort_values(by='rating', ascending=False)[0:n]
    ret_n=set(t['itemID'])
    return len(rel_n & ret_n)/len(set(rel_n))

In [8]:
def MAP_at_n(df_ratings_all, n=10,threshold=5):
    users=df_ratings_all['userID'].unique()
    map_at_n=0
    for i in users:
        df_ratings=df_ratings_all[df_ratings_all.userID==i]
        map_at_n=map_at_n+Avg_Precision_at_n(df_ratings,n=n,threshold=threshold)
    return map_at_n/len(users)   

## Легкое задание
Для датасэта

```
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

```

Для каждого k= 10, 20 и 30 выбрать какой вариант KNN лучше item-based или user-based по метрике MAP_at_n (n=5)


In [9]:
### YOUR CODE HERE ###
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

In [10]:
map_list = []
count = 0
for k in range(10, 31, 10):
    for based in [True, False]:
        algo = KNNBaseline(k=k, sim_options={
                           'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
        map_list.append(MAP_at_n(df_ratings, 5))

    if map_list[count+1] > map_list[count]:
        print(
            f'При k = {k} лучший вариант item-based. MAP_at_n_item = {map_list[count+1]:.3f} , MAP_at_n_user = {map_list[count]:.3f}')
    else:
        print(
            f'При k = {k} лучший вариант user_based. MAP_at_n_item = {map_list[count+1]:.3f} , MAP_at_n_user = {map_list[count]:.3f}')
    count += 2

При k = 10 лучший вариант item-based. MAP_at_n_item = 0.151 , MAP_at_n_user = 0.144
При k = 20 лучший вариант item-based. MAP_at_n_item = 0.156 , MAP_at_n_user = 0.150
При k = 30 лучший вариант user_based. MAP_at_n_item = 0.155 , MAP_at_n_user = 0.156


## Сложное задание

Для датасэта

```
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

```

а) Для KNN(k=30) для каждого пользователя найти лучший алгоритм по Avg_Precision_at_n(7)
Cравнивая между item-based, user-based подход, а также разные меры для (косинусную, Пирсона) в KNN.
Докуметация к мерам:
https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration

б) Для каждого пользователя расчитайте рейтинги фильмов используя лучший алгоритм из пункта а)

в) Сравнить результат из б) с обычном kNN k=30 по метрике MAP (n=7).


#### а)

In [12]:
### YOUR CODE HERE ###
avg_pn_list = []
algo_list = []
users_list = []
df_metrics = pd.DataFrame()

for name in ['cosine', 'pearson_baseline']:
    for based in [True, False]:
        column_name = str('name:') + name + str(' user_based:') + str(based)
        algo = KNNBaseline(k=30, sim_options={
                           'name': name, 'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)

        for users in df_ratings['userID'].unique():
            avg_pn_list.append(Avg_Precision_at_n(
                df_ratings[df_ratings['userID'] == users], 7))
            
        df_metrics[column_name] = avg_pn_list
        avg_pn_list = []
    
for users in df_ratings['userID'].unique():
    users_list.append(users)
        
df_metrics.insert(0, 'userID', users_list)
df_metrics['max_apn_value'] = df_metrics.max(axis=1)

for idx in df_metrics.index:
    algo_list.append(np.where(df_metrics.drop(
        ['userID', 'max_apn_value'], axis=1).iloc[idx] == df_metrics.iloc[idx].max_apn_value)[0][0])
df_metrics['algorithm number'] = algo_list
df_metrics

Unnamed: 0,userID,name:cosine user_based:True,name:cosine user_based:False,name:pearson_baseline user_based:True,name:pearson_baseline user_based:False,max_apn_value,algorithm number
0,318,0.020408,0.000000,0.061224,0.020408,0.061224,2
1,145,0.122449,0.122449,0.204082,0.122449,0.204082,2
2,479,0.061224,0.061224,0.122449,0.122449,0.122449,2
3,106,0.020408,0.020408,0.020408,0.020408,0.020408,0
4,268,0.204082,0.061224,0.061224,0.204082,0.204082,0
...,...,...,...,...,...,...,...
938,133,0.020408,0.020408,0.020408,0.020408,0.020408,0
939,604,0.061224,0.061224,0.061224,0.061224,0.061224,0
940,696,0.061224,0.061224,0.061224,0.061224,0.061224,0
941,153,0.020408,0.020408,0.020408,0.020408,0.020408,0


#### б)

In [13]:
### YOUR CODE HERE ###
idx = 0
count = 0
algo_list = []
best_rating = []
df_full = pd.DataFrame()

for name in ['cosine', 'pearson_baseline']:
    for based in [True, False]:
        algo = KNNBaseline(k=30, sim_options={
                           'name': name, 'user_based': based}, verbose=False)
        predictions = algo.fit(trainset).test(testset)
        df_ratings = pd.DataFrame(
            columns=['userID', 'itemID', 'rating', 'real_rating'])
        for i in predictions:
            df_ratings = df_ratings.append(
                {'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
        df_full['rating '+ str(count)] = df_ratings['rating']
        count += 1
        
df_full.insert(0, 'userID', df_ratings.userID)
df_full.insert(1, 'itemID', df_ratings.itemID)

for users in df_full.userID:
    algo_list.append(df_metrics[df_metrics['userID'] == users]['algorithm number'].tolist()[0])

df_full['algorithm number'] = algo_list
df_full['real_rating'] = df_ratings['real_rating']

for alg_num in df_full['algorithm number']:
    best_rating.append(df_full.iloc[idx][str('rating ') + str(alg_num)])
    idx += 1
df_full['best_rating'] = best_rating

df_full

Unnamed: 0,userID,itemID,rating 0,rating 1,rating 2,rating 3,algorithm number,real_rating,best_rating
0,318,396,2.984944,3.110189,3.541634,3.451996,2,1.0,3.541634
1,145,273,3.485906,3.594470,3.635734,3.706439,2,5.0,3.635734
2,479,436,3.510231,3.925196,3.681299,3.837491,2,4.0,3.681299
3,106,699,3.715394,3.655683,3.622171,3.694116,0,4.0,3.715394
4,268,178,3.912343,3.962063,4.059085,4.283019,0,4.0,3.912343
...,...,...,...,...,...,...,...,...,...
29995,291,82,3.850669,4.098276,4.157344,4.154286,0,4.0,3.850669
29996,747,85,3.476893,3.266402,3.421975,3.548517,3,3.0,3.548517
29997,82,274,3.120601,2.894061,3.137816,3.138774,1,3.0,2.894061
29998,303,867,4.378420,3.562080,4.264553,4.226200,1,3.0,3.562080


#### в)

In [14]:
### YOUR CODE HERE ###
algo = KNNBaseline(k=30, sim_options={'user_based': True}, verbose=False)
predictions = algo.fit(trainset).test(testset)
df_ratings = pd.DataFrame(columns=['userID', 'itemID', 'rating', 'real_rating'])
for i in predictions:
    df_ratings = df_ratings.append({'userID': i.uid, 'itemID': i.iid, 'rating': i.est, 'real_rating': i.r_ui}, ignore_index=True)
MAP_at_n(df_ratings, 7)

0.12138853420477393

In [15]:
df_full = df_full[['userID','itemID','best_rating','real_rating']]
df_full = df_full.rename(columns={'best_rating': 'rating'})
MAP_at_n(df_full, 7)

0.15367801415369878

Результат из блока Б намного лучше.