In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [78]:
# Read the five input CSV tables in one pass; the order of the paths below
# matches the order of the names on the left-hand side.
train, test, df_item, df_user, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/item-features.csv',
        'input/user-features.csv',
        'input/sample-submission.csv',
    )
)

In [79]:
# Item features: discard four feature columns and order the rows by item_id.
# errors='ignore' keeps this cell idempotent: df_item is reassigned in place,
# so without it a second run of the cell raises KeyError because the columns
# are already gone.
df_item = (
    df_item
    .drop(columns=['19', '27', '30', '9'], errors='ignore')
    .sort_values('item_id')
    .reset_index(drop=True)
)

# User features: keep only the first three columns and order the rows by user_id.
df_user = (
    df_user
    .iloc[:, :3]
    .sort_values('user_id')
    .reset_index(drop=True)
)

In [80]:
# Preview the first five training interactions.
train.head(5)

Unnamed: 0,user_id,item_id,like,timestamp
0,140,342,0,1490936622
1,378,172,1,1490936628
2,150,182,0,1490936650
3,455,17,0,1490936704
4,350,409,0,1490936735


In [81]:
from scipy.sparse import csr_matrix

# Feature tables without their leading column
# (presumably the id column -- verify against the CSV headers).
df_user_np = df_user.iloc[:, 1:]
df_item_np = df_item.iloc[:, 1:]

# Re-encode like=0 as -1 so that an explicit dislike differs from the
# implicit 0 a sparse matrix reports for pairs that are not stored.
y = train['like'].map({0: -1, 1: 1}).values
X_train = train.drop('like', axis=1)

# user x item matrix; the shape is inferred from the largest ids present.
data_csr = csr_matrix(
    (y, (X_train['user_id'], X_train['item_id']))
)
(df_user_np.shape, df_item_np.shape, data_csr.shape)

((497, 2), (444, 28), (497, 444))

In [82]:
# Spot check: an unstored pair, then the (user, item) pairs of the second and
# first rows shown by train.head() (like=1 and like=0 respectively).
tuple(data_csr[u, i] for u, i in ((0, 0), (378, 172), (140, 342)))

(0, 1, -1)

In [83]:
from  sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine distance between the rows of each feature table
# (user-to-user and item-to-item).
# NOTE: despite the *_similarity names, pairwise_distances returns distances,
# not similarities.
user_similarity, item_similarity = (
    pairwise_distances(feature_table, metric='cosine')
    for feature_table in (df_user_np, df_item_np)
)
(user_similarity.shape, item_similarity.shape)

((497, 497), (444, 444))

# Surprise

In [119]:
from surprise import NormalPredictor, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [115]:
# Same five-row preview of the training interactions as shown earlier.
train.head(5)

Unnamed: 0,user_id,item_id,like,timestamp
0,140,342,0,1490936622
1,378,172,1,1490936628
2,150,182,0,1490936650
3,455,17,0,1490936704
4,350,409,0,1490936735


In [114]:
# Surprise still needs a Reader for DataFrame input, but only its
# rating_scale parameter is required.
reader = Reader(rating_scale=(0, 1))

# load_from_df expects exactly three columns, in this order:
# user id, item id, rating.
data = Dataset.load_from_df(train.loc[:, ['user_id', 'item_id', 'like']], reader)

# Baseline: 5-fold cross-validation of SVD with default settings.
cross_validate(SVD(), data, cv=5)

{'test_rmse': array([0.28480864, 0.28737401, 0.29640801, 0.28969182, 0.29218496]),
 'test_mae': array([0.17720405, 0.17962929, 0.18028459, 0.18070854, 0.1839145 ]),
 'fit_time': (0.39881157875061035,
  0.3943197727203369,
  0.39627623558044434,
  0.39580821990966797,
  0.39583539962768555),
 'test_time': (0.008928060531616211,
  0.008928298950195312,
  0.009452104568481445,
  0.009424448013305664,
  0.008930683135986328)}

In [None]:
# [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), 
#                   KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

# <class 'surprise.prediction_algorithms.matrix_factorization.SVD'> 0.1825750931054188
# <class 'surprise.prediction_algorithms.matrix_factorization.SVDpp'> 0.16760332940541206
# <class 'surprise.prediction_algorithms.slope_one.SlopeOne'> 0.15265446170771874
# <class 'surprise.prediction_algorithms.random_pred.NormalPredictor'> 0.33987833116682625
# <class 'surprise.prediction_algorithms.knns.KNNBaseline'> 0.1297762283632601
# <class 'surprise.prediction_algorithms.knns.KNNBasic'> 0.1228509348207455
# <class 'surprise.prediction_algorithms.knns.KNNWithMeans'> 0.14724135594053778
# <class 'surprise.prediction_algorithms.knns.KNNWithZScore'> 0.14440677662246001
# <class 'surprise.prediction_algorithms.baseline_only.BaselineOnly'> 0.1839018552188873
# <class 'surprise.prediction_algorithms.co_clustering.CoClustering'> 0.15628048493380628

In [178]:
# Candidate Surprise algorithms, one per line.
algos = [
    SVD(),
    SVDpp(),
    SlopeOne(),
    NormalPredictor(),
    KNNBaseline(verbose=False),
    KNNBasic(verbose=False),
    KNNWithMeans(verbose=False),
    KNNWithZScore(verbose=False),
    BaselineOnly(verbose=False),
    CoClustering(),
]

# 5-fold cross-validate every candidate and print its class together with
# the MAE averaged over the folds.
for algorithm in algos:
    result = cross_validate(algorithm, data, cv=5)
    print(type(algorithm), np.mean(result['test_mae']))

<class 'surprise.prediction_algorithms.matrix_factorization.SVD'> 0.1825750931054188
<class 'surprise.prediction_algorithms.matrix_factorization.SVDpp'> 0.16760332940541206
<class 'surprise.prediction_algorithms.slope_one.SlopeOne'> 0.15265446170771874
<class 'surprise.prediction_algorithms.random_pred.NormalPredictor'> 0.33987833116682625
<class 'surprise.prediction_algorithms.knns.KNNBaseline'> 0.1297762283632601
<class 'surprise.prediction_algorithms.knns.KNNBasic'> 0.1228509348207455
<class 'surprise.prediction_algorithms.knns.KNNWithMeans'> 0.14724135594053778
<class 'surprise.prediction_algorithms.knns.KNNWithZScore'> 0.14440677662246001
<class 'surprise.prediction_algorithms.baseline_only.BaselineOnly'> 0.1839018552188873
<class 'surprise.prediction_algorithms.co_clustering.CoClustering'> 0.15628048493380628


In [179]:
# 0.14098334301265458  (presumably a score noted from an earlier run -- unverified)

# SVD++ with explicit n_factors / lr_all / reg_all instead of the defaults.
algorithm = SVDpp(
    n_factors=5,
    lr_all=0.1,
    reg_all=0.02,
)

# Mean MAE over the 5 cross-validation folds.
result = cross_validate(algorithm, data, cv=5)
print(np.mean(result['test_mae']))

0.14792045242869906


In [186]:
# 0.11899593180524995  (presumably a score noted from an earlier run -- unverified)

algorithm = KNNBasic(k=7, verbose=False)

# Run the 5-fold cross-validation ten times, collect each run's mean MAE,
# and print the average of those ten values.
res_sum = []
for i in range(10):
    result = cross_validate(algorithm, data, cv=5)
    res_sum += [np.mean(result['test_mae'])]
print(np.mean(res_sum))

0.11949606732813518


In [245]:
def get_top_n(predictions, n=10, max_item_id=None):
    """Return the top-N recommendation for each user from a set of predictions.

    Items are ranked per user by estimated rating (highest first). Items whose
    raw id is not strictly below the user's cutoff are discarded before the
    list is truncated to ``n`` entries.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each element must
            unpack into (user id, item id, true rating, estimate, details).
        n(int): The number of recommendation to output for each user. Default
            is 10.
        max_item_id(mapping with a ``.get`` method, e.g. dict or pandas
            Series, optional): Per-user exclusive upper bound on the item id,
            keyed by raw user id. When omitted, the module-level ``max_id``
            is used, as in the original implementation. Users that have no
            entry are left unfiltered instead of raising ``KeyError``.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n.
    """
    # Local import: the notebook never imports defaultdict, so on a fresh
    # kernel the original function failed with NameError.
    from collections import defaultdict

    if max_item_id is None:
        # Backward-compatible fallback to the notebook-level cutoff table.
        max_item_id = max_id

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)

        # Filter by the cutoff that belongs to this user_id.
        cutoff = max_item_id.get(uid)
        if cutoff is not None:
            user_ratings = [pair for pair in user_ratings if pair[0] < cutoff]

        # Re-assigning an existing key while iterating is safe: the set of
        # keys does not change.
        top_n[uid] = user_ratings[:n]

    return top_n

# USE USER_ID
# Per-user item-id cutoff read by get_top_n (as the global `max_id`):
# start from the largest liked item_id of each user, take a rolling maximum
# over 40 consecutive user_id groups, then add a margin equal to the rolling
# mean (window 10) of (max_id + 30) / (user_id / 10 + 1).
# NOTE(review): the constants 40, 30, 10 are hand-tuned magic numbers -- confirm intent.
max_id = train[train['like'] == 1].groupby('user_id')['item_id'].max().rolling(40, min_periods=1).max()
max_id += ((max_id + 30) / (max_id.index / 10 + 1)).rolling(10, min_periods=1).mean()

# Build the full trainset once and reuse it for both fitting and the
# anti-testset. The original cell referenced an undefined `trainset`, which
# raises NameError on a fresh kernel.
trainset = data.build_full_trainset()

algo = KNNBasic(k=7, verbose=False)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=20)
# One row per user, one column per rank position (0..19).
# NOTE(review): from_dict needs every user's list to have the same length;
# a user left with fewer than 20 items after filtering would make it fail.
y_pred = pd.DataFrame.from_dict({k: [z[0] for z in v] for k, v in top_n.items()}).T.reset_index()
y_pred = y_pred.rename(columns={'index': 'user_id'})

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [214]:
# Compare the column labels of the submission template and the predictions.
(subm.columns, y_pred.columns)

(Index(['user_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
        '12', '13', '14', '15', '16', '17', '18', '19'],
       dtype='object'),
 Index(['user_id', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19],
       dtype='object'))

In [None]:
# The template and the predictions must agree in shape for the positional
# assignment performed in the next cell.
(subm.shape, y_pred.shape)

In [248]:
# Overwrite every column after the first in the submission template with the
# predicted item ids, aligned to the template rows by joining on user_id.
# The values are assigned positionally, so the merge result must contain
# exactly one row per template row.
subm.iloc[:, 1:] = (
    subm.iloc[:, :1]
    .merge(y_pred, on='user_id', how='inner')
    .iloc[:, 1:]
    .to_numpy()
)

In [249]:
# Write the submission without the DataFrame index column.
# `index` is documented as a bool; the original `index=None` only worked
# because None happens to be falsy.
subm.to_csv('input/subm005.csv', index=False)

In [250]:
# Display the filled submission ordered by user_id (the frame itself is not modified).
subm.sort_values(by='user_id')

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
393,0,22,11,15,7,19,18,31,3,21,...,25,26,17,9,5,28,2,6,10,20
152,1,35,22,11,15,5,1,12,7,19,...,26,32,33,28,6,31,2,30,25,36
81,2,30,21,23,35,26,24,34,22,11,...,32,19,33,8,36,13,28,12,17,5
114,3,21,23,35,24,22,14,7,0,37,...,1,6,11,33,5,32,18,39,17,30
210,4,35,24,22,11,15,37,31,8,1,...,13,3,28,25,21,18,7,19,26,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,492,35,65,58,24,66,22,15,72,202,...,186,103,19,49,86,167,98,170,136,37
345,493,35,58,66,22,40,76,72,119,19,...,86,98,37,60,171,110,8,142,199,85
260,494,35,65,58,24,66,22,15,76,72,...,202,75,186,103,19,49,86,167,98,170
332,495,168,205,207,177,4,80,125,43,65,...,22,94,14,69,76,72,174,75,79,147
