In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import seaborn as sns
import os
import sys



from scipy.sparse import csr_matrix


import warnings


warnings.filterwarnings("ignore")

In [23]:
!gdown --fuzzy https://drive.google.com/file/d/1G8uZQ0bad20fVjWhCQmBnnVvwlL-ZsDJ/view?usp=share_link

Downloading...
From: https://drive.google.com/uc?id=1G8uZQ0bad20fVjWhCQmBnnVvwlL-ZsDJ
To: /content/train_joke_df.csv
  0% 0.00/21.4M [00:00<?, ?B/s] 47% 9.96M/21.4M [00:00<00:00, 98.7MB/s]100% 21.4M/21.4M [00:00<00:00, 118MB/s] 


In [24]:
df = pd.read_csv("/content/train_joke_df.csv")

In [25]:
df = df.sort_values("UID").reset_index()
df.head()

Unnamed: 0,index,UID,JID,Rating
0,885661,1,45,-7.14
1,219554,1,36,4.95
2,1045552,1,6,-8.5
3,349879,1,61,8.59
4,32265,1,53,3.2


In [26]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

In [28]:
from surprise import NMF, SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, accuracy

In [29]:
reader = Reader(rating_scale=(-10, 10))


data = Dataset.load_from_df(df[['UID', 'JID', 'Rating']], reader)

In [30]:
from surprise.model_selection import train_test_split

In [31]:
trainset_data = data.build_full_trainset()


trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [32]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f50ca181a80>

In [33]:
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 4.1647


4.164689870321724

In [34]:
def get_num_user_ratings(uid):
 
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: 
        return 0
    
def get_num_item_ratings(iid):

    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    


In [36]:
trainset = algo.trainset

predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

predictions_df['№ кол-во пользовательских рейтингов'] = predictions_df.uid.apply(get_num_user_ratings)
predictions_df['№ кол-во рейтингов элементов'] = predictions_df.iid.apply(get_num_item_ratings)
predictions_df['error'] = abs(predictions_df.est - predictions_df.rui)

best_predictions = predictions_df.sort_values(by='error')[:10]
worst_predictions = predictions_df.sort_values(by='error')[-10:]

In [37]:
best_predictions.head(5)

Unnamed: 0,uid,iid,rui,est,details,№ кол-во пользовательских рейтингов,№ кол-во рейтингов элементов,error
232593,11261,38,-0.68,-0.680003,{'was_impossible': False},45,14520,3e-06
176174,5234,19,-4.71,-4.709966,{'was_impossible': False},65,15899,3.4e-05
103483,22530,77,8.64,8.639947,{'was_impossible': False},60,5731,5.3e-05
246305,12431,48,2.23,2.229929,{'was_impossible': False},64,15747,7.1e-05
172100,14517,65,0.1,0.100119,{'was_impossible': False},63,15826,0.000119


In [38]:
!gdown --fuzzy https://drive.google.com/file/d/1xI6VmQFIvOUMDrati5bVBr6WvyiqJPb-/view?usp=share_link

Downloading...
From: https://drive.google.com/uc?id=1xI6VmQFIvOUMDrati5bVBr6WvyiqJPb-
To: /content/test_joke_df_nofactrating.csv
  0% 0.00/5.49M [00:00<?, ?B/s]100% 5.49M/5.49M [00:00<00:00, 232MB/s]


In [39]:
test = pd.read_csv('/content/test_joke_df_nofactrating.csv', index_col=0)
test.head(5)

Unnamed: 0_level_0,UID,JID
InteractionID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11228,39
1,21724,85
2,16782,56
3,12105,42
4,14427,2


In [40]:
test['Rating'] = test[['UID', 'JID']].apply(lambda x: algo.predict(x[0], x[1], verbose=False).est,
                                                      axis = 1)
                                                      

In [41]:
test['Rating'].to_frame().head(5)

Unnamed: 0_level_0,Rating
InteractionID,Unnamed: 1_level_1
0,3.069921
1,-3.637244
2,1.477107
3,5.1222
4,4.727609


In [42]:
test['Rating'].to_frame().to_csv('baseline.csv')