In [30]:
import pandas as pd
import numpy as np
from surprise.model_selection import GridSearchCV
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise import SVDpp
from surprise import SVD
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [31]:
# Load the raw ratings table from the working directory.
rating = pd.read_csv('rating.csv')
# Turn the -1 placeholder into NaN so that dropna() below removes those rows.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# `rating.rating` mutates a temporary column object, which is a silent no-op
# under pandas Copy-on-Write. regex matching is not needed for a numeric value.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating = rating.dropna()
# Keep only the rows whose user_id is at most 10000.
df = rating[rating.user_id <= 10000]
df.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0


In [32]:
# Number of occurrences of each rating value, ordered from the highest rating down.
data = df['rating'].value_counts().sort_index(ascending=False)

# Each bar is labelled with its percentage of all rows in df.
percent_of_total = data.values / df.shape[0] * 100
bar_labels = ['{:.1f} %'.format(pct) for pct in percent_of_total]

# Fill colour plus a darker outline for the bars.
bar_style = dict(
    color='rgb(58,200,225)',
    line=dict(color='rgb(8,48,107)', width=1.5),
)

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=bar_labels,
    textposition='auto',
    textfont=dict(color='#000000'),
    marker=bar_style,
    opacity=0.6,
)

# Figure title reports the total number of ratings being plotted.
layout = {
    'title': 'Distribution Of {} anime-ratings'.format(df.shape[0]),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)


In [33]:
# Number of ratings each anime received, with counts above 50 capped at 50.
data = df.groupby('anime_id')['rating'].count().clip(upper=50)

# Create trace: histogram over the capped counts, in bins of width 2 from 0 to 50.
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2),
                     marker = dict(color = 'rgb(58,200,225)',
                                   line = dict(color = 'rgb(8,48,107)',
                                               width = 1.5)),
                     opacity = 0.6)
# Create layout. The title states the actual cap used above (50); it previously
# claimed 100, which contradicted both clip(upper=50) and the bin range.
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Anime (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per Anime'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [34]:
# Ten anime with the most ratings: count rows per anime_id, then sort descending.
ratings_per_anime = df.groupby('anime_id')['rating'].count().reset_index()
ratings_per_anime.sort_values('rating', ascending=False).head(10)


Unnamed: 0,anime_id,rating
1380,1535,4727
6100,16498,3660
5486,11757,3588
1416,1575,3375
201,226,3226
10,20,3062
2553,2904,3019
99,121,3018
4162,6547,2909
3674,5114,2841


In [35]:
# Number of ratings each user gave, with counts above 50 capped at 50.
data = df.groupby('user_id')['rating'].count().clip(upper=50)

# Bins of width 2 spanning 0..50, matching the cap applied above.
count_bins = dict(start=0, end=50, size=2)

# Fill colour plus outline for the bars.
bar_style = dict(
    color='rgb(124,52,95)',
    line=dict(color='rgb(71,01,93)', width=1.5),
)

trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins=count_bins,
    marker=bar_style,
    opacity=0.6,
)

layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 50)',
    xaxis=dict(title='Ratings Per User'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [29]:
# Ten users with the most ratings: count rows per user_id, then sort descending.
ratings_per_user = df.groupby('user_id')['rating'].count().reset_index()
ratings_per_user.sort_values('rating', ascending=False).head(10)


Unnamed: 0,user_id,rating
6953,7345,2429
8558,9032,1702
1436,1530,1584
2761,2951,1412
7792,8217,1403
7696,8115,1338
6731,7114,1227
6209,6569,1188
7113,7511,1180
6861,7247,1156


In [19]:
# Thresholds: an anime / a user is kept only when it has strictly more than
# this many ratings in df.
min_anime_ratings = 50
min_user_ratings = 50

# Ids of the anime that clear the threshold.
anime_rating_counts = df['anime_id'].value_counts()
filter_anime = anime_rating_counts[anime_rating_counts > min_anime_ratings].index.tolist()

# Ids of the users that clear the threshold.
user_rating_counts = df['user_id'].value_counts()
filter_users = user_rating_counts[user_rating_counts > min_user_ratings].index.tolist()

# A row survives only if both its anime and its user passed the filters.
keep_anime = df['anime_id'].isin(filter_anime)
keep_user = df['user_id'].isin(filter_users)
df_new = df[keep_anime & keep_user]

print('The original data frame shape:\t{}'.format(df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(849164, 3)
The new data frame shape:	(700103, 3)


In [20]:
# Ratings are declared to lie on a 1..10 scale.
reader = Reader(rating_scale=(1, 10))
# Hand the filtered frame to Surprise with columns in user, item, rating order.
surprise_columns = ['user_id', 'anime_id', 'rating']
data = Dataset.load_from_df(df_new[surprise_columns], reader)

In [21]:
# Compare the KNN-family algorithms by 3-fold cross-validated RMSE.
benchmark = []
# Iterate over all algorithms
for algorithm in [KNNBaseline(),
                  KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported metric (test_rmse, fit_time, test_time) over the folds.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Take the class name directly instead of parsing the object's repr string.
    algorithm_name = type(algorithm).__name__
    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    tmp = pd.concat([tmp, pd.Series([algorithm_name], index=['Algorithm'])])
    benchmark.append(tmp)

# One row per algorithm, best (lowest) test RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,1.172407,14.757496,82.035948
KNNWithZScore,1.186716,14.747206,81.545469
KNNWithMeans,1.191375,15.414035,85.825091
KNNBasic,1.243835,13.355404,71.896635


In [26]:
# Hold out 25% of the ratings for testing. The split is seeded so the reported
# RMSE is reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
# KNNWithMeans with up to 50 neighbours and pearson_baseline similarity.
# 'user_based': True means similarities are computed between users.
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

test_pred = algo.test(testset)

# The label matches the configuration above: user_based=True is a user-based
# model (the previous label said "Item-based").
print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Item-based Model : Test Set
RMSE: 1.1439


1.1438788699491085