## load dependency

In [18]:
import sys
sys.path.append("../../")
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd

from reco_utils.common.timer import Timer
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.7.1 (default, Dec 14 2018, 13:28:58) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
Surprise version: 1.1.1


## load data

In [30]:
user = pd.read_csv('../dataset/dataset1/user.csv')
user.head()

Unnamed: 0,评分,用户名,评论时间,用户ID,电影名,类型
0,2,身似,2018-01-05 15:05:06,1,心雨花露,爱情
1,4,有意识的贱民,2018-01-05 15:05:06,3,战争的恐怖,战争
2,2,亿万露电,2018-01-05 15:05:06,4,豪勇七蛟龙,战争
3,2,Marni,2018-01-05 15:05:06,5,无序之主,犯罪
4,4,马西嘻嘻嘻,2018-01-05 15:05:06,6,时装店风波,同性


In [31]:
#电影名转换成ID
from sklearn.preprocessing import LabelEncoder
label_encode = LabelEncoder()
user['movie_id'] = label_encode.fit_transform(user['电影名'])
user.head()

Unnamed: 0,评分,用户名,评论时间,用户ID,电影名,类型,movie_id
0,2,身似,2018-01-05 15:05:06,1,心雨花露,爱情,9076
1,4,有意识的贱民,2018-01-05 15:05:06,3,战争的恐怖,战争,10394
2,2,亿万露电,2018-01-05 15:05:06,4,豪勇七蛟龙,战争,19710
3,2,Marni,2018-01-05 15:05:06,5,无序之主,犯罪,11332
4,4,马西嘻嘻嘻,2018-01-05 15:05:06,6,时装店风波,同性,11557


To movie-lens format 

In [32]:
score_set = user[['用户ID','movie_id','评分','评论时间']]

In [33]:
score_set.columns=['user_id','movie_id','score','date']

In [34]:
score_set.loc[:,'date']=pd.to_datetime(score_set.loc[:,'date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [35]:
score_set.loc[:,('date')] = score_set['date'].apply(lambda x:x.timestamp())

In [36]:
score_set.head()

Unnamed: 0,user_id,movie_id,score,date
0,1,9076,2,1515165000.0
1,3,10394,4,1515165000.0
2,4,19710,2,1515165000.0
3,5,11332,2,1515165000.0
4,6,11557,4,1515165000.0


In [37]:
score_set.drop(['date'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [38]:
score_set.head()

Unnamed: 0,user_id,movie_id,score
0,1,9076,2
1,3,10394,4
2,4,19710,2
3,5,11332,2
4,6,11557,4


In [39]:
data=score_set.loc[:3000,:]

In [40]:
data['score'].value_counts()

8     1182
4      897
10     701
2      219
6        2
Name: score, dtype: int64

## train svd model

In [41]:
train, test = python_random_split(data, 0.75)

In [42]:
train.shape

(2250, 3)

In [43]:
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader(rating_scale=(1,11))).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x7ffa75172860>

In [44]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 0.3268772049996187 seconds for training.


## prediction

In [45]:
predictions = predict(svd, test, usercol='user_id', itemcol='movie_id')
predictions.head()

Unnamed: 0,user_id,movie_id,prediction
0,46,14388,6.612542
1,1844,17193,7.0144
2,187,16967,6.605654
3,453,12204,6.899556
4,3963,861,6.899556


In [46]:
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd, train, usercol='user_id', itemcol='movie_id', remove_seen=True)
    
print("Took {} seconds for prediction.".format(test_time.interval))

Took 30.91035301499869 seconds for prediction.


In [47]:
all_predictions.head()

Unnamed: 0,user_id,movie_id,prediction
2250,579,9493,7.425809
2251,579,14660,7.227335
2252,579,19530,6.592365
2253,579,16759,6.956443
2254,579,14014,6.992031
