In [30]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
from surprise.model_selection.validation import cross_validate
from surprise import accuracy

In [31]:
data = pd.read_csv('Cung cap HV/Review_new.csv')
data.head()

Unnamed: 0,customer_id,product_id,name,rating,title,content
0,709310,10001012,Lân Nguyễn Hoàng,3,Ko dùng đc thẻ nhớ,Lúcđầu quên thông tin nên dùng 512gb thì ko đc...
1,10701688,10001012,Nguyễn Khánh Hòa,5,Cực kì hài lòng,"Tiki giao hàng nhanh. Sản phẩm đúng như mô tả,..."
2,11763074,10001012,Toàn Phạm Khánh,5,Cực kì hài lòng,"chất lượng camera rõ nét, chống mưa nắng tuyệt..."
3,9909549,10001012,Nguyen Quang Minh,5,Rất hài lòng,"Hàng được đóng gói cẩn thận, giao hàng nhanh ,..."
4,1827148,10001012,Phạm Bá Đức,5,Cực kì hài lòng,"dễ cài đặt, chất lượng tốt, chế độ xem hồng ng..."


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363439 entries, 0 to 363438
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_id  363439 non-null  int64 
 1   product_id   363439 non-null  int64 
 2   name         363009 non-null  object
 3   rating       363439 non-null  int64 
 4   title        363411 non-null  object
 5   content      165161 non-null  object
dtypes: int64(3), object(3)
memory usage: 16.6+ MB


In [33]:
n_ratings = len(data)
n_products = len(data['product_id'].unique())
n_customers = len(data['customer_id'].unique())

In [34]:
display(n_ratings, n_products, n_customers)

363439

4214

251149

In [35]:
data['product_id'].value_counts()

299461      4796
1600005     2643
47321729    2424
405243      2326
8141868     2214
            ... 
57970261       1
76732229       1
1513667        1
26431486       1
73123059       1
Name: product_id, Length: 4214, dtype: int64

In [36]:
reader = Reader()
data_new = Dataset.load_from_df(data[['customer_id','product_id','rating']], reader)

In [37]:
from statistics import mean
def cal_avg_metrics(result):    
    result['test_rmse'] = result['test_rmse'].mean()
    result['test_mae'] = result['test_mae'].mean()    
    result['fit_time'] = round(mean(list(result['fit_time'])),3)
    result['test_time'] = round(mean(list(result['test_time'])),3)
    return result

In [None]:
#SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly

algorithms=[SVD()]
results=[]
names=[]

for algo in algorithms:
    result = cross_validate(algo, data_new, measures = ['RMSE', 'MAE'], cv=5, verbose=True) 
    result = cal_avg_metrics(result)
    results.append(result)
    names.append(algo.__class__.__name__)
df=pd.DataFrame(results)
df.insert(0, 'Algorithm', pd.Series(names))
df.head()

### Tunning Parameter: chỉ dành cho SVD

In [85]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

In [86]:
from surprise.model_selection import GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data_new)

In [87]:
# best RMSE score
print(gs.best_score['rmse'])

0.9731225778979011


In [88]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


### Train model

In [21]:
model = BaselineOnly()

In [22]:
trainset = data_new.build_full_trainset()
model.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x247ecbfbb20>

### Recommender for a specific user

In [23]:
def recommendation (customerid, algorithm, df):
    df_score = df[['product_id']]
    df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(customerid, x).est) 
    df_score = df_score.sort_values(by=['EstimateScore'], ascending=False)
    df_score = df_score.drop_duplicates()
    df_score = df_score[df_score['EstimateScore']>=3][:5]
    return df_score

In [28]:
customer_id = 5917275
df_new = recommendation(customer_id, model, data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(customerid, x).est)


In [29]:
df_new

Unnamed: 0,product_id,EstimateScore
315632,70771651,5.0
238142,53056161,5.0
244249,54017350,5.0
307311,68025746,5.0
320427,71523843,5.0
