<a href="https://colab.research.google.com/github/Oleksandr190378/data-computing/blob/main/Hm7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357264 sha256=4bc6f21de827f1fdc7ca0e90a8107b6ce834b8ff1b61d8215d06b76e144d951a
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the built-in MovieLens 100k ratings.
# prompt=False downloads the archive without asking the interactive [Y/n]
# question, so "Restart & Run All" never blocks waiting for keyboard input.
data = Dataset.load_builtin('ml-100k', prompt=False)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [None]:
# Baseline: hold out 20% of the ratings, fit a default SVD on the remainder and
# score it on the held-out part.
# Both the split and the model's factor initialisation are random; fixing the
# seed on each makes the reported MAE/RMSE repeatable from run to run.
SEED = 42

trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)
algo = SVD(random_state=SEED)
algo.fit(trainset)
predictions = algo.test(testset)

# accuracy.mae / accuracy.rmse print the metric and also return it as a float.
mae = accuracy.mae(predictions)
rmse = accuracy.rmse(predictions)
mae, rmse

MAE:  0.7306
RMSE: 0.9275


(0.7306369865572243, 0.9275343889932054)

In [None]:

from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

# Hyper-parameter grid for SVD: 3 * 2 * 2 * 2 = 24 combinations, each scored
# with 3-fold cross-validation.
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

# Seed NumPy's global RNG before any fold is drawn. Per the Surprise FAQ on
# reproducible experiments, components that are not given an explicit
# random_state fall back to it, so this pins the fold assignment and the SVD
# initialisation for both the search and the follow-up cross-validation.
SEED = 42
np.random.seed(SEED)

# GridSearchCV takes the algorithm *class* and instantiates it once per
# parameter combination, so no SVD instance has to be created here.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print("Best Parameters RMSE:", gs.best_params['rmse'])
print("Best Score RMSE:", gs.best_score['rmse'])
print("\nBest Parameters MAE:", gs.best_params['mae'])
print("Best Score MAE:", gs.best_score['mae'])

# Re-evaluate the RMSE-optimal configuration with 5 folds for a tighter estimate.
best_svd = gs.best_estimator['rmse']
results = cross_validate(best_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print("\nAverage Score RMSE:", results['test_rmse'].mean())
print("Average Score MAE:", results['test_mae'].mean())

Best Parameters RMSE: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}
Best Score RMSE: 0.9339644079403207

Best Parameters MAE: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.1}
Best Score MAE: 0.739639897329237
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9272  0.9214  0.9261  0.9293  0.9215  0.9251  0.0031  
MAE (testset)     0.7347  0.7322  0.7324  0.7341  0.7301  0.7327  0.0016  
Fit time          2.54    2.56    3.31    2.61    2.59    2.72    0.29    
Test time         0.27    0.12    0.20    0.12    0.12    0.17    0.06    

Average Score RMSE: 0.9250941332851552
Average Score MAE: 0.7326990527835989


In [None]:
from surprise import Dataset, SVDpp, NMF, NormalPredictor

# Compare several prediction algorithms (default settings) with 3-fold
# cross-validation and rank them by mean RMSE.

# Seed NumPy's global RNG so fold assignment, factor initialisation and
# NormalPredictor's random draws are repeatable (per the Surprise FAQ on
# reproducible experiments, unseeded components fall back to this RNG).
SEED = 42
np.random.seed(SEED)

algorithms = [
    SVD(),
    SVDpp(),
    NMF(),
    NormalPredictor()
]

benchmark = []
for algorithm in algorithms:
    # A dedicated name keeps the `results` of the earlier tuned-SVD cell intact.
    cv_scores = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    # Average every reported column (test_rmse, test_mae, fit_time, test_time) over the folds.
    mean_scores = pd.DataFrame.from_dict(cv_scores).mean(axis=0)
    # Take the class name directly instead of parsing the object's string repr,
    # which would break for any algorithm that defines its own __str__/__repr__.
    mean_scores['Algorithm'] = type(algorithm).__name__
    benchmark.append(mean_scores)
df_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

print(df_results)
# Lowest mean RMSE wins.
best_algorithm = df_results['test_rmse'].idxmin()
print(f"\nBest algorithm: {best_algorithm}")
print(f"RMSE: {df_results.loc[best_algorithm, 'test_rmse']:.4f}")
print(f"MAE: {df_results.loc[best_algorithm, 'test_mae']:.4f}")

                 test_rmse  test_mae   fit_time  test_time
Algorithm                                                 
SVDpp             0.925759  0.727585  19.161381   7.986858
SVD               0.944765  0.745889   1.153443   0.304730
NMF               0.976761  0.766878   1.762883   0.287655
NormalPredictor   1.523722  1.224030   0.124394   0.288393

Best algorithm: SVDpp
RMSE: 0.9258
MAE: 0.7276
