# Task
1. Датасет ml-latest
2. Вспомнить подходы, которые мы разбирали
3. Выбрать понравившийся подход к гибридным системам
4. Написать свою

# Load data

In [86]:
!wget "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

--2022-08-12 18:32:23--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.3’


2022-08-12 18:32:23 (8.20 MB/s) - ‘ml-latest-small.zip.3’ saved [978202/978202]



In [87]:
import zipfile

z = zipfile.ZipFile('ml-latest-small.zip')
z.printdir()

File Name                                             Modified             Size
ml-latest-small/                               2018-09-26 15:50:12            0
ml-latest-small/links.csv                      2018-09-26 15:50:10       197979
ml-latest-small/tags.csv                       2018-09-26 15:49:40       118660
ml-latest-small/ratings.csv                    2018-09-26 15:49:38      2483723
ml-latest-small/README.txt                     2018-09-26 15:50:12         8342
ml-latest-small/movies.csv                     2018-09-26 15:49:56       494431


In [88]:
with zipfile.ZipFile('/content/ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [89]:
import pandas as pd

ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [90]:
ratings.shape

(100836, 4)

In [91]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [92]:
movies.shape

(9742, 3)

In [93]:
df = ratings.merge(movies)
print(df.shape)
df.head()

(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# Prepare data

In [94]:
df = df.sort_values('timestamp').reset_index(drop=True)
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,429,595,5.0,828124615,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
1,429,432,3.0,828124615,City Slickers II: The Legend of Curly's Gold (...,Adventure|Comedy|Western
2,429,227,3.0,828124615,Drop Zone (1994),Action|Thriller


In [95]:
train = df.loc[:90000]
test = df.loc[90000:]

In [96]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [97]:
from surprise import Dataset
from surprise import Reader

reader = Reader(rating_scale=(df['rating'].min(), df['rating'].max()))
data_train = Dataset.load_from_df(train[['userId',	'movieId', 'rating']], reader)
data_test = Dataset.load_from_df(test[['userId',	'movieId', 'rating']], reader)

In [98]:
data_train.df.shape, data_test.df.shape

((90001, 3), (10836, 3))

# Model 1 - SVD latent factors

In [99]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import cross_validate

algo = SVD(n_factors=20, random_state=42)
trainset = data_train.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc2f0dd9e10>

In [100]:
from surprise.model_selection import train_test_split

# sample random trainset and testset
trainset, testset = train_test_split(data_test, test_size=1.0)

In [101]:
from surprise import accuracy

predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions, verbose=False)

1.0596496202199897

# Model 2 - based content (genres)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [103]:
tfidf = TfidfVectorizer()
train['genres'] = train['genres'].apply(lambda x: x.replace('|', ' '))
X_train = pd.DataFrame(tfidf.fit_transform(train['genres']).toarray(),
                       columns=tfidf.get_feature_names_out()).reset_index(drop=True)
y_train = train['rating']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [105]:
import lightgbm as lgb

lgbm = lgb.LGBMRegressor()
lgbm.fit(X_train, y_train)

LGBMRegressor()

In [106]:
test['genres'] = test['genres'].apply(lambda x: x.replace('|', ' '))
X_test = pd.DataFrame(tfidf.transform(test['genres']).toarray(),
                        columns=tfidf.get_feature_names_out()).reset_index(drop=True)
y_test = test['rating']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [108]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, lgbm.predict(X_test), squared=False)

1.0751577999871185

# Hybrid recommendation (blending)

In [109]:
import numpy as np

def hybrid_rec(lgbm, X_test: pd.DataFrame, algo, testset: list, w1: float, w2: float)  -> np.array:
    x1 = lgbm.predict(X_test)

    x2 : list = []
    for i in algo.test(testset):
        x2.append(i.est)

    return w1 * np.array(x1) + w2 * np.array(x2)

In [110]:
from sklearn.metrics import mean_squared_error

for i in range(1, 10):
    print(mean_squared_error(y_test, hybrid_rec(lgbm, X_test, algo, testset, w1=i*0.1, w2=1-i*0.1), squared=False))

1.1686963389123448
1.150941150111225
1.1348771474952801
1.1205770652883007
1.1081091954535438
1.0975359808692846
1.088912613579321
1.082285702195606
1.0776920769855214
