In [145]:
import pandas as pd
from IPython.display import display, clear_output
from typing import List, Dict, Union
from pathlib import Path
from rectools import Columns
import math
from sklearn.model_selection import train_test_split
import torch
from scipy.sparse import csr_matrix
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import json

In [146]:
DATA_PATH = Path("/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train")
users_df = pd.read_csv(DATA_PATH / 'users.csv')
items_df = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

In [147]:
interactions_df = interactions.drop(columns='total_dur')
interactions_df['watched_pct'] = interactions_df['watched_pct'] / 100
interactions_df.rename(columns={'user_id': Columns.User, 'item_id': Columns.Item,
                                'last_watch_dt': Columns.Datetime, 'watched_pct': Columns.Weight}, inplace=True)
interactions_df.dropna(inplace=True)

In [148]:
interactions_df

Unnamed: 0,user_id,item_id,datetime,weight
0,176549,9506,2021-05-11,0.72
1,699317,1659,2021-05-29,1.00
2,656683,7107,2021-05-09,0.00
3,864613,7638,2021-07-05,1.00
4,964868,9506,2021-04-30,1.00
...,...,...,...,...
5476246,648596,12225,2021-08-13,0.00
5476247,546862,9673,2021-04-13,0.49
5476248,697262,15297,2021-08-20,0.63
5476249,384202,16197,2021-04-19,1.00


In [149]:
interactions_df = interactions_df[interactions_df[Columns.Datetime] < '2021-04-01']
interactions_df.shape

(263794, 4)

In [150]:
users_interactions_count_df = interactions_df.groupby(['user_id', 'item_id']).size().groupby('user_id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['user_id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 86605
# users with at least 5 interactions: 14557


In [151]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 263794
# of interactions from users with at least 5 interactions: 142586


In [152]:
interactions_from_selected_users_df

Unnamed: 0,user_id,item_id,datetime,weight
0,21,12261,2021-03-30,1.00
1,21,15997,2021-03-28,1.00
2,21,4345,2021-03-30,1.00
3,21,10283,2021-03-20,1.00
4,21,849,2021-03-30,0.82
...,...,...,...,...
142581,1097530,11237,2021-03-13,0.94
142582,1097530,849,2021-03-23,0.34
142583,1097530,11512,2021-03-15,0.01
142584,1097530,8386,2021-03-15,1.00


In [153]:
def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['user_id', 'item_id'])[Columns.Weight].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 142586


Unnamed: 0,user_id,item_id,weight
0,21,849,0.863938
1,21,4345,1.0
2,21,10283,1.0
3,21,12261,1.0
4,21,15997,1.0
5,32,952,0.722466
6,32,4382,0.378512
7,32,4807,1.0
8,32,10436,1.0
9,32,12132,1.0


In [154]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['user_id'],
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 114068
# interactions on Test set: 28518


In [155]:
interactions_full_indexed_df = interactions_full_df.set_index('user_id')
interactions_train_indexed_df = interactions_train_df.set_index('user_id')
interactions_test_indexed_df = interactions_test_df.set_index('user_id')

In [156]:
interactions_full_indexed_df

Unnamed: 0_level_0,item_id,weight
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,849,0.863938
21,4345,1.000000
21,10283,1.000000
21,12261,1.000000
21,15997,1.000000
...,...,...
1097530,849,0.422233
1097530,4155,0.000000
1097530,8386,1.000000
1097530,11237,0.956057


In [157]:
def get_items_interacted(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]['item_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

### Модель

In [158]:
SEED = 42
LR = 1e-3
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 50
GAMMA = 0.9995
BATCH_SIZE = 3000
EVAL_BATCH_SIZE = 3000
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ALPHA = 0.000002

In [159]:
interactions_train_df

Unnamed: 0,user_id,item_id,weight
118603,912569,12248,0.014355
132345,1018293,16174,1.000000
7771,59037,14576,0.056584
111698,860942,4475,1.000000
27380,208326,4471,1.000000
...,...,...,...
119816,921437,14359,0.150560
23641,182319,9132,0.464668
109281,841121,2358,0.056584
14547,110832,10974,0.432959


In [160]:
interactions_test_indexed_df.reset_index()

Unnamed: 0,user_id,item_id,weight
0,311215,7029,0.070389
1,614188,11231,0.400538
2,270804,9132,0.985500
3,190245,16135,0.594549
4,1007513,8618,0.584963
...,...,...,...
28513,786701,5669,0.042644
28514,556782,3095,0.933573
28515,693897,162,0.807355
28516,630614,11675,1.000000


In [161]:
total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())
total_df['user_id'], users_keys = total_df.user_id.factorize()
total_df['item_id'], items_keys = total_df.item_id.factorize()

train_encoded = total_df.iloc[:len(interactions_train_df)].values
test_encoded = total_df.iloc[len(interactions_train_df):].values

  total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())


In [162]:
users_key_dict = {}
for i in range(len(users_keys)):
    users_key_dict[users_keys[i]] = i

In [163]:
shape = [int(total_df['user_id'].max()+1), int(total_df['item_id'].max()+1)]
X_train = csr_matrix((train_encoded[:, 2], (train_encoded[:, 0], train_encoded[:, 1])), shape=shape).toarray()
X_test = csr_matrix((test_encoded[:, 2], (test_encoded[:, 0], test_encoded[:, 1])), shape=shape).toarray()

In [164]:
class UserOrientedDataset(Dataset):
    def __init__(self, X):
        super().__init__()
        self.X = X.astype(np.float32)
        self.len = len(X)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.X[index]

In [165]:
train_dl = DataLoader(
    UserOrientedDataset(X_train),
    batch_size = BATCH_SIZE,
    shuffle = True
)

test_dl = DataLoader(
    UserOrientedDataset(X_test),
    batch_size = EVAL_BATCH_SIZE,
    shuffle = False
)

dls = {'train': train_dl, 'test': test_dl}


In [166]:
class Encoder(nn.Module):
    def __init__(self, input_size: int, hidden_size: int) -> None:
        super(Encoder, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input_size, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout1(x)
        x = self.relu(self.fc2(x))
        return x

class Decoder(nn.Module):
    def __init__(self, hidden_size: int, output_size: int) -> None:
        super(Decoder, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.relu(self.fc2(x))
        return x

class Autoencoder(nn.Module):
    def __init__(self, input_size: int = 8248, hidden_size: int = 128) -> None:
        super(Autoencoder, self).__init__()
        self.encoder = Encoder(input_size, hidden_size)
        self.decoder = Decoder(hidden_size, input_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [167]:
torch.manual_seed(SEED)

model = Autoencoder()
model.to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

def rmse_for_sparse(x_pred, x_true):
    mask = (x_true > 0)
    sq_diff = (x_pred * mask - x_true) ** 2
    mse = sq_diff.sum() / mask.sum()
    return mse ** (1/2)

In [168]:
metrics_dict = {
    "Epoch": [],
    "Train RMSE": [],
    "Test RMSE": [],
}

for epoch in range(NUM_EPOCHS):
    metrics_dict["Epoch"].append(epoch)
    for stage in ['train', 'test']:
        with torch.set_grad_enabled(stage == 'train'):
            if stage == 'train':
                model.train()
            else:
                model.eval()

            loss_at_stage = 0
            for batch in dls[stage]:
                batch = batch.to(DEVICE)
                x_pred = model(batch)
                loss = rmse_for_sparse(x_pred, batch)
                if stage == "train":
                    loss.backward()
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                loss_at_stage += loss.item() * len(batch)
            rmse_at_stage = (loss_at_stage / len(dls[stage].dataset)) ** (1/2)
            metrics_dict[f"{stage.title()} RMSE"].append(rmse_at_stage)

    if (epoch == NUM_EPOCHS - 1) or epoch % 10 == 9:
        clear_output(wait=True)
        display(pd.DataFrame(metrics_dict))

Unnamed: 0,Epoch,Train RMSE,Test RMSE
0,0,0.816113,0.850001
1,1,0.773413,0.840054
2,2,0.742213,0.826252
3,3,0.713708,0.809251
4,4,0.689716,0.789249
5,5,0.666499,0.76543
6,6,0.645937,0.739405
7,7,0.624002,0.711159
8,8,0.604461,0.685098
9,9,0.585675,0.662482


In [169]:
model.eval()
torch.save(model.state_dict(), '/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/autoencoder.ckpt')

In [170]:
model = Autoencoder()
model.load_state_dict(torch.load('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/autoencoder.ckpt'))

<All keys matched successfully>

In [171]:
model.eval()
with torch.no_grad():
    X_pred = model(torch.Tensor(X_train + X_test).to(DEVICE)).cpu().detach().numpy()

In [172]:
popular = np.load('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/popular.npy')

In [182]:
class AutoencoderRecommender:
    def __init__(
        self,
        X_preds: np.ndarray,
        X_train_and_val: np.ndarray,
        X_test: np.ndarray,
        users_key_dict: Dict[int, int],
        items_keys: np.ndarray,
        popular: np.ndarray
    ) -> None:
        self.X_preds = X_preds
        self.X_train = X_train_and_val
        self.X_test = X_test
        self.X_total = X_train_and_val + X_test
        self.users_key_dict = users_key_dict
        self.items_keys = np.array(items_keys)
        self.popular = popular

    def recommend_items(self, user_id: int, items_to_select_idx: np.ndarray, topn: int = 10) -> np.ndarray:
        user_preds = self.X_preds[user_id][items_to_select_idx]
        items_idx = items_to_select_idx[np.argsort(-user_preds)[:topn]]
        return items_idx

    def evaluate(self) -> Dict[str, float]:
        true_5, true_10 = [], []

        for user_id in range(len(self.X_test)):
            non_zero = np.argwhere(self.X_test[user_id] > 0).ravel()
            all_nonzero = np.argwhere(self.X_total[user_id] > 0).ravel()
            select_from = np.setdiff1d(np.arange(self.X_total.shape[1]), all_nonzero)

            for non_zero_idx in non_zero:
                random_non_interacted_100_items = np.random.choice(select_from, size=100, replace=False)
                preds = self.recommend_items(user_id, np.append(random_non_interacted_100_items, non_zero_idx), topn=10)
                true_5.append(non_zero_idx in preds[:5])
                true_10.append(non_zero_idx in preds)

        return {"recall@5": np.mean(true_5), "recall@10": np.mean(true_10)}

    def add_popular(self, item_ids: np.ndarray) -> np.ndarray:
        mask = ~np.isin(self.popular, item_ids)
        filtered_popular = self.popular[mask]
        combined = np.concatenate([item_ids, filtered_popular])
        combined = combined.astype(int)
        result = combined[:self.N]
        return result

    def recommend(self, user_id: Union[int, str], topn: int = 10) -> List[int]:
        uid = self.users_key_dict.get(user_id, None)
        if uid is None:
            return self.popular[:topn].tolist()
        all_nonzero = np.argwhere(self.X_total[uid] > 0).ravel()
        select_from = np.setdiff1d(np.arange(self.X_total.shape[1]), all_nonzero)
        preds = self.X_preds[uid][select_from]
        items_idx = select_from[np.argsort(-preds)[:topn]]
        items = [self.items_keys[item_idx] for item_idx in items_idx]
        if len(items) < topn:
            items = self.add_popular(items)
        return items

    def save(self, file_path: str) -> None:
        np.save(file_path + 'autoencoder_x_train.npy', self.X_train)
        np.save(file_path + 'autoencoder_x_test.npy', self.X_test)
        np.save(file_path + 'autoencoder_x_pred.npy', self.X_preds)
        self.users_key_dict = {int(key): item for key, item in self.users_key_dict.items()}
        with open(file_path + 'autoencoder_users_key_dict.json', 'w') as json_file:
            json.dump(self.users_key_dict, json_file)
        np.save(file_path + 'autoencoder_items_keys.npy', self.items_keys)

    @classmethod
    def load(cls, file_path: str) -> 'AutoencoderRecommender':
        X_train = np.load(file_path + 'autoencoder_x_train.npy')
        X_test = np.load(file_path + 'autoencoder_x_test.npy')
        X_preds = np.load(file_path + 'autoencoder_x_pred.npy')

        with open(file_path + 'autoencoder_users_key_dict.json', 'r') as json_file:
            users_key_dict = json.load(json_file)
            users_key_dict = {int(k): int(v) for k, v in users_key_dict.items()}

        items_keys = np.load(file_path + 'autoencoder_items_keys.npy')
        popular = np.load(file_path + 'popular.npy')

        return AutoencoderRecommender(X_preds, X_train, X_test, users_key_dict, items_keys, popular)

In [183]:
ae_recommender_model = AutoencoderRecommender(X_pred, X_train, X_test, users_key_dict, items_keys, popular)

In [177]:
ae_recommender_model.evaluate()

{'recall@5': 0.17729332708235662, 'recall@10': 0.30352398812314424}

In [184]:
ae_recommender_model.recommend(122432)

[10927, 3646, 2837, 921, 6425, 2384, 12576, 14280, 6986, 10535]

In [185]:
ae_recommender_model.save('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/')

In [186]:
loaded_model = AutoencoderRecommender.load('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/service/api/recsys/models/')

In [187]:
loaded_model.recommend(122432)

[10927, 3646, 2837, 921, 6425, 2384, 12576, 14280, 6986, 10535]