In [None]:
from typing import Iterable, List

import pickle
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

# One seed for every random number generator the notebook touches, so that
# repeated "Restart & Run All" executions draw the same random numbers.
SEED = 0

torch.manual_seed(SEED)           # default torch generator
torch.cuda.manual_seed(SEED)      # current CUDA device
torch.cuda.manual_seed_all(SEED)  # every CUDA device
np.random.seed(SEED)              # legacy global NumPy generator

class MaxFactorDataset(Dataset):
    """Serves shuffled mini-batches of (user, item) interactions with random negatives.

    The object acts as its own batch loader: ``init_params`` fixes the batch
    geometry, iterating over the dataset reshuffles the interactions and yields
    one tuple per batch, and ``dataset[k]`` returns the k-th batch of the
    current shuffle. ``init_params`` must be called before batches are requested.
    """

    def __init__(
            self,
            users: Iterable[int],
            items: Iterable[int],
            device: str = 'cpu',
    ):
        """
        :param users: user index of every interaction
        :param items: item index of every interaction, aligned with ``users``
        :param device: device on which the yielded batches are placed
        """
        self.device = device
        # The full interaction tensors stay on the CPU; only batch slices are moved.
        self.users = torch.LongTensor(users)
        self.items = torch.LongTensor(items)
        self.num_interactions = len(users)
        # Tensor reduction instead of the builtin ``max`` (which walks the input in Python).
        self.num_items = int(self.items.max()) + 1

        # Populated by ``init_params`` / ``__iter__``.
        self.index = None
        self.batch_size = None
        self.neg_sample = None
        self.num_batches = None
        self.targets = None

    def init_params(self, batch_size: int, neg_sample: int):
        """
        Fix the batch geometry.

        :param batch_size: number of interactions per batch (the last batch may be smaller)
        :param neg_sample: number of random negative items drawn per interaction
        """
        self.batch_size = batch_size
        self.neg_sample = neg_sample
        # Exact integer ceil-division: number of batches needed to cover all interactions.
        self.num_batches = (self.num_interactions + batch_size - 1) // batch_size
        # All-zero class indices; sliced per batch in ``__getitem__``.
        self.targets = torch.zeros(self.batch_size, dtype=torch.long)

    def __getitem__(self, batch_num):
        """
        Build batch number ``batch_num`` of the current shuffle.

        :return: tuple ``(users, items_pos, items_neg, targets)`` on ``self.device``;
            ``items_pos`` has shape (size, 1), ``items_neg`` has shape
            (size, neg_sample) and ``targets`` holds ``size`` zeros
        """
        if self.index is None:
            # Direct indexing before any iteration: create the shuffle lazily
            # instead of failing on ``None``.
            self.index = torch.randperm(self.num_interactions)

        start = batch_num * self.batch_size
        size = min(self.num_interactions - start, self.batch_size)

        # ``users`` / ``items`` live on the CPU, so the gather is done with a CPU
        # index and only the gathered slices are transferred to the target device.
        index = self.index[start: start + size]
        users = self.users[index].to(self.device)
        items_pos = self.items[index].to(self.device).reshape(-1, 1)

        # Negatives are drawn uniformly from [0, num_items); they are not checked
        # against the user's positive items.
        items_neg = torch.randint(
            high=self.num_items,
            size=(size, self.neg_sample),
            device=self.device,
        )
        targets = self.targets[:size].to(self.device)

        return (
            users,
            items_pos,
            items_neg,
            targets,
        )

    def __iter__(self):
        """Reshuffle the interactions and yield every batch once."""
        self.index = torch.randperm(self.num_interactions)
        for batch_num in range(self.num_batches):
            yield self[batch_num]

    def __len__(self):
        """Number of batches per pass (set by ``init_params``)."""
        return self.num_batches


class MaxFactorModel(torch.nn.Module):
    """Dot-product factorisation model over user and item embedding tables.

    Training (``fit``) minimises cross-entropy over the scores of each
    positive item concatenated with its negatives, plus a penalty that pulls the
    squared L2 norm of every item embedding towards 1. Optionally the negatives
    are narrowed down to the highest-scoring ("hard") ones before each step.
    """

    # Upper bound on the number of (user, item) scores computed in one
    # recommendation chunk; same value as the original ``200 ** 3 / 4``.
    _SCORES_PER_CHUNK = 200 ** 3 // 4

    def __init__(
            self,
            num_users: int,
            num_items: int,
            dim: int,
            learning_rate: float,
            device: str = 'cpu',
    ):
        """
        :param num_users: number of rows of the user embedding table
        :param num_items: number of rows of the item embedding table
        :param dim: embedding dimension
        :param learning_rate: learning rate of the Adagrad optimizer created here
        :param device: device the embedding tables are placed on
        """
        super().__init__()

        # Set by ``fit``.
        self.negative_sampling_batch_size = None
        self.hard_neg_sample = None
        self.device = device

        self.item_embeddings = torch.nn.Embedding(num_items, dim).to(self.device)
        self.user_embeddings = torch.nn.Embedding(num_users, dim).to(self.device)
        torch.nn.init.xavier_uniform_(self.item_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.user_embeddings.weight)
        self.optimizer = torch.optim.Adagrad(self.parameters(), lr=learning_rate)

    def get_hard_negatives(self, users, items_neg):
        """
        Keep, for every user, the ``hard_neg_sample`` highest-scoring negatives.

        Scoring is done without gradients, in chunks of
        ``negative_sampling_batch_size`` rows.

        :param users: user indices, one per row of ``items_neg``
        :param items_neg: candidate negative item indices, shape (batch, candidates)
        :return: item indices of shape (batch, k), ordered by descending score,
            where k = min(hard_neg_sample, candidates)
        """
        # Fall back to a single chunk when no chunk size was configured
        # (the original passed ``None`` to ``range`` and raised TypeError).
        chunk = self.negative_sampling_batch_size or max(len(users), 1)
        # topk raises when asked for more entries than there are candidates.
        k = min(self.hard_neg_sample, items_neg.shape[1])

        selected = []
        with torch.no_grad():
            for start in range(0, len(users), chunk):
                candidates = items_neg[start: start + chunk]
                scores = self(users[start: start + chunk], candidates)
                top_positions = torch.topk(scores, k)[1]
                selected.append(candidates.gather(1, top_positions))

        if not selected:
            # Empty batch: nothing to score, return an (empty) slice of matching width.
            return items_neg[:, :k]
        return torch.cat(selected, dim=0)

    def _fit(
            self,
            dataset: 'MaxFactorDataset',
            epochs: int,
            learning_rate: float,
            penalty_alpha: float,
    ):
        """
        Run the optimisation loop.

        :param dataset: iterable yielding ``(users, items_pos, items_neg, targets)``
        :param epochs: number of passes over ``dataset``
        :param learning_rate: NOTE(review): not used here; the optimizer keeps the
            rate it was constructed with in ``__init__``
        :param penalty_alpha: weight of the item-embedding norm penalty
        """
        loss_function = torch.nn.CrossEntropyLoss()
        for _ in range(epochs):
            for users, items_pos, items_neg, targets in dataset:
                self.optimizer.zero_grad()
                if self.hard_neg_sample:
                    items_neg = self.get_hard_negatives(users, items_neg)
                # Positive item first, negatives after it.
                items = torch.cat([items_pos, items_neg], dim=1)
                # Mean squared deviation of each item embedding's squared norm from 1.
                penalty = (((self.item_embeddings.weight ** 2).sum(1) - 1) ** 2).mean()
                score = self(users, items)
                loss = loss_function(score, targets) + penalty * penalty_alpha
                loss.backward()
                self.optimizer.step()

    def fit(
            self,
            dataset: 'MaxFactorDataset',
            epochs: int,
            batch_size: int,
            neg_sample: int,
            negative_sampling_batch_size: int = None,
            hard_neg_sample: int = None,
            learning_rate: float = 0.015,
            penalty_alpha: float = 0.003,
    ):
        """
        Configure ``dataset`` and train the model on it.

        :param dataset: dataset to train on; its ``init_params`` is called here
        :param epochs: number of passes over the dataset
        :param batch_size: interactions per batch
        :param neg_sample: random negatives drawn per interaction
        :param negative_sampling_batch_size: rows scored at once when selecting
            hard negatives; ``None`` scores each batch in one go
        :param hard_neg_sample: number of hard negatives kept per interaction;
            a falsy value disables hard-negative selection
        :param learning_rate: forwarded to ``_fit``, which does not use it (see there)
        :param penalty_alpha: weight of the item-embedding norm penalty
        """
        dataset.init_params(batch_size, neg_sample)
        self.negative_sampling_batch_size = negative_sampling_batch_size
        self.hard_neg_sample = hard_neg_sample

        self._fit(dataset, epochs, learning_rate, penalty_alpha)

    def forward(self, users: torch.LongTensor, items: torch.LongTensor) -> torch.FloatTensor:
        """
        Score each user against its own row of items.

        :param users: user indices, shape (batch,)
        :param items: item indices, shape (batch, n)
        :return: dot products, shape (batch, n)
        """
        user_embeddings = self.user_embeddings(users).unsqueeze(2)
        item_embeddings = self.item_embeddings(items)
        score = torch.bmm(item_embeddings, user_embeddings).squeeze(2)

        return score

    def predict(self, users: torch.LongTensor, items: torch.LongTensor) -> torch.FloatTensor:
        """
        Score every user against every item.

        :param users: user indices, shape (num_users,)
        :param items: item indices, shape (num_items,)
        :return: dot products, shape (num_users, num_items)
        """
        user_embeddings = self.user_embeddings(users)
        item_embeddings = self.item_embeddings(items).t()
        score = torch.mm(user_embeddings, item_embeddings)

        return score

    def _create_recommendations(
            self,
            target_users: Iterable[int],
            target_items: Iterable[int],
            num_recommendations: int,
    ):
        """
        Top-k items for one chunk of users.

        :return: flat numpy arrays ``(users, items, scores)``; each user is
            repeated k times, k = min(num_recommendations, number of items)
        """
        target_users = torch.LongTensor(target_users).to(self.device)
        target_items = torch.LongTensor(target_items).to(self.device)

        topk = min(num_recommendations, target_items.shape[0])

        with torch.no_grad():
            res = self.predict(target_users, target_items)
            recom = torch.topk(res, topk)
            # topk positions index into ``target_items``; map them back to item ids.
            items = target_items[recom[1]].flatten()
            scores = recom[0].flatten()
            users = target_users.reshape(-1, 1).repeat(1, topk).flatten()

        users = users.cpu().detach().numpy()
        items = items.cpu().detach().numpy()
        scores = scores.cpu().detach().numpy()

        return users, items, scores

    def create_recommendations(
            self,
            target_users: Iterable[int],
            target_items: Iterable[int],
            num_recommendations: int,
    ) -> (np.array, np.array, np.array):
        """
        Top-k items for every target user, computed in chunks of users.

        :param target_users: user indices to recommend for
        :param target_items: candidate item indices
        :param num_recommendations: maximum number of items per user
        :return: flat arrays ``(users, items, scores)`` of equal length; empty
            arrays when there are no users or no candidate items
        """
        num_items = len(target_items)
        if len(target_users) == 0 or num_items == 0:
            # Nothing to score; the original raised (hstack of nothing / division by zero).
            return (
                np.empty(0, dtype=np.int64),
                np.empty(0, dtype=np.int64),
                np.empty(0, dtype=np.float32),
            )

        # Users per chunk so that one score matrix stays within the budget.
        # At least one user, otherwise ``range`` below gets a zero step once the
        # number of items exceeds the budget.
        num_batch_users = max(1, self._SCORES_PER_CHUNK // num_items)

        all_users = []
        all_items = []
        all_scores = []

        for start in range(0, len(target_users), num_batch_users):
            users, items, scores = self._create_recommendations(
                target_users[start:start + num_batch_users],
                target_items,
                num_recommendations,
            )

            all_users.append(users)
            all_items.append(items)
            all_scores.append(scores)

        all_users = np.hstack(all_users)
        all_items = np.hstack(all_items)
        all_scores = np.hstack(all_scores)

        return all_users, all_items, all_scores
    
    
class MaxFactorRecommender:
    """Pipeline around ``MaxFactorModel``: encode ids, train, recommend, decode ids.

    ``config`` is a dict read with the keys ``'dim'``, ``'device'``,
    ``'num_recommendations'`` and ``'fit_params'`` (keyword arguments of
    ``MaxFactorModel.fit``; its ``'learning_rate'`` also configures the optimizer).
    """

    def __init__(self, config):
        """
        :param config: configuration dict, see the class docstring
        """
        self.config = config

        self.cnt = {}  # 'users' / 'items' -> number of encoded ids
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        self.recs = None
        self.train_set = None
        self.dataset = None
        self.model = None
        self.already_seen = None

    def init_model(self):
        """Build the dataset and the model from the encoded ``train_set``."""
        self.dataset = MaxFactorDataset(
            users=self.train_set['user'].values,
            items=self.train_set['item'].values,
            device=self.config['device'],
        )

        self.model = MaxFactorModel(
            num_users=self.cnt['users'],
            num_items=self.cnt['items'],
            dim=self.config['dim'],
            learning_rate=self.config['fit_params']['learning_rate'],
            device=self.config['device'],
        )

    def encode_ids(self):
        """
        Add label-encoded ``user`` / ``item`` columns to ``train_set`` (in place)
        and record the id counts and the set of seen (user, item) pairs.
        """
        self.train_set['user'] = self.user_encoder.fit_transform(self.train_set['user_id'])
        self.train_set['item'] = self.item_encoder.fit_transform(self.train_set['cluster_id'])
        # Bracket access: attribute access would break if pandas ever grew a
        # DataFrame attribute named ``item`` or ``user``.
        self.cnt['items'] = self.train_set['item'].max() + 1
        self.cnt['users'] = self.train_set['user'].max() + 1
        self.already_seen = self.get_user_item_id(
            user_col=self.train_set['user'],
            item_col=self.train_set['item'],
        ).drop_duplicates().values

    def decode_ids(self):
        """Map the encoded ``user`` / ``item`` columns of ``recs`` back to original ids."""
        self.recs['user_id'] = self.user_encoder.classes_[self.recs['user'].to_numpy()]
        self.recs['cluster_id'] = self.item_encoder.classes_[self.recs['item'].to_numpy()]

    def fit(self):
        """Train the model with ``config['fit_params']``."""
        self.model.fit(
            dataset=self.dataset,
            **self.config['fit_params'],
        )

    def torch_recommend(self, users):
        """
        Score all known items for ``users`` and store the top ones in ``self.recs``.

        :param users: original user ids; they are passed through the fitted
            user encoder's ``transform``
        """
        target_users = self.user_encoder.transform(users)
        target_items = np.arange(self.cnt['items'])
        rec_users, rec_items, rec_scores = self.model.create_recommendations(
            target_users,
            target_items,
            self.config['num_recommendations'],
        )

        self.recs = pd.DataFrame({
            'user': rec_users.astype(np.int32),
            # int32 rather than the former uint16: uint16 silently wrapped every
            # item index >= 65536 to a different (wrong) item.
            'item': rec_items.astype(np.int32),
            'score': rec_scores,
        })

    @staticmethod
    def get_user_item_id(user_col: pd.Series, item_col: pd.Series) -> pd.Series:
        """
        Combine a user and an item index into one integer key: ``item * 10**8 + user``.

        The key is only unique while user indices stay below 10**8.
        """
        return item_col.astype(np.int64) * (10 ** 8) + user_col

    @staticmethod
    def apply_rank(col, df):
        """
        Positional rank (0, 1, 2, ...) of each row within its ``col`` group.

        Group sizes are taken in order of first appearance and laid out back to
        back, so the result lines up with ``df`` only when the rows of each
        group are contiguous.

        :return: array of ranks, or an empty list for an empty frame
        """
        if len(df) == 0:
            return []
        _, first_seen, group_sizes = np.unique(df[col], return_counts=True, return_index=True)
        # np.unique sorts by value; restore the order in which groups first appear.
        group_sizes = group_sizes[first_seen.argsort()]
        counter = np.arange(group_sizes.max(), dtype=int)
        ranks = np.hstack([counter[:size] for size in group_sizes])
        return ranks

    def filter_seen_recs(self):
        """
        Add the per-user rank column ``rnk`` to ``recs``.

        Despite the name, nothing is filtered at the moment: the removal of
        already-seen (user, item) pairs is commented out below.
        """
        # self.recs['ui'] = self.get_user_item_id(
        #     user_col=self.recs['user'],
        #     item_col=self.recs['item'],
        # )
        # seen = self.recs.ui.isin(self.already_seen)
        # self.recs = self.recs[~seen]
        self.recs['rnk'] = self.apply_rank('user', self.recs)

    def create_recommendations(
            self,
            train_set: pd.DataFrame,
            users: Iterable[int],
    ) -> pd.DataFrame:
        """
        Train on ``train_set`` and recommend items for ``users``.

        :param train_set: interactions with ``user_id`` and ``cluster_id`` columns;
            encoded ``user`` / ``item`` columns are added to it in place
        :param users: original user ids to recommend for
        :return
        pd.DataFrame({
            user_id: [1, 2, 3],
            cluster_id: [4, 5, 6],
            score: [0.1, 0.3, -0.2],
        })
        The ``rnk`` column is computed on ``self.recs`` but is not part of the
        returned frame.
        """
        self.train_set = train_set
        self.encode_ids()
        self.init_model()
        self.fit()
        self.torch_recommend(users)
        self.filter_seen_recs()
        self.decode_ids()
        return self.recs[['user_id', 'cluster_id', 'score']]

In [None]:
# Colab only: mount Google Drive at /content/drive, remounting if it is already mounted.
# NOTE(review): the data directory used below is the relative path
# 'drive/MyDrive/sber', which presumably relies on the working directory
# being /content - confirm when running elsewhere.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Directory holding every input and output file of this notebook.
DIR = 'drive/MyDrive/sber'

# Files of the validation split.
TRAIN_VAL_PATH, VAL_PATH, RECS_VAL_PATH, VAL_USER_IDS_PATH = (
    os.path.join(DIR, file_name)
    for file_name in (
        'train_val1.parquet',
        'val1.parquet',
        'recs_val1.parquet',
        'val1_user_ids.parquet',
    )
)

# Files of the test split.
TRAIN_TEST_PATH, RECS_TEST_PATH, TEST_IDS_PATH = (
    os.path.join(DIR, file_name)
    for file_name in (
        'train_test.parquet',
        'recs_test.parquet',
        'test_ids.csv',
    )
)

# Pickled recommender written by fit_and_save_mf_model().
MF_MODEL_PATH = os.path.join(DIR, 'mf_model.pkl')

In [None]:
# Settings consumed by MaxFactorRecommender.
config = dict(
    dim=384,                  # embedding dimension
    device='cuda',            # device for the model and the training batches
    num_recommendations=160,  # items requested per user
    # Passed as keyword arguments to MaxFactorModel.fit; 'learning_rate' is
    # additionally used when the model's optimizer is created.
    fit_params=dict(
        epochs=6,
        batch_size=40000,
        neg_sample=40,
        learning_rate=0.02,
        negative_sampling_batch_size=10000,
        hard_neg_sample=40,
        penalty_alpha=0.003,
    ),
)

def create_candidates_val():
    """
    Train a recommender on the validation-time training set and store its
    recommendations for the validation users in RECS_VAL_PATH.
    """
    interactions = pd.read_parquet(TRAIN_VAL_PATH)
    target_user_ids = pd.read_parquet(VAL_USER_IDS_PATH)['user_id']

    candidates = MaxFactorRecommender(config).create_recommendations(
        train_set=interactions,
        users=target_user_ids,
    )
    candidates.to_parquet(RECS_VAL_PATH)

def fit_and_save_mf_model():
    """
    Train a recommender on TRAIN_TEST_PATH and pickle it to MF_MODEL_PATH.

    Before pickling, the training frame, the dataset and the optimizer are
    removed from the recommender and the model is moved to the CPU.
    """
    train_test = pd.read_parquet(TRAIN_TEST_PATH)
    recommender = MaxFactorRecommender(config)
    recommender.train_set = train_test
    recommender.encode_ids()
    recommender.init_model()
    # The dataset has been built; drop the frame so it is not pickled.
    del recommender.train_set
    recommender.fit()

    # Strip state that is only needed for training.
    del recommender.dataset
    del recommender.model.optimizer

    # Store a CPU copy of the model.
    recommender.model.cpu()
    recommender.device = 'cpu'
    recommender.model.device = 'cpu'

    # Context manager so the file is flushed and closed even if pickling fails
    # (the original left the handle returned by open() unclosed).
    with open(MF_MODEL_PATH, 'wb') as model_file:
        pickle.dump(recommender, model_file)

In [None]:
# Train on TRAIN_VAL_PATH and write the validation candidates to RECS_VAL_PATH.
create_candidates_val()

In [None]:
# Train on TRAIN_TEST_PATH and pickle the fitted recommender to MF_MODEL_PATH.
fit_and_save_mf_model()