In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import wandb
from tqdm.autonotebook import tqdm
from transformers import get_cosine_schedule_with_warmup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.set_option("display.max_columns", None)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

In [5]:
sentence_model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

In [17]:
train_dataset = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table.csv').drop(columns = ['Unnamed: 0'])
train_dataset = train_dataset.rename(columns={'watchtime': 'target'})
item_features = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Item_features.csv').drop(columns = ['Unnamed: 0', 'row_number'])
test = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Interaction_table_test.csv').drop(columns = ['Unnamed: 0'])

#train_dataset = pd.merge(train_dataset, item_features, on='video_id', how='left')
#test = pd.merge(test, item_features, on='video_id', how='left')

categorical = ['region', 'city', 'category_id', 'author_id', 'long_video']
embbedings = ['title', 'description']
numeric = [c for c in (item_features.columns.values.tolist() + train_dataset.columns.values.tolist()) if c not in categorical + embbedings +['video_id', 'user_id', 'target']]
item_features[embbedings] = item_features[embbedings].fillna('<PAD>')

print('Ready!')

Ready!


In [6]:
DEBUG = True

if DEBUG:
    epochs = 1
    max_users = 10
else:
    epochs = 10
    max_users = 10000


In [7]:
filtered_users = (
    train_dataset.groupby('user_id')
    .agg(
        total_interactions=('video_id', 'size'),
        positive_interactions=('target', lambda x: (x == 1).sum()),
        negative_interactions=('target', lambda x: (x == 0).sum())
    )
    .query('total_interactions >= 3 and positive_interactions >= 1 and negative_interactions >= 1')
    .index
)
# train_dataset = train_dataset[train_dataset['user_id'].isin(filtered_users)]
# train_dataset.reset_index(drop=True, 
#                inplace=True)

random_users = random.sample(list(filtered_users), max_users)
X_train, X_val = train_test_split(random_users, test_size=0.2, random_state=42)

train_dataset = train_dataset[train_dataset['user_id'].isin(X_train)]
val_dataset = train_dataset[train_dataset['user_id'].isin(X_val)]

train_dataset.reset_index(drop=True, 
               inplace=True)

val_dataset.reset_index(drop=True, 
               inplace=True)

In [11]:
label_encoder = LabelEncoder()

cat_PAD = {}
label_encoder_holder = {}
for cat in categorical:
    label_encoder = LabelEncoder()
    train_dataset[cat] = f'{cat}_' + train_dataset[cat].astype(str)

    label_encoder.fit(train_dataset[cat].values.tolist() + [f'{cat}_PAD'])
    label_encoder_holder[cat] = label_encoder
    cat_PAD[cat] = label_encoder.transform([f'{cat}_PAD']).tolist()[0]
    # train_dataset[cat] = label_encoder.transform(train_dataset[[cat]])

    # def encode_category(category):
    #     if category in label_encoder.classes_:
    #         return category
    #     else:
    #         return f'{cat}_PAD'

    # test[cat] = test[cat].apply(encode_category)

    # test[cat] = label_encoder.transform(test[[cat]])

In [175]:
class DSSMDataset(Dataset):
    def __init__(self, train_dataset, item_features, users, numeric, categorical, embbedings, label_encoder_holder, sentence_model, trainfor = True, predictfor = False):
        self.train_dataset = train_dataset
        if predictfor == False: 
            self.train_dataset = train_dataset[train_dataset['user_id'].isin(users)]
            self.train_dataset.reset_index(drop=True,inplace=True)
        
        self.item_features =item_features
        self.users = users
        self.numeric = numeric
        self.categorical = categorical
        self.embbedings = embbedings
        self.label_encoder_holder = label_encoder_holder
        self.sentence_model = sentence_model
        
        self.trainfor = trainfor
        self.predictfor = predictfor
        
    def __getitem__(self, idx: int):
        if self.trainfor == True and self.predictfor == False:
            user = self.users[idx]
            users_iter = self.train_dataset[self.train_dataset['user_id'] == user].copy()
            users_iter = pd.merge(users_iter, self.item_features, on='video_id', how='left')
            users_iter = users_iter.drop(columns = ['user_id', 'video_id'])
        
            for col in self.categorical:
                users_iter[col] = f'{col}_' + users_iter[col].astype(str)
            
            positive = users_iter[users_iter['target'] == 0].sample(n=1).drop(columns = ['target'])
            negative = users_iter[users_iter['target'] == 1].sample(n=1).drop(columns = ['target'])
            anchor = users_iter[~users_iter.index.isin(positive.index) & ~users_iter.index.isin(negative.index)]
            anchor, key_padding_mask  = self.pad_interactions(anchor)
        
            return [[self.extract_features(positive, False)], [self.extract_features(negative, False)], [self.extract_features(anchor, True), key_padding_mask]]
         
        elif self.trainfor == False and self.predictfor == False: 
            user = self.users[idx]
            users_iter = self.train_dataset[self.train_dataset['user_id'] == user].copy()
            users_iter = pd.merge(users_iter, self.item_features, on='video_id', how='left')
            users_iter = users_iter.drop(columns = ['user_id', 'video_id'])
        
            for col in self.categorical:
                users_iter[col] = f'{col}_' + users_iter[col].astype(str)
            
            interaction = users_iter[users_iter['target']==random.randint(0, 1)].sample(n=1)
            targets = interaction['target'].values
            interaction = interaction.drop(columns = ['target'])
            
            history = users_iter[~users_iter.index.isin(interaction.index)]
            history, key_padding_mask  = self.pad_interactions(history)
        
            return [[self.extract_features(interaction, False)], [self.extract_features(history, True), key_padding_mask]], targets
            
        elif self.trainfor == False and self.predictfor == True:
            interaction = self.train_dataset.iloc[[idx]]
            interaction = pd.merge(interaction, self.item_features, on='video_id', how='left')
            interaction = interaction.drop(columns = ['target'])
            for col in self.categorical:
                interaction[col] = f'{col}_' + interaction[col].astype(str)
            
            users_iter = self.train_dataset[self.train_dataset['user_id'] == interaction['user_id']].copy()
            users_iter = pd.merge(users_iter, self.item_features, on='video_id', how='left')
            users_iter = users_iter.drop(columns = ['user_id', 'video_id'])
            history = users_iter[~users_iter.index.isin(interaction.index)]
            for col in self.categorical:
                history[col] = f'{col}_' + history[col].astype(str)
            
            history, key_padding_mask  = self.pad_interactions(history)
        
            return [[self.extract_features(interaction, False)], [self.extract_features(history, True), key_padding_mask]]
    
    def extract_features(self, data, is_anchor):
        if is_anchor == True:
            numeric_data = data[(self.numeric + ['target'])].values
        if is_anchor == False:
            numeric_data = data[(self.numeric)].values

        for col in self.categorical:
            data[col] = data[col].apply(lambda value: self.encode_category(value , col))
            data[col] = data[col].apply(lambda value: self.add_padding_with_prob(value, col))
            data[col] = self.label_encoder_holder[col].transform(data[[col]]).astype(int)
        categorical_data = data[self.categorical].values
        first_iter = True
        text_data = [] 
        for seq in data[self.embbedings].values:
            if first_iter:
                if is_anchor:
                    text_data = np.array([self.sentence_model.encode(seq)])
                else:
                    text_data = np.array(self.sentence_model.encode(seq))
                first_iter = False
            else:
                sentence = np.array([self.sentence_model.encode(seq)])
                text_data = np.concatenate((text_data, sentence), axis=0)

        return numeric_data, categorical_data, text_data

    def encode_category(self, category, col):
        if category in self.label_encoder_holder[col].classes_:
            return category
        else:
            return f'{cat}_PAD'

    def add_padding_with_prob(self, value, col):
        if np.random.rand() < 0.001:  # 0.1% шанс
            return f'{col}_PAD'
        return value
        
    def pad_interactions(self, user_interactions, target_length=20):
        num_missing = target_length - len(user_interactions)     
        if num_missing > 0:
            pad_layers =  user_interactions[:1]
            pad_layers = pd.concat([pad_layers] * num_missing, ignore_index=True)
            
            for col in self.numeric:
                pad_layers[col] = -1
                
            for col in self.categorical:
                pad_layers[col] = f'{col}_PAD'

            for col in self.embbedings:
                pad_layers[col] = '<PAD>'

        key_padding_mask = torch.cat([
        torch.zeros(target_length - num_missing, dtype=torch.bool),  
            torch.ones(num_missing, dtype=torch.bool)
        ])
        
        user_interactions = pd.concat([user_interactions, pad_layers], ignore_index=True)
        return user_interactions.head(target_length), key_padding_mask
        
    def __len__(self):
        if self.predictfor:
            return len(self.train_dataset)
        else:
            return len(self.users)

len_of_cat_embb = sum(len(enc.classes_) for enc in label_encoder_holder.values())

In [176]:
Dataset_train = DSSMDataset(train_dataset, item_features, X_train, numeric, categorical, embbedings, label_encoder_holder, sentence_model)
Dataloader_train = DataLoader(Dataset_train, batch_size=64, shuffle=True)

Dataset_val = DSSMDataset(train_dataset, item_features, X_val, numeric, categorical, embbedings, label_encoder_holder, sentence_model, trainfor = True)
Dataloader_val = DataLoader(Dataset_train, batch_size=32, shuffle=False)

Dataset_test = DSSMDataset(test, item_features, X_val, numeric, categorical, embbedings, label_encoder_holder, sentence_model, trainfor = False, predictfor = True)
Dataloader_test = DataLoader(Dataset_train, batch_size=64, shuffle=False)

In [179]:
class History_head(nn.Module):
    def __init__(self,input_dim, ountput_dim, model_dim = 512, nhead = 8, num_layers = 10, dim_feedforward = 1024):
        super(History_head, self).__init__()
        self.input_layer = nn.Linear(input_dim, model_dim)
        self.transformer = nn.Transformer(
            d_model=model_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first = True
        )
        self.output_layer_1 = nn.Linear(model_dim, ountput_dim)
        self.activation_out_1 = nn.GELU()
        self.output_layer_2 = nn.Linear(model_dim, ountput_dim)
        self.activation_out_2 = nn.GELU()

    def forward(self, X, pad_mask):
        X = X.float()
        out = self.input_layer(X)
        out = self.transformer(out, out, src_key_padding_mask = pad_mask, tgt_key_padding_mask = pad_mask)
        out = out[:, :(~pad_mask).float().sum().int(), :]
        out = torch.mean(out, dim = 1)

        out = self.output_layer_1(out)
        out = self.activation_out_1(out)
        out = self.output_layer_2(out)
        out = self.activation_out_2(out)

        return out

class Interaction_head(nn.Module):
    def __init__(self,input_dim, output_dim, model_dim = 512, nhead = 8):
        super(Interaction_head, self).__init__()
        self.input_layer = nn.Linear(input_dim, model_dim)

        self.attention_layer1 = nn.MultiheadAttention(embed_dim=model_dim, num_heads=nhead, batch_first=True)

        self.hidden_layer_1 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_1 = nn.GELU()
        self.hidden_layer_2 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_2 = nn.GELU()
        self.hidden_layer_3 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_3 = nn.GELU()

        self.attention_layer2 = nn.MultiheadAttention(embed_dim=model_dim, num_heads=nhead, batch_first=True)

        self.output_layer_1 = nn.Linear(model_dim, model_dim)
        self.activation_out_1 = nn.GELU()
        self.output_layer_2 = nn.Linear(model_dim, output_dim)

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, X):
        X = X.float()
        out = self.input_layer(X)
        out_ = out

        out, _ = self.attention_layer1(out, out, out)

        out = self.hidden_layer_1(out)
        out = self.activation_hidden_1(out)
        out = self.hidden_layer_2(out)
        out = self.activation_hidden_2(out)
        out = self.hidden_layer_3(out)
        out = self.activation_hidden_3(out)

        out, _ = self.attention_layer2(out, out, out)

        out = self.output_layer_1(out)
        out = self.activation_out_1(out)
        out = out + out_
        out = self.output_layer_2(out)

        out = self.dropout(out)

        return out


class Text_MLP(nn.Module):
    def __init__(self,input_dim, output_dim, hist = False, model_dim = 512,):
        super(Text_MLP, self).__init__()
        self.hist = hist

        self.input_layer = nn.Linear(input_dim, model_dim)
        self.activation_input_1 = nn.GELU()

        self.hidden_layer_1 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_1 = nn.GELU()
        self.hidden_layer_2 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_2 = nn.GELU()

        self.output_layer_1 = nn.Linear(model_dim, output_dim)
        self.output_layer_2 = nn.Linear(model_dim, output_dim)

        self.dropout = nn.Dropout(p=0.1)

    def forward(self, X):
        out = self.input_layer(X)
        out = self.activation_input_1(out)
        if self.hist == False:
            out_1 = out[:, 0,:]
            out_2 = out[:, 1, :]
        else:
            out_1 = out[:, :, 0, :]
            out_2 = out[:, :, 1, :]

        out_1_ = out_1
        out_2_ = out_2

        out_1 = self.hidden_layer_1(out_1)
        out_1 = self.activation_hidden_1(out_1)

        out_2 = self.hidden_layer_2(out_2)
        out_2 = self.activation_hidden_2(out_2)


        out_1 = out_1 + out_1_
        out_2 = out_2 + out_2_
        out_1 = self.output_layer_1(out_1)
        out_2 = self.output_layer_2(out_2)

        out = (out_1 + out_2) / 2

        if self.hist == True:
            out = torch.mean(out, dim = 1)

        out = self.dropout(out)

        return out


class Final_heads(nn.Module):
    def __init__(self,input_dim, output_dim, model_dim):
        super(Final_heads, self).__init__()
        self.input_layer = nn.Linear(input_dim, model_dim)
        self.activation_input_1 = nn.GELU()

        self.hidden_layer_1 = nn.Linear(model_dim, model_dim)
        self.activation_hidden_1 = nn.GELU()

        self.output_layer_2 = nn.Linear(model_dim, output_dim)
        self.dropout = nn.Dropout(p=0.1)

        self.norm = nn.LayerNorm(output_dim)

    def forward(self, data_emb, text_emb):

        out = torch.cat((data_emb, text_emb), dim = 1)

        out = self.input_layer(out)
        out = self.activation_input_1(out)
        out_ = out

        out = self.hidden_layer_1(out)
        out = self.activation_hidden_1(out)

        out = out + out_
        out = self.output_layer_2(out)

        out = self.dropout(out)

        out = self.norm(out)
        return out

class Yassm_model(nn.Module):
    def __init__(self, d_model, txt_dim, num_of_cat, cat_emb_dim, embbedings_counts, num_of_num, emb_distance_f = nn.CosineSimilarity()):
        super(Yassm_model, self).__init__()
        self.embeddings_cat = nn.Embedding(embbedings_counts, cat_emb_dim)
        self.input_dim = int(cat_emb_dim * num_of_cat + num_of_num)

        self.History_head = History_head(self.input_dim + 1, d_model, d_model)
        self.Interaction_head = Interaction_head(self.input_dim, d_model, d_model)
        self.Text_MLP_Hist = Text_MLP(txt_dim, d_model, hist = True)
        self.Text_MLP_Iter = Text_MLP(txt_dim, d_model)
        self.Final_heads_Hist = Final_heads(2 * d_model, 2 * d_model, 2 * d_model)
        self.Final_heads_Iter = Final_heads(2 * d_model, 2 * d_model, 2 * d_model)

        self.emb_distance = emb_distance_f

    def positive_negative_class(self, data):
        cat_emb = self.embeddings_cat(data[0][1].to(device)).squeeze(1).flatten(start_dim=1,end_dim=2)
        num_emb = data[0][0].squeeze(1).to(device)
        output = torch.cat((num_emb, cat_emb), dim=1).to(device)

        txt_emb = data[0][2].to(device)

        return output, txt_emb


    def anchor_class(self, data):
        cat_emb = self.embeddings_cat(data[0][1].to(device)).flatten(start_dim=2, end_dim=-1)
        num_emb = data[0][0].to(device)
        output = torch.cat((num_emb, cat_emb), dim=2).to(device)

        txt_emb = data[0][2].to(device)

        pad_mask = data[1].to(device)

        return output, txt_emb, pad_mask
        

    def forward(self, pos, neg, anchor):
        pos_data, pos_text_emb = self.positive_negative_class(pos)
        neg_data, neg_text_emb = self.positive_negative_class(neg)

        anchor_data, anchor_text_emb, pad_mask = self.anchor_class(anchor)

        pos_out = self.Interaction_head(pos_data)
        pos_out_text = self.Text_MLP_Iter(pos_text_emb)
        pos_emb = self.Final_heads_Iter(pos_out, pos_out_text)

        neg_out = self.Interaction_head(neg_data)
        neg_out_text = self.Text_MLP_Iter(neg_text_emb)
        neg_emb = self.Final_heads_Iter(neg_out, neg_out_text)

        anchor_out = self.History_head(anchor_data, pad_mask)
        anchor_out_text = self.Text_MLP_Hist(anchor_text_emb)
        anchor_emb = self.Final_heads_Iter(anchor_out, anchor_out_text)

        return pos_emb, neg_emb, anchor_emb
        

    def predict(self, interaction, history,):
        interaction_data, interaction_text_emb = self.positive_negative_class(interaction)

        interaction_out = self.Interaction_head(interaction_data)
        interaction_out_text = self.Text_MLP_Iter(interaction_text_emb)
        interaction_emb = self.Final_heads_Iter(interaction_out, interaction_out_text)

        history_data, history_text_emb = self.positive_negative_class(history)

        history_out = self.History_head(history_data)
        history_out_text = self.Text_MLP_Hist(history_text_emb)
        history_emb = self.Final_heads_Iter(history_out, history_out_text)

        res = self.emb_distance(interaction_emb, history_emb)

        return res

class Yassm():
    def __init__(self, d_model = 512, txt_dim = 1024, num_of_cat = 1, cat_emb_dim = 64, embbedings_counts = 1, num_of_num = 1):
        self.model = Yassm_model(d_model, txt_dim, num_of_cat, cat_emb_dim, embbedings_counts, num_of_num)
        self.model = self.model.to(device)

    def fit(self, dataset, validation, epochs):
        wandb.login(key = 'bb574f54db03f89674e1dba7770189c8f56e5a26')
        wandb.init(project='Yassm-recsys')
        optimizer = torch.optim.Adam(self.model.parameters(), lr=5e-2)
        criterion = nn.TripletMarginLoss()
        # lambda_lr = lambda epoch: 0.7 ** epoch
        # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_lr)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

        for epoch in range(epochs):
            self.model.train()
            for pos, neg, anchor in tqdm(dataset, desc=f'Training epoch: {epoch + 1}', colour="cyan"):
                
                optimizer.zero_grad()
                pos_emb, neg_emb, anchor_emb = self.model(pos, neg, anchor)
                loss = criterion(anchor_emb, pos_emb, neg_emb)
                loss.backward()
                optimizer.step()

                wandb.log({"loss": loss})
            scheduler.step()

            self.model.eval()
            metric = []
            loss_val = nn.BCELoss()
            with torch.no_grad():
                for interaction, history, targets in tqdm(validation, desc='Validation', colour="green"):

                    model_output = self.model.predict(interaction, history)

                    loss_metric = loss_val(model_output, targets)
                    metric.append(torch.mean(loss_metric, dim = 0).item())

            metric = sum(metric)/len(metric)
            wandb.log({"Val_metric": metric})

            wandb.log({"epoch": epoch + 1})

            PATH = '/home/pret/PycharmProjects/Vseros_classification/Models/Yassm.pt'
            torch.save(self.model.state_dict(), PATH)
            
    def predict(self, dataset):
        self.model.eval()
        with torch.no_grad():
            predict = []
            for interaction, history in tqdm(dataset, desc='Predicting', colour="green"):
                interaction = interaction.to(device)
                history = history.to(device)

                model_output = self.model.predict(interaction, history)

                predict = predict + model_output.tolist()

        return predict


In [180]:
model = Yassm(num_of_cat = len(categorical), num_of_num = len(numeric), embbedings_counts = len_of_cat_embb)
model.fit(Dataloader_train, Dataloader_val, epochs)



0,1
loss,


Training epoch: 1:   0%|          | 0/1 [00:00<?, ?it/s]

Validation:   0%|          | 0/1 [00:00<?, ?it/s]

AttributeError: 'list' object has no attribute 'to'