# Task understanding

Задача: загрузить признаки, накопленные в разных CSV-файлах, и найти лучшую комбинацию этих признаков, максимизируя заданную метрику (F1-weighted). При этом задачу нужно решить в рамках имеющихся ресурсов по ОЗУ.

Заметил, что для роста метрики достаточно сосредоточиться на признаке "title", получив его различные эмбеддинги.

В память помещается около 4000 признаков при сохранении полного датасета; распределим их по размерности эмбеддингов: 768+768+768+768+(1024 или 768).

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hyperparameters and global variables

In [28]:
# Training-set embedding CSVs (one file per embedding model / text field).
prefix = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/X/'
PATH_BASE = prefix + 'X_title_sberbankroberta.csv'
PATH_DF1 = prefix + 'X_title_DeepPavlov.csv'
PATH_DF2 = prefix + 'X_description_DeepPavlov.csv'
PATH_DF3 = prefix + 'X_title_cointegrated_rubert_base_cased_nli_twoway.csv'
PATH_DF4 = prefix + 'X_title_Den4ikAI_ruBert_base_intent_detection.csv'

# Test-set embedding CSVs, listed in the same order as the training files above.
prefix_test = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/FI/'
PATH_BASE_TEST = prefix_test + 'test_title_sberbank_ai_ruRoberta_large.csv'
PATH_DF1_TEST = prefix_test + 'test_title_DeepPavlov_rubert_base_cased_conversational.csv'
PATH_DF2_TEST = prefix_test + 'test_description_DeepPavlov_rubert_base_cased_conversational.csv'
PATH_DF3_TEST = prefix_test + 'test_title_cointegrated_rubert_base_cased_nli_twoway.csv'
PATH_DF4_TEST = prefix_test + 'test_title_Den4ikAI_ruBert_base_intent_detection.csv'

# 4096 * 8 * 64 == 2 ** 21 == 2_097_152 rows per batch.
BATCH_SIZE = 2 ** 21
# Where the trained weights (state_dict) are written and later read back.
PATH = '/content/model.torch'

In [4]:
# Saved array of label-encoder classes; loaded with np.load at inference time
# and assigned to LabelEncoder.classes_ to decode predicted class indices.
PATH_CLASSES = '/content/drive/MyDrive/Colab Notebooks/PetProject 2023/test_kazan_express/X/classes.npy'

In [5]:
class param:
    """Namespace of hyperparameters and fixed model dimensions used by the notebook."""
    test_size = 0.33  # fraction of rows held out for validation by train_test_split
    lr = 3e-4  # Adam learning rate
    num_epochs = 100  # upper bound on training epochs
    seed = 42  # random_state passed to train_test_split
    weight_decay = 1e-6  # weight_decay passed to Adam
    dropout = 0.5  # drop probability of the Dropout layer in Net
    num_classes = 874  # output size used when the model is rebuilt for inference
    num_inputs = 4101  # input feature count used when the model is rebuilt for inference

# Load data

In [6]:
# Colab-only step: mount Google Drive so the CSV paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load dataframe

In [7]:
# Read the base frame and the four auxiliary embedding frames in a fixed order.
df, df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_BASE, PATH_DF1, PATH_DF2, PATH_DF3, PATH_DF4)
)

# Non-embedding columns that are stripped from the auxiliary frames before
# they are joined onto the base frame.
drop_columns = [
    'product_id', 'category_id', 'sale',
    'shop_id', 'shop_title', 'rating', 'text_fields',
    'category_name', 'title', 'description', 'attributes',
    'custom_characteristics', 'defined_characteristics',
    'filters', 'target', 'bool_attributes',
    'bool_custom_characteristics', 'bool_defined_characteristics',
    'bool_filters', 'bool_keys_defined_characteristics',
    'keys_defined_characteristics', 'clothing_size', 'color',
]

In [8]:
# Strip the non-embedding columns one frame at a time, so at most one extra
# copy of a frame is alive at any moment.
df1 = df1.drop(columns=drop_columns)
df2 = df2.drop(columns=drop_columns)
df3 = df3.drop(columns=drop_columns)
df4 = df4.drop(columns=drop_columns)
print(df.shape, df1.shape, df2.shape, df3.shape, df4.shape)

# Align every frame on a fresh 0..n-1 index and glue them together column-wise.
df = (
    df.reset_index(drop=True)
    .join(df1.reset_index(drop=True))
    .join(df2.reset_index(drop=True))
    .join(df3.reset_index(drop=True))
    .join(df4.reset_index(drop=True))
)
df.shape

(91120, 1047) (91120, 768) (91120, 768) (91120, 768) (91120, 768)


(91120, 4119)

In [9]:
# The auxiliary frames are now part of `df`; drop the standalone references.
del df1, df2, df3, df4

## Split train and val

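Учтём, что есть классы, представленные в одном экземпляре: стратифицированное разбиение для них невозможно, поэтому такие строки откладываем и позже целиком добавляем в обучающую выборку.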

In [10]:
print(df.shape)
# Targets that occur exactly once are set aside before the stratified split.
target_counts = df['target'].value_counts()
singleton_targets = target_counts[target_counts == 1].index.values
X_one_example = df[df['target'].isin(singleton_targets)]
df_for_train = df.drop(labels=list(X_one_example.index.values), axis=0)
print(df_for_train.shape)

(91120, 4119)
(91116, 4119)


In [11]:
# Columns of the joined frame that are not model inputs (identifiers, raw text,
# the target itself and a few derived fields).
drop_column_base_df = [
    'product_id', 'category_id', 'sale',
    'shop_id', 'shop_title', 'rating', 'text_fields',
    'category_name', 'title', 'description', 'attributes',
    'custom_characteristics', 'defined_characteristics',
    'filters', 'target', 'bool_defined_characteristics',
    'bool_keys_defined_characteristics', 'keys_defined_characteristics',
]
# Labels stay a one-column DataFrame; features are everything not listed above.
y = df_for_train[['target']]
X = df_for_train.drop(columns=drop_column_base_df)
X.shape, y.shape

((91116, 4101), (91116, 1))

In [12]:
# X / y now carry everything needed; release the wide source frames.
del df, df_for_train

In [13]:
# Stratified hold-out split driven by the settings in `param`.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=param.test_size,
    random_state=param.seed,
    shuffle=True,
    stratify=y,
)
tuple(len(part) for part in (X_train, X_val, y_train, y_val))

(61047, 30069, 61047, 30069)

In [14]:
# Split the set-aside single-occurrence rows into labels and features ...
y_one_example = X_one_example[['target']]
X_one_example = X_one_example.drop(columns=drop_column_base_df)

# ... and append them to the training part only.
X_train = pd.concat((X_train, X_one_example), axis=0)
print(X_train.shape)

y_train = pd.concat((y_train, y_one_example), axis=0)
print(y_train.shape)

(61051, 4101)
(61051, 1)


In [15]:
# Only the train/val splits are needed from here on.
del X, y, y_one_example, X_one_example

In [16]:
# Wrap the pandas splits as tensors. Labels are converted straight to int64
# (no detour through float32, which cannot represent every integer exactly)
# and keep their (n, 1) shape.
dataset_train = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_train.values),
    torch.as_tensor(y_train.values, dtype=torch.long),
)
dataset_val = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_val.values),
    torch.as_tensor(y_val.values, dtype=torch.long),
)

# Shuffle only the training data; evaluation does not depend on sample order,
# so the validation loader iterates deterministically.
train = torch.utils.data.DataLoader(dataset_train, BATCH_SIZE, shuffle=True)
val = torch.utils.data.DataLoader(dataset_val, BATCH_SIZE, shuffle=False)

# Modeling and Evaluation

In [17]:
# Model dimensions derived from the training split.
num_classes = np.unique(y_train).size  # number of distinct target values
num_inputs = X_train.shape[1]  # number of feature columns
print(num_classes)
print(num_inputs)

874
4101


In [18]:
class Net(torch.nn.Module):
    """Fully connected classifier with two hidden layers.

    Each hidden stage applies Linear -> ReLU -> BatchNorm1d -> Dropout; the
    last Linear layer returns raw (unnormalised) class logits.

    Args:
        num_inputs: size of the input feature vector.
        hidden_size1: number of units in the first hidden layer.
        hidden_size2: number of units in the second hidden layer.
        num_outputs: number of logits produced; defaults to the module-level
            ``num_classes`` as it is when the class is defined.
    """

    def __init__(self, num_inputs, hidden_size1, hidden_size2, num_outputs=num_classes):
        super().__init__()
        # Creation order fixes both the random-initialisation sequence and the
        # order in which submodules are listed when the model is printed.
        self.linear1 = nn.Linear(num_inputs, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)
        # A single Dropout module (rate from param.dropout) is shared by both stages.
        self.drop = nn.Dropout(param.dropout)

    def forward(self, x):
        """Return class logits for a batch ``x`` (expected shape: batch x num_inputs)."""
        hidden = F.relu(self.linear1(x))
        hidden = self.drop(self.bn1(hidden))
        hidden = F.relu(self.linear2(hidden))
        hidden = self.drop(self.bn2(hidden))
        return self.linear3(hidden)
    
# Build the classifier with 2048- and 1024-unit hidden layers and place it on
# the selected device (Module.to returns the module itself).
net = Net(
    num_inputs=num_inputs,
    hidden_size1=2048,
    hidden_size2=1024,
    num_outputs=num_classes,
).to(device)
print(net)

Net(
  (linear1): Linear(in_features=4101, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=1024, bias=True)
  (linear3): Linear(in_features=1024, out_features=874, bias=True)
  (bn1): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop): Dropout(p=0.5, inplace=False)
)


In [19]:
# Training objective, optimizer and epoch budget, all configured from `param`.
num_epochs = param.num_epochs
loss = nn.CrossEntropyLoss()
trainer = optim.Adam(
    net.parameters(),
    lr=param.lr,
    weight_decay=param.weight_decay,
)

In [22]:
# One iteration = one full pass over `train` with weight updates, followed by
# one gradient-free pass over `val`; a one-line summary is printed per epoch.
for ep in range(num_epochs):
    train_loss, train_iters = 0., 0.
    train_y_true, train_y_pred = [], []
    start = time.time()

    net.train()
    for X, y in train:
        # Labels arrive as (batch, 1); reshape(-1) always yields a 1-D vector,
        # whereas squeeze() would turn a batch of one sample into a scalar.
        X, y = X.float().to(device), y.reshape(-1).to(device)
        trainer.zero_grad()
        y_logits = net(X)
        l = loss(y_logits, y)
        l.backward()
        trainer.step()
        # Softmax is monotonic, so argmax over the logits gives the same class.
        y_pred = y_logits.argmax(dim=1)
        # Store plain Python ints: one device->host transfer per batch instead
        # of a list holding one 0-d device tensor per sample.
        train_y_true.extend(y.tolist())
        train_y_pred.extend(y_pred.tolist())
        train_loss += l.item()
        train_iters += 1

    val_loss, val_iters = 0., 0.
    val_y_true, val_y_pred = [], []

    net.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for X, y in val:
            X, y = X.to(device), y.reshape(-1).to(device)
            y_logits = net(X)
            y_pred = y_logits.argmax(dim=1)
            l = loss(y_logits, y)
            val_y_true.extend(y.tolist())
            val_y_pred.extend(y_pred.tolist())
            val_loss += l.item()
            val_iters += 1

    # Stop the clock before the metrics are computed, so the reported time
    # covers only the two passes over the data.
    elapsed = time.time() - start
    train_f1 = f1_score(train_y_true, train_y_pred, average='weighted')
    val_f1 = f1_score(val_y_true, val_y_pred, average='weighted')

    message = (f"ep: {ep}, taked: {elapsed:.3f},"
               f" train_loss: {train_loss / train_iters:.3f},"
               f" train_f1_score: {train_f1:.3f},"
               f" val_loss: {val_loss / val_iters:.3f},"
               f" val_f1_score: {val_f1:.3f}"
              )
    print(message)

ep: 0, taked: 13.524, train_loss: 0.247, train_f1_score: 0.948, val_loss: 0.641, val_f1_score: 0.845
ep: 1, taked: 13.492, train_loss: 0.244, train_f1_score: 0.948, val_loss: 0.634, val_f1_score: 0.845
ep: 2, taked: 13.603, train_loss: 0.237, train_f1_score: 0.951, val_loss: 0.627, val_f1_score: 0.845
ep: 3, taked: 13.689, train_loss: 0.232, train_f1_score: 0.952, val_loss: 0.621, val_f1_score: 0.846
ep: 4, taked: 13.421, train_loss: 0.225, train_f1_score: 0.954, val_loss: 0.616, val_f1_score: 0.847
ep: 5, taked: 13.442, train_loss: 0.221, train_f1_score: 0.954, val_loss: 0.612, val_f1_score: 0.846
ep: 6, taked: 13.669, train_loss: 0.218, train_f1_score: 0.954, val_loss: 0.608, val_f1_score: 0.848
ep: 7, taked: 13.574, train_loss: 0.210, train_f1_score: 0.957, val_loss: 0.605, val_f1_score: 0.848
ep: 8, taked: 13.451, train_loss: 0.208, train_f1_score: 0.958, val_loss: 0.601, val_f1_score: 0.848
ep: 9, taked: 13.288, train_loss: 0.203, train_f1_score: 0.959, val_loss: 0.598, val_f1_sco

KeyboardInterrupt: ignored

In [23]:
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
# Persist only the learned parameters (state_dict) rather than the whole module;
# the Inference section rebuilds Net and loads these weights back.
torch.save(net.state_dict(), PATH)

In [32]:
# Training is finished: release the datasets and loaders that hold the tensors.
del dataset_train, dataset_val, train, val

# Inference

In [25]:
# Restore the label encoder from the saved classes array instead of refitting it;
# inverse_transform is used later to turn predicted class indices into category ids.
le = LabelEncoder()
le.classes_ = np.load(PATH_CLASSES)

In [None]:
# Rebuild the architecture from the fixed dimensions in `param` and restore the
# saved weights. map_location lets a checkpoint written on a GPU machine load on
# a CPU-only one, and the model is moved to `device` because the inference loop
# sends its inputs there.
net = Net(num_inputs=param.num_inputs,
          hidden_size1=2048,
          hidden_size2=1024,
          num_outputs=param.num_classes)
net.load_state_dict(torch.load(PATH, map_location=device))
net.to(device)
net.eval()

In [33]:
# Read the base test frame and the four auxiliary embedding frames.
df_test, df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        PATH_BASE_TEST, PATH_DF1_TEST, PATH_DF2_TEST, PATH_DF3_TEST, PATH_DF4_TEST,
    )
)

# Non-embedding columns stripped from the auxiliary test frames before joining.
drop_columns_test = [
    'product_id', 'sale',
    'shop_id', 'shop_title', 'rating', 'text_fields',
    'title', 'description', 'attributes',
    'custom_characteristics', 'defined_characteristics',
    'filters', 'bool_attributes', 'bool_custom_characteristics',
    'bool_defined_characteristics', 'bool_filters',
    'bool_keys_defined_characteristics', 'keys_defined_characteristics',
    'clothing_size', 'color',
]

# Drop one frame at a time so only one extra copy is alive at once.
df1 = df1.drop(columns=drop_columns_test)
df2 = df2.drop(columns=drop_columns_test)
df3 = df3.drop(columns=drop_columns_test)
df4 = df4.drop(columns=drop_columns_test)

# Align on a fresh 0..n-1 index and join column-wise, base frame first.
df_test = (
    df_test.reset_index(drop=True)
    .join(df1.reset_index(drop=True))
    .join(df2.reset_index(drop=True))
    .join(df3.reset_index(drop=True))
    .join(df4.reset_index(drop=True))
)
df_test.shape

(16860, 4116)

In [34]:
# Columns of the joined test frame that are not model inputs.
drop_column_base_df_test = [
    'product_id', 'sale',
    'shop_id', 'shop_title', 'rating', 'text_fields',
    'title', 'description', 'attributes',
    'custom_characteristics', 'defined_characteristics',
    'filters', 'bool_defined_characteristics',
    'bool_keys_defined_characteristics', 'keys_defined_characteristics',
]
# Feature matrix for inference: everything that is not listed above.
X_test = df_test.drop(columns=drop_column_base_df_test)

In [38]:
# Features-only dataset; order is preserved so predictions line up with rows.
dataset_test = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_test.values),
)
test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [39]:
# Predict a class index for every test row, then map the indices back to the
# original category ids with the restored label encoder.
test_y_pred: list = []

net.eval()
# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for X in test:
        # The dataset holds features only, so each batch is a 1-tuple.
        X = X[0].to(device)
        y_logits = net(X)
        # argmax over logits equals argmax over softmax; tolist() moves the
        # whole batch to host memory in one transfer.
        test_y_pred.extend(y_logits.argmax(dim=1).tolist())
predicted_category_id = le.inverse_transform(np.asarray(test_y_pred, dtype=np.int64))

In [40]:
# Two single-column frames on a fresh 0..n-1 index, placed side by side.
df1 = df_test[['product_id']].reset_index(drop=True)
df2 = pd.DataFrame({'predicted_category_id': predicted_category_id})
result = pd.concat((df1, df2), axis=1)
result.shape

(16860, 2)

In [41]:
# Write the submission and read the very same file back as a sanity check.
# A single path is used for both steps, so the check does not depend on the
# working directory being /content.
RESULT_PATH = 'result.parquet'
result.to_parquet(RESULT_PATH)
pd.read_parquet(RESULT_PATH, engine='pyarrow').head()

Unnamed: 0,product_id,predicted_category_id
0,1997646,13495
1,927375,14922
2,1921513,12980
3,1668662,12044
4,1467778,12524
