In [86]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn import MSELoss, L1Loss
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset
from math import sqrt
from transformers import BartTokenizer, BartModel

from bart_utils.BartDataset import BartDataset
from bart_utils.BartSiamese import BartSiamese
from bart_utils.bart_utils import *
from utils.ClassicalML import *
from utils.GLOBALS import *
from utils.new_preproc import *

In [None]:
# Load the pretrained BART tokenizer and model once; both are reused by the
# embedding step further down. The model is moved to `device` and switched to
# eval mode because it is only used for inference in this notebook.
BART_CHECKPOINT = 'facebook/bart-base'

bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartModel.from_pretrained(BART_CHECKPOINT)
bart_model = bart_model.to(device)
bart_model.eval()

In [72]:
# Load the product descriptions and replace every description text with the
# value returned by get_bart_embeddings for that text.
def _embed_description(text):
    """Run a single description through get_bart_embeddings with the shared tokenizer/model."""
    return get_bart_embeddings(text, bart_tokenizer, bart_model)


descriptions = pd.read_csv('product_descriptions.csv')
descriptions['product_description'] = descriptions['product_description'].apply(_embed_description)

In [74]:
# Build the train and test frames: join each split with the (already embedded)
# product descriptions, embed the search terms, and min-max scale the relevance.
def _embed_query(text):
    """Embed one search term with the same tokenizer/model used for the product descriptions."""
    return get_bart_embeddings(text, bart_tokenizer, bart_model)


# read and preprocess train
train = pd.read_csv('train.csv', encoding='ISO-8859-1')
train = pd.merge(train, descriptions, on='product_uid')
# .copy() so the column assignments below write to an independent frame rather
# than to a slice of the merge result.
train = train[['search_term', 'product_description', 'relevance']].copy()
train['search_term'] = train['search_term'].apply(_embed_query)
train['relevance'] = train['relevance'].apply(min_max_scaling)

# read and preprocess test
test = pd.read_csv('test.csv', encoding='ISO-8859-1')
test = pd.merge(test, descriptions, on='product_uid')
test_sol = pd.read_csv('solution.csv')
test = pd.merge(test, test_sol, on='id')
# Drop rows whose relevance is the -1 placeholder *before* embedding, so the
# embedding model is not run on rows that are thrown away anyway.
test = test.loc[test['relevance'] != -1, ['search_term', 'product_description', 'relevance']]
# drop=True: keep a clean 0..n-1 index without adding a leftover 'index' column.
test = test.reset_index(drop=True)
test['search_term'] = test['search_term'].apply(_embed_query)
test['relevance'] = test['relevance'].apply(min_max_scaling)

In [76]:
def _frame_to_dataset(frame):
    """Wrap the search-term, description and relevance columns of `frame` in a BartDataset."""
    return BartDataset(frame['search_term'], frame['product_description'], frame['relevance'])


train_dataset = _frame_to_dataset(train)
test_dataset = _frame_to_dataset(test)

In [164]:
# Train the Siamese regressor on (search term, product description) pairs,
# report RMSE/MAE on the original relevance scale after every epoch, and keep
# the weights of the epoch with the lowest validation RMSE on disk.

# train parameters
embedding_dim = 768  # first BartSiamese argument; presumably the BART embedding width - confirm
hidden_dim1 = 512
hidden_dim2 = 256
learning_rate = 0.0001
batch_size = 64
num_epochs = 100
seed = 42

# Seed torch so weight initialisation and DataLoader shuffling are repeatable;
# without this only the train/val split below was deterministic.
torch.manual_seed(seed)

# stratify validation set
# Relevance is continuous, so it is binned into (up to) 3 quantile buckets that
# serve as the stratification labels.
binned_labels = pd.qcut(train['relevance'], q=3, labels=False, duplicates='drop')
train_indices, val_indices = train_test_split(range(len(train_dataset)), test_size=0.2, stratify=binned_labels,
                                              random_state=seed)

# split train to train-val
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(train_dataset, val_indices)
train_data_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

# model, metrics, optimizer
model = BartSiamese(embedding_dim, hidden_dim1, hidden_dim2).to(device)
mse_loss = MSELoss()
mae_loss = L1Loss()
optimizer = Adam(model.parameters(), lr=learning_rate)


def _batch_errors(predictions, targets):
    """Return (MSE, MAE) of one batch as Python floats, measured after undoing the min-max scaling.

    The predictions are detached and everything runs under no_grad, so computing
    these reporting metrics never builds an autograd graph.
    """
    with torch.no_grad():
        predictions = inverse_min_max_scaling(predictions.detach())
        targets = inverse_min_max_scaling(targets)
        return mse_loss(predictions, targets).item(), mae_loss(predictions, targets).item()


# used for saving best model
best_val_rmse = float('inf')

start_time = time.time()
for epoch in range(num_epochs):
    epoch_start_time = time.time()

    model.train()
    total_mse_train_loss = 0
    total_mae_train_loss = 0
    total_train_samples = 0
    # train loop
    for search_term, product_description, relevance in train_data_loader:
        search_term = search_term.to(device)
        product_description = product_description.to(device)
        relevance = relevance.to(device)

        # backpropagation (optimises MSE on the scaled labels)
        optimizer.zero_grad()
        outputs = model(search_term, product_description).squeeze(1)
        loss_mse = mse_loss(outputs, relevance)
        loss_mse.backward()
        optimizer.step()

        # inverse labels for correct calculation of metrics
        batch_mse, batch_mae = _batch_errors(outputs, relevance)

        # Batch losses are means, so weight them by batch size; the last batch
        # may be smaller than batch_size.
        total_mse_train_loss += batch_mse * len(relevance)
        total_mae_train_loss += batch_mae * len(relevance)
        total_train_samples += len(relevance)

    train_rmse = sqrt(total_mse_train_loss / total_train_samples)
    train_mae = total_mae_train_loss / total_train_samples

    # validation loop
    model.eval()
    total_mse_val_loss = 0
    total_mae_val_loss = 0
    total_val_samples = 0
    with torch.no_grad():
        for search_term, product_description, relevance in val_data_loader:
            search_term = search_term.to(device)
            product_description = product_description.to(device)
            relevance = relevance.to(device)
            outputs = model(search_term, product_description).squeeze(1)

            batch_mse, batch_mae = _batch_errors(outputs, relevance)

            total_mse_val_loss += batch_mse * len(relevance)
            total_mae_val_loss += batch_mae * len(relevance)
            total_val_samples += len(relevance)

    val_rmse = sqrt(total_mse_val_loss / total_val_samples)
    val_mae = total_mae_val_loss / total_val_samples

    epoch_time = time.time() - epoch_start_time
    print(
        f"Epoch {epoch + 1} ({epoch_time:.1f}s), Train RMSE: {train_rmse}, Val RMSE: {val_rmse}, Train MAE: {train_mae}, Val MAE: {val_mae}")

    # save best model
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model_character.pth')
        print(f"New best model saved at epoch {epoch + 1} with Val RMSE: {best_val_rmse:.4f}")

training_time = time.time() - start_time
print(f"Training time: {training_time:.4f}s")

Using cuda device.
Epoch 1 (3.6s), Train RMSE: 0.6505565709724405, Val RMSE: 0.6126509045425054, Train MAE: 0.5399867419123966, Val MAE: 0.5165034627341477
New best model saved at epoch 1 with Val RMSE: 0.6127
Epoch 2 (3.5s), Train RMSE: 0.5999916382645302, Val RMSE: 0.5791824644408086, Train MAE: 0.4984012950577861, Val MAE: 0.48679927613903545
New best model saved at epoch 2 with Val RMSE: 0.5792
Epoch 3 (3.1s), Train RMSE: 0.5627257009835366, Val RMSE: 0.5602225671193396, Train MAE: 0.46446619923783966, Val MAE: 0.46717770219219346
New best model saved at epoch 3 with Val RMSE: 0.5602
Epoch 4 (2.9s), Train RMSE: 0.5297977963637842, Val RMSE: 0.5488809092297694, Train MAE: 0.4353335676792895, Val MAE: 0.4572575960192134
New best model saved at epoch 4 with Val RMSE: 0.5489
Epoch 5 (3.5s), Train RMSE: 0.49768739760645797, Val RMSE: 0.5544900725447603, Train MAE: 0.4091832199561502, Val MAE: 0.4601547104298095
Epoch 6 (3.3s), Train RMSE: 0.467045226715859, Val RMSE: 0.5024685629907921,

KeyboardInterrupt: 

In [165]:
# load best model
# Rebuild the architecture with the training hyper-parameters, then restore the
# checkpoint written by the training loop.
model = BartSiamese(embedding_dim, hidden_dim1, hidden_dim2).to(device)
# map_location keeps the load working when the checkpoint was saved on a GPU
# and this cell is run on a machine without one.
# NOTE: torch.load unpickles the file - only load checkpoints you produced yourself.
best_state_dict = torch.load('best_model_character.pth', map_location=device)
model.load_state_dict(best_state_dict)

<All keys matched successfully>

In [167]:
# Evaluate the restored model on the held-out test set and report RMSE / MAE on
# the original (un-scaled) relevance scale together with the wall-clock time.
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()

total_mse_test_loss, total_mae_test_loss, total_test_samples = 0, 0, 0
test_start_time = time.time()
with torch.no_grad():
    # testing loop
    for batch in test_data_loader:
        search_term, product_description, relevance = batch
        search_term = search_term.to(device)
        product_description = product_description.to(device)
        relevance = relevance.to(device)

        outputs = model(search_term, product_description).squeeze(1)
        n_in_batch = len(relevance)

        # Batch losses are means, so scale by the batch size before summing.
        test_loss_mse = mse_loss(inverse_min_max_scaling(outputs), inverse_min_max_scaling(relevance))
        total_mse_test_loss += test_loss_mse.item() * n_in_batch

        test_loss_mae = mae_loss(inverse_min_max_scaling(outputs), inverse_min_max_scaling(relevance))
        total_mae_test_loss += test_loss_mae.item() * n_in_batch

        total_test_samples += n_in_batch

test_time = time.time() - test_start_time
test_mae = total_mae_test_loss / total_test_samples
test_rmse = sqrt(total_mse_test_loss / total_test_samples)
print(f'Test time: {test_time:.1f}s, Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Test time: 3.2s, Test RMSE: 0.5198253001883457, Test MAE: 0.4223737092737296


In [ ]:
# get train/test data for classical ML algorithms
# The loader covers the full training set (train + validation rows). It is only
# used to extract features, so shuffling is turned off to give X_train / y_train
# the same row order on every run.
all_train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
X_train, y_train, X_test, y_test = get_classical_ml_train_test_data(model, all_train_data_loader, test_data_loader)

In [169]:
# Random-forest baseline on the extracted features (helper comes from the star imports).
# The cell output shows two unlabelled MAE/RMSE pairs - presumably train first, then test; confirm in the helper.
train_rf(X_train, y_train, X_test, y_test, n_estimators=10)

Mean Absolute Error (MAE): 0.15100494442402046
Root Mean Squared Error (RMSE): 0.20607646491599785
Mean Absolute Error (MAE): 0.43745403175373826
Root Mean Squared Error (RMSE): 0.540144015528063


In [170]:
# Gradient-boosting baseline on the same features (name suggests a gradient boosting regressor - confirm in the helper).
# The cell output reports train and test MAE/RMSE.
train_gbr(X_train, y_train, X_test, y_test, n_estimators=10)

Train Mean Absolute Error (MAE): 0.4202823489661842
Train Root Mean Squared Error (RMSE): 0.5111906005392803
Test Mean Absolute Error (MAE): 0.4343958323631227
Test Root Mean Squared Error (RMSE): 0.529577572795704


In [171]:
# Linear-regression baseline on the same features; the cell output reports train and test MAE/RMSE.
train_linear_regression(X_train, y_train, X_test, y_test)

Train Mean Absolute Error (MAE): 0.38856953382492065
Train Root Mean Squared Error (RMSE): 0.4797027111053467
Test Mean Absolute Error (MAE): 0.4337574243545532
Test Root Mean Squared Error (RMSE): 0.5349620580673218


In [214]:
# XGBoost baseline on the same features; the cell output reports train and test MAE/RMSE.
# NOTE(review): learning_rate=0.09 / max_depth=10 look hand-tuned - record how they were chosen.
train_xgboost(X_train, y_train, X_test, y_test, n_estimators=10, learning_rate=0.09, max_depth=10)

Train Mean Absolute Error (MAE): 0.34970012307167053
Train Root Mean Squared Error (RMSE): 0.42272648215293884
Test Mean Absolute Error (MAE): 0.4265356957912445
Test Root Mean Squared Error (RMSE): 0.5222816467285156
