In [1]:
import time
from math import sqrt

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.nn import MSELoss, L1Loss
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset

from char_utils.CharDataset import CharDataset
from char_utils.CharSiameseLSTM import CharSiameseLSTM
from char_utils.char_utils import *
from utils.old_preproc import *
from utils.GLOBALS import *

In [None]:
# Load the product descriptions and clean the text column in three passes:
# strip HTML tags, drop stopwords, then normalise spaces (same order as before).
descriptions = pd.read_csv('csv/product_descriptions.csv')
descriptions['product_description'] = (
    descriptions['product_description']
    .apply(remove_html_tags_old)
    .apply(remove_stopwords)
    .apply(remove_spaces)
)

In [5]:
# read and preprocess train
# Join each training query with its product description, keep only the model
# inputs plus the label, and rescale the label with label_min_max_scaling.
train = pd.read_csv('csv/train.csv', encoding='ISO-8859-1')
train = pd.merge(train, descriptions, on='product_uid')
train = train[['search_term', 'product_description', 'relevance']]
train['relevance'] = train['relevance'].apply(label_min_max_scaling)

# read and preprocess test
# The test labels live in a separate solution file and are attached by `id`.
test = pd.read_csv('csv/test.csv', encoding='ISO-8859-1')
test = pd.merge(test, descriptions, on='product_uid')
test_sol = pd.read_csv('csv/solution.csv')
test = pd.merge(test, test_sol, on='id')
test = test[['search_term', 'product_description', 'relevance']]
# Rows with relevance == -1 are excluded before scaling.
# drop=True discards the old row labels instead of materialising them as a
# stray `index` column next to the three selected columns.
test = test[test['relevance'] != -1].reset_index(drop=True)
test['relevance'] = test['relevance'].apply(label_min_max_scaling)

In [7]:
# Turn both text columns of train and test into character sequences
# (train first, then test; search_term before product_description).
for frame in (train, test):
    for column in ('search_term', 'product_description'):
        frame[column] = frame[column].apply(to_character_sequences)

In [8]:
# Longest sequence in each training column; these are later handed to the
# dataset as padding lengths. Falls back to 0 when a column has no rows.
max_len_search = max((len(seq) for seq in train['search_term']), default=0)
max_len_desc = max((len(seq) for seq in train['product_description']), default=0)

In [ ]:
# Build the character -> integer mapping from the training columns only, then
# wrap train and test in CharDataset using that same mapping and padding lengths.
char_to_int = create_char_to_int_mapping(train['search_term'], train['product_description'])

# Arguments shared by both datasets: mapping plus the two padding lengths.
shared_dataset_args = (char_to_int, max_len_search, max_len_desc)

train_dataset = CharDataset(
    train['search_term'],
    train['product_description'],
    train['relevance'],
    *shared_dataset_args,
)
test_dataset = CharDataset(
    test['search_term'],
    test['product_description'],
    test['relevance'],
    *shared_dataset_args,
)

In [11]:
# Train the character-level siamese LSTM.
#
# The cell sets the hyper-parameters, splits the training data 80/20 into
# train/validation subsets, and runs `num_epochs` epochs. The optimisation
# target is MSE on the scaled labels; the reported RMSE/MAE are computed after
# inverse_label_min_max_scaling is applied to predictions and labels. The
# weights with the lowest validation RMSE are written to
# 'best_model_character.pth'.

# train parameters
vocab_size = len(char_to_int) + 1  # adding 1 for padding index: 0
embedding_dim = 64
hidden_dim = 128
learning_rate = 0.001
batch_size = 64
num_epochs = 50
seed = 42  # shared by the train/val split and the torch RNG
print(f"Using {device} device.")

# Seed torch so weight initialisation and batch shuffling start from a fixed
# RNG state. (Some CUDA kernels are still non-deterministic, so runs on GPU
# may not match bit-for-bit.)
torch.manual_seed(seed)

# split train to train-val
train_indices, val_indices = train_test_split(range(len(train)), test_size=0.2, random_state=seed)
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(train_dataset, val_indices)
train_data_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

# model, metrics, optimizer
model = CharSiameseLSTM(vocab_size, embedding_dim, hidden_dim).to(device)
mse_loss = MSELoss()
mae_loss = L1Loss()
optimizer = Adam(model.parameters(), lr=learning_rate)

# used for saving best model
best_val_rmse = float('inf')

start_time = time.time()
for epoch in range(num_epochs):
    epoch_start_time = time.time()

    model.train()
    total_mse_train_loss = 0
    total_mae_train_loss = 0
    total_train_samples = 0
    # train loop
    for search_term, product_description, relevance in train_data_loader:
        search_term, product_description, relevance = (search_term.to(device),
                                                       product_description.to(device),
                                                       relevance.to(device))
        # backpropagation on the scaled labels
        optimizer.zero_grad()
        outputs = model(search_term, product_description).squeeze(1)
        train_loss = mse_loss(outputs, relevance)
        train_loss.backward()
        optimizer.step()

        # Reporting metrics on the inverse-scaled values. They are not
        # back-propagated, so compute them without autograd to avoid building
        # a second graph on top of `outputs`.
        with torch.no_grad():
            predicted = inverse_label_min_max_scaling(outputs)
            actual = inverse_label_min_max_scaling(relevance)
            batch_mse = mse_loss(predicted, actual).item()
            batch_mae = mae_loss(predicted, actual).item()

        # The losses are batch means; weight by batch size so the epoch
        # average is correct even when the last batch is smaller.
        total_mse_train_loss += batch_mse * len(relevance)
        total_mae_train_loss += batch_mae * len(relevance)
        total_train_samples += len(relevance)

    train_rmse = sqrt(total_mse_train_loss / total_train_samples)
    train_mae = total_mae_train_loss / total_train_samples

    # validation loop
    model.eval()
    total_mse_val_loss = 0
    total_mae_val_loss = 0
    total_val_samples = 0
    with torch.no_grad():
        for search_term, product_description, relevance in val_data_loader:
            search_term, product_description, relevance = (search_term.to(device),
                                                           product_description.to(device),
                                                           relevance.to(device))

            outputs = model(search_term, product_description).squeeze(1)
            predicted = inverse_label_min_max_scaling(outputs)
            actual = inverse_label_min_max_scaling(relevance)
            val_loss_mse = mse_loss(predicted, actual)
            val_loss_mae = mae_loss(predicted, actual)

            total_mse_val_loss += val_loss_mse.item() * len(relevance)
            total_mae_val_loss += val_loss_mae.item() * len(relevance)
            total_val_samples += len(relevance)

    val_rmse = sqrt(total_mse_val_loss / total_val_samples)
    val_mae = total_mae_val_loss / total_val_samples

    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch + 1} ({epoch_time:.1f}s), Train RMSE: {train_rmse}, Val RMSE: {val_rmse}, Train MAE: {train_mae}, Val MAE: {val_mae}")

    # save best model
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model_character.pth')
        print(f"New best model saved at epoch {epoch + 1} with Val RMSE: {best_val_rmse:.4f}")

training_time = time.time() - start_time
print(f"Training time: {training_time:.4f}s")

Using cuda device.
Epoch 1 (67.5s), Train RMSE: 0.6365880877361887, Val RMSE: 0.6244732538452197, Train MAE: 0.5261875341821296, Val MAE: 0.5304179991468077
New best model saved at epoch 1 with Val RMSE: 0.6245
Epoch 2 (69.0s), Train RMSE: 0.532951635868758, Val RMSE: 0.5265944750298877, Train MAE: 0.4379062405822066, Val MAE: 0.43342422456478674
New best model saved at epoch 2 with Val RMSE: 0.5266
Epoch 3 (68.6s), Train RMSE: 0.5279955747337498, Val RMSE: 0.5239115365394476, Train MAE: 0.4329959358363937, Val MAE: 0.42881884012158555
New best model saved at epoch 3 with Val RMSE: 0.5239
Epoch 4 (68.4s), Train RMSE: 0.5255886189610628, Val RMSE: 0.5220285890981735, Train MAE: 0.431265924517407, Val MAE: 0.4306275996180767
New best model saved at epoch 4 with Val RMSE: 0.5220
Epoch 5 (67.8s), Train RMSE: 0.5232274291398465, Val RMSE: 0.5186105105750187, Train MAE: 0.4298817529667561, Val MAE: 0.4261180908513871
New best model saved at epoch 5 with Val RMSE: 0.5186
Epoch 6 (68.2s), Trai

In [12]:
# load best model
# Rebuild the architecture with the same hyper-parameters, then restore the
# weights from the checkpoint file 'best_model_character.pth'.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU still loads on a CPU-only machine.
model = CharSiameseLSTM(vocab_size, embedding_dim, hidden_dim).to(device)
model.load_state_dict(torch.load('best_model_character.pth', map_location=device))

<All keys matched successfully>

In [13]:
# Score the current model on the test set. RMSE and MAE are computed on the
# inverse-scaled predictions and labels; batch means are weighted by batch
# size so the final averages are per-sample.
test_data_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
model.eval()
total_mse_test_loss, total_mae_test_loss, total_test_samples = 0, 0, 0
test_start_time = time.time()
with torch.no_grad():
    # testing loop
    for batch in test_data_loader:
        search_term, product_description, relevance = (tensor.to(device) for tensor in batch)
        outputs = model(search_term, product_description).squeeze(1)

        predicted = inverse_label_min_max_scaling(outputs)
        actual = inverse_label_min_max_scaling(relevance)
        samples_in_batch = len(relevance)

        total_mse_test_loss += mse_loss(predicted, actual).item() * samples_in_batch
        total_mae_test_loss += mae_loss(predicted, actual).item() * samples_in_batch
        total_test_samples += samples_in_batch

test_time = time.time() - test_start_time
test_rmse = sqrt(total_mse_test_loss / total_test_samples)
test_mae = total_mae_test_loss / total_test_samples
print(f'Test time: {test_time:.1f}s, Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Test time: 60.2s, Test RMSE: 0.5367674077954866, Test MAE: 0.4274112538822403
