In [44]:
import time
from math import sqrt

import pandas as pd
import torch
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from torch.nn import MSELoss, L1Loss
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset

from utils.new_preproc import *
from utils.GLOBALS import *
from DoubleLSTM_utils.DoubleLSTMDataset import *
from DoubleLSTM_utils.DoubleLSTMSiameseLSTM import *
from word_utils.word_utils import pad_collate_fn

In [50]:
# Load the product descriptions and replace the raw text column with the
# output of preprocess_text (called with drop_stopwords=False).
descriptions = pd.read_csv('csv/product_descriptions.csv').assign(
    product_description=lambda frame: frame['product_description'].apply(
        lambda text: preprocess_text(text, drop_stopwords=False)
    )
)

In [226]:
def _prepare_pairs(frame):
    """Reduce a merged frame to the three columns the model consumes.

    Works on an explicit copy so the column assignments below never write
    into a slice of the caller's frame (avoids SettingWithCopyWarning).

    :param frame: DataFrame holding at least 'search_term',
        'product_description' and 'relevance'.
    :return: new DataFrame with exactly those three columns, where
        'search_term' has been passed through preprocess_text
        (drop_stopwords=False) and 'relevance' through min_max_scaling.
    """
    pairs = frame[['search_term', 'product_description', 'relevance']].copy()
    pairs['search_term'] = pairs['search_term'].apply(
        lambda x: preprocess_text(x, drop_stopwords=False))
    pairs['relevance'] = pairs['relevance'].apply(min_max_scaling)
    return pairs


# read and preprocess train
train = pd.read_csv('csv/train.csv', encoding='ISO-8859-1')
train = _prepare_pairs(pd.merge(train, descriptions, on='product_uid'))

# read and preprocess test
test = pd.read_csv('csv/test.csv', encoding='ISO-8859-1')
test = pd.merge(test, descriptions, on='product_uid')
test_sol = pd.read_csv('csv/solution.csv')
test = pd.merge(test, test_sol, on='id')
# Rows with relevance == -1 are discarded *before* preprocessing so no work is
# spent on them; drop=True keeps the old row labels from becoming an 'index' column.
test = test[test['relevance'] != -1].reset_index(drop=True)
test = _prepare_pairs(test)

In [234]:
# Build one corpus per text field (train rows first, then test rows) and fit a
# separate Word2Vec model on each.
# NOTE(review): the test split's text is part of both corpora -- presumably so
# every token met at evaluation time has a vector; confirm this is intended.
search_tokens = [*train['search_term'], *test['search_term']]
desc_tokens = [*train['product_description'], *test['product_description']]

# Search terms: smaller vectors and a narrow context window.
word2vec_model_search = Word2Vec(
    sentences=search_tokens,
    vector_size=64,
    window=3,
    min_count=1,
    workers=4,
)
# Descriptions: larger vectors and a wider context window.
word2vec_model_desc = Word2Vec(
    sentences=desc_tokens,
    vector_size=256,
    window=10,
    min_count=1,
    workers=4,
)

In [240]:
# Wrap each split in a DoubleLSTMDataset; both splits share the same pair of
# Word2Vec models. The train dataset is constructed first, then the test one.
train_dataset, test_dataset = (
    DoubleLSTMDataset(
        split['search_term'],
        split['product_description'],
        split['relevance'],
        word2vec_model_search,
        word2vec_model_desc,
    )
    for split in (train, test)
)

In [None]:
# train parameters
embedding_dim_search = word2vec_model_search.vector_size
embedding_dim_desc = word2vec_model_desc.vector_size
hidden_dim = 128
learning_rate = 0.001
batch_size = 64
num_epochs = 100
random_seed = 42

# stratify validation set: relevance is continuous, so it is binned into
# (up to) three quantile buckets that train_test_split can stratify on
binned_labels = pd.qcut(train['relevance'], q=3, labels=False, duplicates='drop')
train_indices, val_indices = train_test_split(range(len(train_dataset)), test_size=0.2, stratify=binned_labels,
                                              random_state=random_seed)

# split train to train-val
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(train_dataset, val_indices)
train_data_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)
val_data_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)

# Seed torch's global RNG so weight initialisation and batch shuffling are
# repeatable across kernel restarts (GPU kernels may still be non-deterministic).
torch.manual_seed(random_seed)

# model, metrics, optimizer
model = DoubleLSTMSiameseLSTM(embedding_dim_search, embedding_dim_desc, hidden_dim).to(device)
mse_loss = MSELoss()
mae_loss = L1Loss()
optimizer = Adam(model.parameters(), lr=learning_rate)


def _run_epoch(network, data_loader, optimizer=None):
    """Run one full pass over ``data_loader`` and return ``(rmse, mae)``.

    When ``optimizer`` is given the network is put in train mode and updated
    after every batch using the MSE on the scaled labels. Without an optimizer
    the network is put in eval mode and gradients are disabled.

    Metrics are computed after ``inverse_min_max_scaling`` has been applied to
    both predictions and labels, and are averaged over samples (each batch mean
    is weighted by its batch size so a short final batch is not over-counted).

    :param network: the model to run; called as ``network(search, description)``.
    :param data_loader: iterable yielding ``(search_term, product_description, relevance)``.
    :param optimizer: optimizer to step after each batch, or ``None`` to only evaluate.
    :return: tuple ``(rmse, mae)`` as Python floats.
    """
    is_training = optimizer is not None
    network.train(is_training)

    total_mse = 0.0
    total_mae = 0.0
    total_samples = 0
    with torch.set_grad_enabled(is_training):
        for search_term, product_description, relevance in data_loader:
            search_term = search_term.to(device)
            product_description = product_description.to(device)
            relevance = relevance.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = network(search_term, product_description).squeeze(1)
            if is_training:
                # backpropagation on the scaled labels
                mse_loss(outputs, relevance).backward()
                optimizer.step()

            # Metrics only: no autograd graph is needed, and each tensor is
            # un-scaled once and reused for both MSE and MAE.
            with torch.no_grad():
                predicted = inverse_min_max_scaling(outputs)
                target = inverse_min_max_scaling(relevance)
                n_batch = len(relevance)
                total_mse += mse_loss(predicted, target).item() * n_batch
                total_mae += mae_loss(predicted, target).item() * n_batch
                total_samples += n_batch

    return sqrt(total_mse / total_samples), total_mae / total_samples


# used for saving best model
best_val_rmse = float('inf')

start_time = time.time()
for epoch in range(num_epochs):
    epoch_start_time = time.time()

    train_rmse, train_mae = _run_epoch(model, train_data_loader, optimizer)
    val_rmse, val_mae = _run_epoch(model, val_data_loader)

    epoch_time = time.time() - epoch_start_time
    print(
        f"Epoch {epoch + 1} ({epoch_time:.1f}s), Train RMSE: {train_rmse}, Val RMSE: {val_rmse}, Train MAE: {train_mae}, Val MAE: {val_mae}")

    # save best model (checkpoint is overwritten whenever validation RMSE improves)
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model_character.pth')
        print(f"New best model saved at epoch {epoch + 1} with Val RMSE: {best_val_rmse:.4f}")

training_time = time.time() - start_time
print(f"Training time: {training_time:.4f}s")

Using cuda device.
Epoch 1 (36.7s), Train RMSE: 0.5994360307510387, Val RMSE: 0.5338863029025355, Train MAE: 0.4939505035209529, Val MAE: 0.44080752995359573
New best model saved at epoch 1 with Val RMSE: 0.5339
Epoch 2 (36.6s), Train RMSE: 0.5196622151710452, Val RMSE: 0.5183462907188503, Train MAE: 0.42853733127015314, Val MAE: 0.42474584185483244
New best model saved at epoch 2 with Val RMSE: 0.5183
Epoch 3 (36.4s), Train RMSE: 0.5106002663663126, Val RMSE: 0.5134962952144745, Train MAE: 0.4190081533828514, Val MAE: 0.41958791725674477
New best model saved at epoch 3 with Val RMSE: 0.5135
Epoch 4 (36.7s), Train RMSE: 0.5049939071364626, Val RMSE: 0.5098959605954732, Train MAE: 0.41406650427168273, Val MAE: 0.41112735876754436
New best model saved at epoch 4 with Val RMSE: 0.5099
Epoch 5 (36.7s), Train RMSE: 0.4966879636459949, Val RMSE: 0.5130621217563953, Train MAE: 0.40528281542218053, Val MAE: 0.4109092529838475
Epoch 6 (36.7s), Train RMSE: 0.48648589861797176, Val RMSE: 0.503339

In [232]:
# load best model: rebuild the architecture with the same dimensions used for
# training, then restore the checkpoint written by the training loop.
model = DoubleLSTMSiameseLSTM(embedding_dim_search, embedding_dim_desc, hidden_dim).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_model_character.pth', map_location=device))

<All keys matched successfully>

In [233]:
# Evaluate the loaded model on the held-out test split (no shuffling, no gradients).
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)
model.eval()

# Running sums of per-batch mean errors weighted by batch size, plus the sample count.
total_mse_test_loss = total_mae_test_loss = 0
total_test_samples = 0

test_start_time = time.time()
with torch.no_grad():
    # testing loop
    for batch in test_data_loader:
        # move the three batch tensors to the model's device, in order
        search_term, product_description, relevance = (tensor.to(device) for tensor in batch)
        outputs = model(search_term, product_description).squeeze(1)
        n_batch = len(relevance)

        # metrics are taken after inverse_min_max_scaling on both sides
        batch_mse = mse_loss(inverse_min_max_scaling(outputs), inverse_min_max_scaling(relevance))
        batch_mae = mae_loss(inverse_min_max_scaling(outputs), inverse_min_max_scaling(relevance))

        total_mse_test_loss += n_batch * batch_mse.item()
        total_mae_test_loss += n_batch * batch_mae.item()
        total_test_samples += n_batch

test_time = time.time() - test_start_time

# per-sample averages over the whole split
test_rmse = sqrt(total_mse_test_loss / total_test_samples)
test_mae = total_mae_test_loss / total_test_samples
print(f'Test time: {test_time:.1f}s, Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Test time: 43.0s, Test RMSE: 0.5294878105664198, Test MAE: 0.42492599654661695
