In [44]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn import MSELoss, L1Loss
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset
from math import sqrt
from gensim.models import Word2Vec

from utils.ClassicalML import *
from utils.new_preproc import *
from word_utils.WordDataset import WordDataset
from word_utils.WordSiameseLSTM import WordSiameseLSTM
from word_utils.word_utils import *
from utils.GLOBALS import *


In [50]:
# Load the product descriptions and run every description through the shared
# text preprocessing step, keeping stop words (drop_stopwords=False).
descriptions = pd.read_csv('csv/product_descriptions.csv').assign(
    product_description=lambda frame: frame['product_description'].apply(
        preprocess_text, drop_stopwords=False
    )
)

In [226]:
# read and preprocess train
train = pd.read_csv('csv/train.csv', encoding='ISO-8859-1')
train = pd.merge(train, descriptions, on='product_uid')
# .copy() turns the column subset into an independent frame, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
train = train[['search_term', 'product_description', 'relevance']].copy()
train['search_term'] = train['search_term'].apply(preprocess_text, drop_stopwords=False)
train['relevance'] = train['relevance'].apply(min_max_scaling)

# read and preprocess test
test = pd.read_csv('csv/test.csv', encoding='ISO-8859-1')
test = pd.merge(test, descriptions, on='product_uid')
test_sol = pd.read_csv('csv/solution.csv')
test = pd.merge(test, test_sol, on='id')
# Rows whose relevance is -1 are discarded. Filtering happens *before* the
# (comparatively expensive) text preprocessing so excluded rows are never tokenised,
# and drop=True keeps the old row labels from being added as a stray 'index' column.
test = test.loc[
    test['relevance'] != -1,
    ['search_term', 'product_description', 'relevance'],
].reset_index(drop=True)
test['search_term'] = test['search_term'].apply(preprocess_text, drop_stopwords=False)
test['relevance'] = test['relevance'].apply(min_max_scaling)

In [246]:
# Build the Word2Vec corpus from every search term and product description in
# both splits (train first, then test), then fit and persist the embedding model.
corpus_columns = (
    train['search_term'],
    train['product_description'],
    test['search_term'],
    test['product_description'],
)
all_tokens = [entry for column in corpus_columns for entry in column]

word2vec_model = Word2Vec(
    sentences=all_tokens,
    vector_size=64,
    window=7,
    min_count=1,
    workers=4,
)
word2vec_model.save("word2vec.model")

# Normalising the word2vec vectors was tried but did not perform well, so it stays disabled:
# word2vec_model.wv.fill_norms()

In [247]:
# Wrap each split in a WordDataset built from its query, description and label
# columns, sharing the trained word2vec model between the two.
train_dataset, test_dataset = (
    WordDataset(
        split['search_term'],
        split['product_description'],
        split['relevance'],
        word2vec_model,
    )
    for split in (train, test)
)

In [257]:
# train parameters
embedding_dim = word2vec_model.vector_size
hidden_dim = 64
learning_rate = 0.001
batch_size = 64
num_epochs = 100
random_seed = 42

# Seed torch's global RNG before the model and the shuffling DataLoader are created,
# so weight initialisation and batch order are repeatable on the CPU side
# (the train/val split below already uses a fixed random_state).
torch.manual_seed(random_seed)

# stratify validation set: bin the scaled relevance into (up to) 3 quantile bins
binned_labels = pd.qcut(train['relevance'], q=3, labels=False, duplicates='drop')
train_indices, val_indices = train_test_split(range(len(train_dataset)), test_size=0.2, stratify=binned_labels,
                                              random_state=random_seed)

# split train to train-val
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(train_dataset, val_indices)
train_data_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)
val_data_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)

# model, metrics, optimizer
model = WordSiameseLSTM(embedding_dim, hidden_dim).to(device)
mse_loss = MSELoss()
mae_loss = L1Loss()
optimizer = Adam(model.parameters(), lr=learning_rate)

# used for saving best model
best_val_rmse = float('inf')

start_time = time.time()
for epoch in range(num_epochs):
    epoch_start_time = time.time()

    model.train()
    total_mse_train_loss = 0
    total_mae_train_loss = 0
    total_train_samples = 0
    # train loop
    for search_term, product_description, relevance in train_data_loader:
        search_term = search_term.to(device)
        product_description = product_description.to(device)
        relevance = relevance.to(device)

        # backpropagation on the scaled labels
        optimizer.zero_grad()
        outputs = model(search_term, product_description).squeeze(1)
        loss_mse = mse_loss(outputs, relevance)
        loss_mse.backward()
        optimizer.step()

        # Reporting metrics are computed on the original label scale. They are not
        # back-propagated, so detach the predictions and disable autograd instead of
        # building a second graph; the inverse scaling is done once and shared.
        with torch.no_grad():
            unscaled_outputs = inverse_min_max_scaling(outputs.detach())
            unscaled_relevance = inverse_min_max_scaling(relevance)
            batch_mse = mse_loss(unscaled_outputs, unscaled_relevance).item()
            batch_mae = mae_loss(unscaled_outputs, unscaled_relevance).item()

        # weight each batch mean by its size so the epoch figure is a per-sample mean
        total_mse_train_loss += batch_mse * len(relevance)
        total_mae_train_loss += batch_mae * len(relevance)
        total_train_samples += len(relevance)

    train_rmse = sqrt(total_mse_train_loss / total_train_samples)
    train_mae = total_mae_train_loss / total_train_samples

    # validation loop
    model.eval()
    total_mse_val_loss = 0
    total_mae_val_loss = 0
    total_val_samples = 0
    with torch.no_grad():
        for search_term, product_description, relevance in val_data_loader:
            search_term = search_term.to(device)
            product_description = product_description.to(device)
            relevance = relevance.to(device)
            outputs = model(search_term, product_description).squeeze(1)

            unscaled_outputs = inverse_min_max_scaling(outputs)
            unscaled_relevance = inverse_min_max_scaling(relevance)
            val_loss_mse = mse_loss(unscaled_outputs, unscaled_relevance)
            val_loss_mae = mae_loss(unscaled_outputs, unscaled_relevance)

            total_mse_val_loss += val_loss_mse.item() * len(relevance)
            total_mae_val_loss += val_loss_mae.item() * len(relevance)
            total_val_samples += len(relevance)

    val_rmse = sqrt(total_mse_val_loss / total_val_samples)
    val_mae = total_mae_val_loss / total_val_samples

    epoch_time = time.time() - epoch_start_time
    print(
        f"Epoch {epoch + 1} ({epoch_time:.1f}s), Train RMSE: {train_rmse}, Val RMSE: {val_rmse}, Train MAE: {train_mae}, Val MAE: {val_mae}")

    # save best model (lowest validation RMSE so far)
    # NOTE(review): the file name says "character" although this notebook trains the
    # word-level model; it is left unchanged because the loading cell reads this same path.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model_character.pth')
        print(f"New best model saved at epoch {epoch + 1} with Val RMSE: {best_val_rmse:.4f}")

training_time = time.time() - start_time
print(f"Training time: {training_time:.4f}s")

Using cuda device.
Epoch 1 (23.8s), Train RMSE: 0.6134094162715071, Val RMSE: 0.5464724379159154, Train MAE: 0.5039927574018285, Val MAE: 0.4448336077851774
New best model saved at epoch 1 with Val RMSE: 0.5465
Epoch 2 (22.4s), Train RMSE: 0.5283244204494473, Val RMSE: 0.5322457182070796, Train MAE: 0.43524895884076376, Val MAE: 0.4338613074527862
New best model saved at epoch 2 with Val RMSE: 0.5322
Epoch 3 (22.8s), Train RMSE: 0.5221979014541914, Val RMSE: 0.5248814521025891, Train MAE: 0.43005353640130267, Val MAE: 0.4271986302623579
New best model saved at epoch 3 with Val RMSE: 0.5249
Epoch 4 (23.0s), Train RMSE: 0.5171902524754555, Val RMSE: 0.5199784545359663, Train MAE: 0.4260117705713543, Val MAE: 0.4245337900586564
New best model saved at epoch 4 with Val RMSE: 0.5200
Epoch 5 (22.8s), Train RMSE: 0.5136077832143948, Val RMSE: 0.5116403284183498, Train MAE: 0.4231072901686586, Val MAE: 0.41883682798273864
New best model saved at epoch 5 with Val RMSE: 0.5116
Epoch 6 (22.9s), T

KeyboardInterrupt: 

In [262]:
# load best model
# map_location remaps the stored tensors onto the current `device`, so a checkpoint
# written on a CUDA machine can still be loaded when the kernel runs on CPU only.
model = WordSiameseLSTM(embedding_dim, hidden_dim).to(device)
best_state_dict = torch.load('best_model_character.pth', map_location=device)
model.load_state_dict(best_state_dict)

<All keys matched successfully>

In [260]:
# Evaluate the current model on the held-out test set, reporting RMSE and MAE on the
# original (un-scaled) relevance scale together with the wall-clock time of the pass.
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)
model.eval()
total_mse_test_loss, total_mae_test_loss, total_test_samples = 0, 0, 0
test_start_time = time.time()
with torch.no_grad():
    # testing loop
    for search_term, product_description, relevance in test_data_loader:
        search_term = search_term.to(device)
        product_description = product_description.to(device)
        relevance = relevance.to(device)
        outputs = model(search_term, product_description).squeeze(1)

        # undo the label scaling once and feed the same tensors to both metrics
        unscaled_outputs = inverse_min_max_scaling(outputs)
        unscaled_relevance = inverse_min_max_scaling(relevance)
        test_loss_mse = mse_loss(unscaled_outputs, unscaled_relevance)
        test_loss_mae = mae_loss(unscaled_outputs, unscaled_relevance)

        # batch means are re-weighted by batch size to obtain per-sample averages
        samples_in_batch = len(relevance)
        total_mse_test_loss += test_loss_mse.item() * samples_in_batch
        total_mae_test_loss += test_loss_mae.item() * samples_in_batch
        total_test_samples += samples_in_batch

test_time = time.time() - test_start_time
test_rmse = sqrt(total_mse_test_loss / total_test_samples)
test_mae = total_mae_test_loss / total_test_samples
print(f'Test time: {test_time:.1f}s, Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Test time: 29.5s, Test RMSE: 0.5213677305455144, Test MAE: 0.4189072535836368


In [288]:
# get train/test data for classical ML algorithms
# The loader covers the full training set (train + validation rows). It is iterated in
# dataset order (shuffle=False) so the extracted X_train / y_train rows come out in the
# same order on every run; shuffling brings no benefit when nothing is being optimised.
# NOTE(review): assumes get_classical_ml_train_test_data only runs the model over the
# loaders to extract features and labels — confirm in utils.ClassicalML.
all_train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)
X_train, y_train, X_test, y_test = get_classical_ml_train_test_data(model, all_train_data_loader, test_data_loader)

In [328]:
# Fit the train_rf baseline (presumably a random forest — see utils.ClassicalML) on the
# extracted features with 10 estimators; its metrics appear in the cell output.
train_rf(
    X_train, y_train,
    X_test, y_test,
    n_estimators=10,
)

Mean Absolute Error (MAE): 0.1471460350646687
Root Mean Squared Error (RMSE): 0.19857533087745619
Mean Absolute Error (MAE): 0.4354615000130297
Root Mean Squared Error (RMSE): 0.5408949218885898


In [327]:
# Fit the train_gbr baseline (presumably gradient boosting — see utils.ClassicalML) on the
# extracted features with 10 estimators; its metrics appear in the cell output.
train_gbr(
    X_train, y_train,
    X_test, y_test,
    n_estimators=10,
)

Train Mean Absolute Error (MAE): 0.3906506054622651
Train Root Mean Squared Error (RMSE): 0.47260190316357914
Test Mean Absolute Error (MAE): 0.4250206947224775
Test Root Mean Squared Error (RMSE): 0.5169295549319173


In [290]:
# Fit the linear-regression baseline on the extracted features; its metrics appear in
# the cell output.
train_linear_regression(
    X_train, y_train,
    X_test, y_test,
)

Train Mean Absolute Error (MAE): 0.3706657588481903
Train Root Mean Squared Error (RMSE): 0.45788851380348206
Test Mean Absolute Error (MAE): 0.425370991230011
Test Root Mean Squared Error (RMSE): 0.5269988775253296


In [325]:
# Fit the XGBoost baseline on the extracted features (10 estimators, learning rate 0.09,
# max depth 10); its metrics appear in the cell output.
train_xgboost(
    X_train, y_train,
    X_test, y_test,
    n_estimators=10,
    learning_rate=0.09,
    max_depth=10,
)

Train Mean Absolute Error (MAE): 0.35695117712020874
Train Root Mean Squared Error (RMSE): 0.4309508800506592
Test Mean Absolute Error (MAE): 0.4214370846748352
Test Root Mean Squared Error (RMSE): 0.5144940614700317
