In [None]:
import os
import re
import numpy as np
import pandas as pd
import pickle
from typing import Literal, Union
import matplotlib.pyplot as plt
from tqdm.auto import tqdm 

import copy
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary

import optuna
from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score

In [3]:
if torch.backends.mps.is_available():
    if torch.backends.mps.is_built():
        device = "mps"
    else:
        device = "cpu"
else:
    device = "cpu"

torch.manual_seed(42)
if device=="mps":
    torch.mps.manual_seed(42)

print("Device: ", device)

Device:  mps


# Load Data

In [4]:
with open("datasets/processed/embed_and_cat_multilingual.pkl", "rb") as f:
    embed_and_cat_multilingual = pickle.load(f)

In [5]:
embed_and_cat_multilingual

{'tr-TR': {'train': {'embedding': [array([ 0.007  , -0.01726, -0.05533, ...,  0.03342,  0.01654, -0.04214],
          dtype=float16),
    array([ 0.004833,  0.01351 , -0.04028 , ..., -0.00474 ,  0.02744 ,
           -0.04797 ], dtype=float16),
    array([-0.00944  ,  0.0495   , -0.02803  , ...,  0.0009093, -0.01041  ,
           -0.0003083], dtype=float16),
    array([-0.01119 , -0.005062, -0.01663 , ...,  0.00601 , -0.011986,
           -0.02916 ], dtype=float16),
    array([ 0.05557,  0.01677, -0.0212 , ...,  0.03275, -0.02023, -0.02089],
          dtype=float16),
    array([-0.02498 ,  0.01452 , -0.01561 , ...,  0.0325  , -0.03096 ,
           -0.006466], dtype=float16),
    array([ 0.0309 ,  0.01285, -0.03137, ...,  0.0074 , -0.00997, -0.03296],
          dtype=float16),
    array([ 0.0674  , -0.00947 , -0.02762 , ...,  0.02054 , -0.009254,
           -0.0394  ], dtype=float16),
    array([ 0.0246  ,  0.02222 , -0.04706 , ...,  0.01193 , -0.03586 ,
           -0.011284], dtype=floa

# utils

In [6]:
def fit_scaler(X_train, scaler):
    scaler.fit(X_train)
    return scaler

def scale_features(data, scaler):
    return scaler.transform(data)

def make_dataloader(X, y, batch_size: int = 1, shuffle: bool = True, seed: int = 0):
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y).type(torch.LongTensor)
    tensor_dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle, generator=torch.Generator().manual_seed(seed))
    
    return loader

In [7]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
  correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
  acc = (correct / len(y_pred)) * 100 
  return acc

def train_model(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               accuracy_fn,
               device: torch.device = device,
               verbose: bool = False):
  train_loss, train_acc = 0, 0
  model.to(device)
  model.train()
  for batch, (X_train, y_train) in enumerate(data_loader):
    # Send data to GPU
    X_train, y_train = X_train.to(device), y_train.to(device)
    # 1. Forward pass
    y_logits = model(X_train) # model outputs raw logits 
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
    # Accumulate the loss values per batch
    loss = loss_fn(y_logits, y_train)
    # Accumulate loss and accuracy values per batch
    train_loss += loss
    train_acc += accuracy_fn(y_true=y_train, y_pred=y_pred)
    
    # 3. Calculate gradients and update parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Calculate loss and accuracy per epoch and print out what's happening
  train_loss /= len(data_loader)
  train_acc /= len(data_loader)
  if verbose:
    print(f"Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%")

  return train_loss, train_acc


def test_model(data_loader: torch.utils.data.DataLoader,
              model: torch.nn.Module,
              loss_fn: torch.nn.Module,
              accuracy_fn,
              device: torch.device = device,
              verbose: bool = False):
  test_loss, test_acc = 0, 0
  model.to(device)
  model.eval()
  with torch.no_grad():
    for X_test, y_test in data_loader:
      # Send data to GPU
      X_test, y_test = X_test.to(device), y_test.to(device)
      # 1. Forward pass
      y_logits = model(X_test)
      y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
      # Accumulate the loss and accuracy values per batch
      test_loss += loss_fn(y_logits, y_test)
      test_acc += accuracy_fn(y_true=y_test, y_pred=y_pred)

    # Adjust metrics and print out
    test_loss /= len(data_loader)
    test_acc /= len(data_loader)
    if verbose:
      print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%\n")

  return test_loss, test_acc


def eval_model(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               accuracy_fn,
               device: torch.device = device):
  loss, acc = 0, 0
  model.eval()
  with torch.no_grad():
    for X, y in data_loader:
      # Send data to GPU
      X, y = X.to(device), y.to(device)
      # 1. Forward pass
      y_logits = model(X)
      y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels
      # Accumulate the loss and accuracy values per batch
      loss += loss_fn(y_logits, y)
      acc += accuracy_fn(y_true=y, y_pred=y_pred)

    # Scale loss and acc to find the average loss/acc per batch
    loss /= len(data_loader)
    acc /= len(data_loader)

  return {"model_name": model.__class__.__name__, # only works when model was created with a class
          "model_loss": loss.item(),
          "model_acc (%)": acc}

In [27]:
def get_performance_by_lang(model, scaler, loss_fn=nn.CrossEntropyLoss()):
    rows_train, rows_test, rows_val = [], [], []
    for lang, split_dict in embed_and_cat_multilingual.items(): 
        X_train_lang = split_dict["train"]["embedding"]
        y_train_lang = split_dict["train"]["category"]
        X_test_lang = split_dict["test"]["embedding"]
        y_test_lang = split_dict["test"]["category"]
        X_val_lang = split_dict["validation"]["embedding"]
        y_val_lang = split_dict["validation"]["category"]
        X_train_lang, y_train_lang, X_test_lang, y_test_lang, X_val_lang, y_val_lang = np.array(X_train_lang), np.array(y_train_lang), np.array(X_test_lang), np.array(y_test_lang), np.array(X_val_lang), np.array(y_val_lang)

        X_train_lang_scaled = scale_features(X_train_lang, scaler)
        X_test_lang_scaled = scale_features(X_test_lang, scaler)
        X_val_lang_scaled = scale_features(X_val_lang, scaler)

        train_loader_lang = make_dataloader(X=X_train_lang_scaled, y=y_train_lang, batch_size=64, shuffle=True, seed=42)
        test_loader_lang = make_dataloader(X=X_test_lang_scaled, y=y_test_lang, batch_size=X_test_lang_scaled.shape[0], shuffle=False, seed=42)
        val_loader_lang = make_dataloader(X=X_val_lang_scaled, y=y_val_lang, batch_size=X_val_lang_scaled.shape[0], shuffle=False, seed=42)

        perf_eval_train = eval_model(data_loader=train_loader_lang,
                                    model=model,
                                    loss_fn=loss_fn,
                                    accuracy_fn=accuracy_fn,
                                    device=device)
        perf_eval_train["lang"] = lang
        rows_train.append(perf_eval_train)
        perf_eval_test = eval_model(data_loader=test_loader_lang,
                                    model=model,
                                    loss_fn=loss_fn,
                                    accuracy_fn=accuracy_fn,
                                    device=device)
        perf_eval_test["lang"] = lang
        rows_test.append(perf_eval_test)
        perf_eval_val = eval_model(data_loader=val_loader_lang,
                                    model=model,
                                    loss_fn=loss_fn,
                                    accuracy_fn=accuracy_fn,
                                    device=device)
        perf_eval_val["lang"] = lang
        rows_val.append(perf_eval_val)

        train_perf_df = pd.DataFrame(rows_train)[["lang","model_acc (%)"]].rename(columns={"model_acc (%)": "train_accuracy"})
        test_perf_df = pd.DataFrame(rows_test)[["model_acc (%)"]].rename(columns={"model_acc (%)": "test_accuracy"})
        val_perf_df = pd.DataFrame(rows_val)[["model_acc (%)"]].rename(columns={"model_acc (%)": "val_accuracy"})

    return pd.concat([train_perf_df, test_perf_df, val_perf_df], axis=1).round(3)

# Construct train/test/validation sets

In [9]:
X_train, y_train, X_test, y_test, X_val, y_val  = [], [], [], [], [], []
for lang, split_dict in embed_and_cat_multilingual.items(): 
    X_train += split_dict["train"]["embedding"]
    y_train += split_dict["train"]["category"]
    X_test += split_dict["test"]["embedding"]
    y_test += split_dict["test"]["category"]
    X_val += split_dict["validation"]["embedding"]
    y_val += split_dict["validation"]["category"]
X_train, y_train, X_test, y_test, X_val, y_val = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), np.array(X_val), np.array(y_val)

In [10]:
scaler = StandardScaler()
scaler = fit_scaler(X_train=X_train, scaler=scaler)

X_train_scaled = scale_features(X_train, scaler)
X_test_scaled = scale_features(X_test, scaler)
X_val_scaled = scale_features(X_val, scaler)

# y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
# y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

train_loader = make_dataloader(X=X_train_scaled, y=y_train, batch_size=64, shuffle=True, seed=42)
test_loader = make_dataloader(X=X_test_scaled, y=y_test, batch_size=16, shuffle=False, seed=42)
val_loader = make_dataloader(X=X_val_scaled, y=y_val, batch_size=1, shuffle=False, seed=42)

# Build Neural Network Constructor

In [11]:
class NNClassifier(nn.Module):
    def __init__(self,
                 input_dim_size: int,
                 output_dim_size: int,
                 layer_dims: list = [50,100,50,15],
                 layer_acts: Union[list, str] = "ReLU",
                 weight_init: init = init.kaiming_uniform_):
        super(NNClassifier, self).__init__()
        self.layers = []
        if type(layer_acts)==str:
            layer_acts = [getattr(torch.nn.modules.activation, layer_acts)()]*len(layer_dims)

        for layer_no, layer_dim in enumerate(layer_dims):
            if layer_no==0:
                self.layers.append(nn.Linear(input_dim_size, layer_dim))
            else:
                self.layers.append(nn.Linear(layer_dims[layer_no-1], layer_dim))
            self.layers.append(layer_acts[layer_no])
        self.layers.append(nn.Linear(layer_dims[layer_no], output_dim_size))
        for layer in self.layers:
            if not isinstance(layer, tuple({getattr(torch.nn.modules.activation, act) for act in torch.nn.modules.activation.__all__})):
                weight_init(layer.weight)

        self.linear_layer_stack = nn.Sequential(*self.layers)

    def forward(self, x):
        return self.linear_layer_stack(x)

# Example Regressor
model = NNClassifier(input_dim_size=X_train.shape[1],
                  output_dim_size=7,
                  layer_dims=[50,100,50,15],
                  layer_acts=[nn.ReLU(),nn.ReLU(),nn.ReLU(),nn.ReLU()])

summary(model, input_size=(1,1,1024))

Layer (type:depth-idx)                   Output Shape              Param #
NNClassifier                             [1, 1, 7]                 --
├─Sequential: 1-1                        [1, 1, 7]                 --
│    └─Linear: 2-1                       [1, 1, 50]                51,250
│    └─ReLU: 2-2                         [1, 1, 50]                --
│    └─Linear: 2-3                       [1, 1, 100]               5,100
│    └─ReLU: 2-4                         [1, 1, 100]               --
│    └─Linear: 2-5                       [1, 1, 50]                5,050
│    └─ReLU: 2-6                         [1, 1, 50]                --
│    └─Linear: 2-7                       [1, 1, 15]                765
│    └─ReLU: 2-8                         [1, 1, 15]                --
│    └─Linear: 2-9                       [1, 1, 7]                 112
Total params: 62,277
Trainable params: 62,277
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.06
Input size (MB): 0.00
Forward/bac

In [12]:
def get_best_model(best_params: dict):
    num_epochs = best_params["num_epochs"]
    weight_init_name = best_params["weight_init_name"]
    weight_init = getattr(init, weight_init_name)
    
    layer_dims = best_params["layer_dims"]
    act_name = best_params["act_name"]
    layer_acts = [getattr(nn, act_name)() for _ in range(len(layer_dims))]

    model = NNClassifier(input_dim_size=X_train.shape[1],
                         output_dim_size=len(np.unique(y_train)),
                         layer_dims=layer_dims,
                         layer_acts=layer_acts,
                         weight_init=weight_init)
    
    learning_rate = best_params["learning_rate"]
    optimizer_name = best_params["optimizer_name"]
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        _ = train_model(data_loader=train_loader,
                        model=model,
                        loss_fn=loss_fn,
                        optimizer=optimizer,
                        accuracy_fn=accuracy_fn,
                        device=device,
                        verbose=False)
        
        _ = test_model(data_loader=test_loader,
                        model=model,
                        loss_fn=loss_fn,
                        accuracy_fn=accuracy_fn,
                        device=device,
                        verbose=False)
        
    return model

# Base Model Prediction

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [29]:
epochs = 40
for epoch in range(epochs):
  print(f"Epoch: {epoch}\n---------")
  _ = train_model(data_loader=train_loader,
                model=model,
                loss_fn=loss_fn,
                optimizer=optimizer,
                accuracy_fn=accuracy_fn,
                device=device,
                verbose=True)
  
  _ = test_model(data_loader=test_loader,
                model=model,
                loss_fn=loss_fn,
                accuracy_fn=accuracy_fn,
                device=device,
                verbose=True)

Epoch: 0
---------
Train loss: 2.42301 | Train accuracy: 17.80%
Test loss: 2.24464 | Test accuracy: 14.70%

Epoch: 1
---------
Train loss: 2.01977 | Train accuracy: 21.28%
Test loss: 2.02858 | Test accuracy: 17.71%

Epoch: 2
---------
Train loss: 1.87665 | Train accuracy: 25.17%
Test loss: 1.92728 | Test accuracy: 18.98%

Epoch: 3
---------
Train loss: 1.78659 | Train accuracy: 28.77%
Test loss: 1.85801 | Test accuracy: 22.11%

Epoch: 4
---------
Train loss: 1.71421 | Train accuracy: 32.74%
Test loss: 1.80384 | Test accuracy: 26.27%

Epoch: 5
---------
Train loss: 1.64761 | Train accuracy: 36.52%
Test loss: 1.75782 | Test accuracy: 30.44%

Epoch: 6
---------
Train loss: 1.58378 | Train accuracy: 40.04%
Test loss: 1.71520 | Test accuracy: 32.41%

Epoch: 7
---------
Train loss: 1.52970 | Train accuracy: 42.82%
Test loss: 1.67369 | Test accuracy: 35.53%

Epoch: 8
---------
Train loss: 1.46949 | Train accuracy: 45.69%
Test loss: 1.63646 | Test accuracy: 37.50%

Epoch: 9
---------
Train los

In [30]:
perf_evals = []
perf_eval_train = eval_model(data_loader=train_loader,
                        model=model,
                        loss_fn=loss_fn,
                        accuracy_fn=accuracy_fn,
                        device=device)
perf_eval_train["split"] = "train"
perf_evals.append(perf_eval_train)
perf_eval_test = eval_model(data_loader=test_loader,
                        model=model,
                        loss_fn=loss_fn,
                        accuracy_fn=accuracy_fn,
                        device=device)
perf_eval_test["split"] = "test"
perf_evals.append(perf_eval_test)
perf_eval_val = eval_model(data_loader=val_loader,
                        model=model,
                        loss_fn=loss_fn,
                        accuracy_fn=accuracy_fn,
                        device=device)
perf_eval_val["split"] = "validation"
perf_evals.append(perf_eval_val)

pd.DataFrame(perf_evals)[["split","model_loss","model_acc (%)"]]

Unnamed: 0,split,model_loss,model_acc (%)
0,train,0.481531,85.366826
1,test,0.946321,68.402778
2,validation,0.994976,67.055394


In [31]:
base_model_perf_by_lang = get_performance_by_lang(model, scaler=scaler, loss_fn=nn.CrossEntropyLoss())
base_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,84.714,68.293,74.627
1,en-US,88.113,70.621,60.563
2,es-ES,84.217,65.537,69.014
3,fr-FR,84.678,72.561,63.077
4,de-DE,84.245,68.0,68.116


# Hyperparameter Tuning

## Optuna

In [39]:
def objective(trial):
    num_epochs = trial.suggest_int("num_epochs", 15, 250)
    weight_init_name = trial.suggest_categorical("weight_init_name", ["kaiming_uniform_", "kaiming_normal_", "xavier_uniform_", "xavier_normal_"])
    weight_init = getattr(init, weight_init_name)
    
    layer_dims = trial.suggest_categorical("layer_dims", [[50,100,50,15],
                                                          [15,30,45,90,60,30,10]])
    
    act_name = trial.suggest_categorical("act_name", ["ReLU", "LeakyReLU"])
    layer_acts = [getattr(nn, act_name)() for _ in range(len(layer_dims))]

    model = NNClassifier(input_dim_size=X_train.shape[1],
                         output_dim_size=len(np.unique(y_train)),
                         layer_dims=layer_dims,
                         layer_acts=layer_acts,
                         weight_init=weight_init)
    
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical("optimizer_name", ["Adam","RMSprop","SGD"])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        _ = train_model(data_loader=train_loader,
                        model=model,
                        loss_fn=loss_fn,
                        optimizer=optimizer,
                        accuracy_fn=accuracy_fn,
                        device=device,
                        verbose=False)
        
        _, test_acc = test_model(data_loader=test_loader,
                                model=model,
                                loss_fn=loss_fn,
                                accuracy_fn=accuracy_fn,
                                device=device,
                                verbose=False)
    #     if test_accs == []:
    #         best_model = copy.deepcopy(model)
    #     elif test_acc <= min(test_accs):
    #         best_model = copy.deepcopy(model)
    #     test_accs.append(test_acc)

    # _, test_acc = test_step(data_loader=test_loader,
    #                         model=best_model,
    #                         loss_fn=loss_fn,
    #                         accuracy_fn=accuracy_fn,
    #                         device=device,
    #                         verbose=False)

    return test_acc

In [40]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2025-03-09 00:01:16,194] A new study created in memory with name: no-name-22b01d78-fa74-458a-b7c2-526b41c14f84
[I 2025-03-09 00:01:42,356] Trial 0 finished with value: 83.50051440329217 and parameters: {'num_epochs': 98, 'weight_init_name': 'kaiming_uniform_', 'layer_dims': [50, 100, 50, 15], 'act_name': 'ReLU', 'learning_rate': 0.00017990372376304765, 'optimizer_name': 'RMSprop'}. Best is trial 0 with value: 83.50051440329217.
[I 2025-03-09 00:01:48,456] Trial 1 finished with value: 79.24382716049384 and parameters: {'num_epochs': 32, 'weight_init_name': 'xavier_uniform_', 'layer_dims': [50, 100, 50, 15], 'act_name': 'ReLU', 'learning_rate': 0.015240500111833059, 'optimizer_name': 'SGD'}. Best is trial 0 with value: 83.50051440329217.
[I 2025-03-09 00:02:23,939] Trial 2 finished with value: 81.5843621399177 and parameters: {'num_epochs': 190, 'weight_init_name': 'xavier_normal_', 'layer_dims': [50, 100, 50, 15], 'act_name': 'ReLU', 'learning_rate': 0.0007121797828032858, 'optimizer

In [41]:
trial = study.best_trial
best_params_optuna = trial.params

print(f"Best trial no {trial.number}:")
print("  Objective Value:", trial.value)
print("  Parameters:")
for key, value in best_params_optuna.items():
    print("    {}: {}".format(key,value))

Best trial no 3:
  Objective Value: 83.75771604938272
  Parameters:
    num_epochs: 48
    weight_init_name: kaiming_uniform_
    layer_dims: [50, 100, 50, 15]
    act_name: LeakyReLU
    learning_rate: 0.0028759900418347394
    optimizer_name: Adam


In [42]:
best_model = get_best_model(best_params_optuna)

In [43]:
optuna_model_perf_by_lang = get_performance_by_lang(best_model, scaler=scaler, loss_fn=nn.CrossEntropyLoss())
optuna_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,100.0,84.146,92.537
1,en-US,100.0,81.356,90.141
2,es-ES,100.0,83.616,90.141
3,fr-FR,100.0,82.317,87.692
4,de-DE,100.0,81.714,86.957


## Skorch

In [None]:
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train).type(torch.LongTensor)

skorch_model = NeuralNetClassifier(module=NNClassifier,
                                   criterion=nn.CrossEntropyLoss,
                                   device=device)

param_grid = {
    'max_epochs': np.arange(15,45),
    'optimizer': [optim.Adam,optim.SGD, optim.RMSprop],
    'optimizer__lr': np.arange(1e-4, 1e-1, 0.001),
    'module__weight_init': [init.kaiming_normal_, init.kaiming_uniform_, init.xavier_normal_, init.xavier_uniform_],
    'module__layer_dims': [[50,100,50,15],
                           [15,30,45,90,60,30,10]],
    'module__layer_acts': ["ReLU", "LeakyReLU"],
    'module__input_dim_size': [X_train.shape[1]],
    'module__output_dim_size': [7] 
}

grid_search = RandomizedSearchCV(skorch_model, 
                                param_grid, 
                                scoring="accuracy",
                                cv=3, n_iter=5, 
                                n_jobs=-1, verbose=False)
grid_search = grid_search.fit(X_train_tensor, y_train_tensor)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.9415[0m       [32m0.2229[0m        [35m1.9122[0m  1.2434
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1   [36m782053.3813[0m       [32m0.1854[0m      [35m842.4233[0m  1.2888
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.9716[0m       [32m0.2667[0m        [35m1.9170[0m  1.3080
      2        [36m1.8963[0m       [32m0.3104[0m        [35m1.8677[0m  0.0979
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.9184[0m       [32m0.2521[0m        [35m1.8737[0m  1.3930
      2        [36m1.8654[0m       [32m0.3187[0m        [35m1.7893[0m  0.1012
      2      [36m116.6872[0m       0.1

3 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/toygunkarabas/Development/NLP & SLP/nlp_slp_env/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/toygunkarabas/Development/NLP & SLP/nlp_slp_env/lib/python3.11/site-packages/skorch/classifier.py", line 165, in fit
    return super(NeuralNetClassifier, self).fit(X, y, **fit_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/toygunkarabas/Development/NLP & SLP/nlp_slp_env/lib/python3.11/site-packages/skorch/net.py", l

      2        [36m0.6138[0m       [32m0.8206[0m        [35m0.6225[0m  0.1309
      3        [36m0.2967[0m       [32m0.8540[0m        [35m0.5455[0m  0.1295
      4        [36m0.1713[0m       [32m0.9249[0m        [35m0.3797[0m  0.1313
      5        [36m0.1417[0m       [32m0.9374[0m        [35m0.3409[0m  0.1304
      6        [36m0.1339[0m       0.9318        0.4031  0.1299
      7        [36m0.1190[0m       0.9291        0.3515  0.1286
      8        [36m0.0820[0m       [32m0.9513[0m        [35m0.2871[0m  0.1293
      9        [36m0.0253[0m       [32m0.9555[0m        0.3257  0.1317
     10        [36m0.0184[0m       [32m0.9597[0m        [35m0.2584[0m  0.1270
     11        0.0210       0.9485        0.2812  0.1284
     12        0.0248       0.9360        0.3541  0.1301
     13        0.0439       0.9444        0.3216  0.1294
     14        0.0527       0.9416        0.4070  0.1284
     15        0.0598       0.9263        0.4632  0.1293
    

In [22]:
best_params_skorch = grid_search.best_params_

print("Objective Value:", grid_search.best_score_)
print("Parameters:")
for key, value in best_params_skorch.items():
    print("  {}: {}".format(key,value))

Objective Value: 0.9312905904243148
Parameters:
  optimizer__lr: 0.0091
  optimizer: <class 'torch.optim.adam.Adam'>
  module__weight_init: <function kaiming_normal_ at 0x11de947c0>
  module__output_dim_size: 7
  module__layer_dims: [15, 30, 45, 90, 60, 30, 10]
  module__layer_acts: LeakyReLU
  module__input_dim_size: 1024
  max_epochs: 20


In [23]:
best_model_skorch = grid_search.best_estimator_
print("Best model is on this device: ", best_model_skorch.device)
print("********")

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32, device=device)

y_train_preds = best_model_skorch.predict(X_train_tensor)
y_test_preds = best_model_skorch.predict(X_test_tensor)
y_val_preds = best_model_skorch.predict(X_val_tensor)

train_score = accuracy_score(y_train, y_train_preds)
test_score = accuracy_score(y_test, y_test_preds)
val_score = accuracy_score(y_val, y_val_preds)

print("Best model's train accuracy score: ", train_score)
print("Best model's test accuracy score: ", test_score)
print("Best model's validation accuracy score: ", val_score)

Best model is on this device:  mps
********
Best model's train accuracy score:  0.9877607788595271
Best model's test accuracy score:  0.823803967327888
Best model's validation accuracy score:  0.8279883381924198


In [47]:
rows_train, rows_test, rows_val = [], [], []
for lang, split_dict in embed_and_cat_multilingual.items(): 
    X_train_lang = split_dict["train"]["embedding"]
    y_train_lang = split_dict["train"]["category"]
    X_test_lang = split_dict["test"]["embedding"]
    y_test_lang = split_dict["test"]["category"]
    X_val_lang = split_dict["validation"]["embedding"]
    y_val_lang = split_dict["validation"]["category"]
    X_train_lang, y_train_lang, X_test_lang, y_test_lang, X_val_lang, y_val_lang = np.array(X_train_lang), np.array(y_train_lang), np.array(X_test_lang), np.array(y_test_lang), np.array(X_val_lang), np.array(y_val_lang)

    X_train_lang_scaled = scale_features(X_train_lang, scaler)
    X_test_lang_scaled = scale_features(X_test_lang, scaler)
    X_val_lang_scaled = scale_features(X_val_lang, scaler)

    X_train_lang_tensor = torch.tensor(X_train_lang_scaled, dtype=torch.float32, device=device)
    X_test_lang_tensor = torch.tensor(X_test_lang_scaled, dtype=torch.float32, device=device)
    X_val_lang_tensor = torch.tensor(X_val_lang_scaled, dtype=torch.float32, device=device)

    y_train_preds = best_model_skorch.predict(X_train_lang_tensor)
    y_test_preds = best_model_skorch.predict(X_test_lang_tensor)
    y_val_preds = best_model_skorch.predict(X_val_lang_tensor)

    train_score = accuracy_score(y_train_lang, y_train_preds)*100
    test_score = accuracy_score(y_test_lang, y_test_preds)*100
    val_score = accuracy_score(y_val_lang, y_val_preds)*100

    rows_train.append([lang, train_score])
    rows_test.append([lang, test_score])
    rows_val.append([lang, val_score])

    train_perf_df = pd.DataFrame(rows_train, columns=["lang","train_accuracy"])
    test_perf_df = pd.DataFrame(rows_test, columns=["lang","test_accuracy"])[["test_accuracy"]]
    val_perf_df = pd.DataFrame(rows_val, columns=["lang","val_accuracy"])[["val_accuracy"]]

skorch_model_perf_by_lang = pd.concat([train_perf_df, test_perf_df, val_perf_df], axis=1).round(3)
skorch_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,94.509,84.756,82.09
1,en-US,99.864,84.181,83.099
2,es-ES,99.704,81.921,83.099
3,fr-FR,99.867,81.098,83.077
4,de-DE,99.728,80.0,82.609


## Ray Tune

In [None]:
def generate_loaders_from_raw_data():
    with open("datasets/processed/embed_and_cat_multilingual.pkl", "rb") as f:
        embed_and_cat_multilingual = pickle.load(f)

    X_train, y_train, X_test, y_test, X_val, y_val  = [], [], [], [], [], []
    for lang, split_dict in embed_and_cat_multilingual.items(): 
        X_train += split_dict["train"]["embedding"]
        y_train += split_dict["train"]["category"]
        X_test += split_dict["test"]["embedding"]
        y_test += split_dict["test"]["category"]
        X_val += split_dict["validation"]["embedding"]
        y_val += split_dict["validation"]["category"]
    X_train, y_train, X_test, y_test, X_val, y_val = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), np.array(X_val), np.array(y_val)

    scaler = StandardScaler()
    scaler = fit_scaler(X_train=X_train, scaler=scaler)

    X_train_scaled = scale_features(X_train, scaler)
    X_test_scaled = scale_features(X_test, scaler)
    X_val_scaled = scale_features(X_val, scaler)

    train_loader = make_dataloader(X=X_train_scaled, y=y_train, batch_size=64, shuffle=True, seed=42)
    test_loader = make_dataloader(X=X_test_scaled, y=y_test, batch_size=16, shuffle=False, seed=42)
    val_loader = make_dataloader(X=X_val_scaled, y=y_val, batch_size=1, shuffle=False, seed=42)

    return train_loader, test_loader, val_loader
    

def train_classifier(config):
    num_epochs = config["num_epochs"]
    weight_init_name = config["weight_init_name"]
    weight_init = getattr(init, weight_init_name)

    layer_dims = config["layer_dims"]
    act_name = config["act_name"]
    layer_acts = [getattr(nn, act_name)() for _ in range(len(layer_dims))]

    model = NNClassifier(input_dim_size=1024,
                         output_dim_size=7,
                         layer_dims=layer_dims,
                         layer_acts=layer_acts,
                         weight_init=weight_init)
    learning_rate = config["learning_rate"]
    optimizer_name = config["optimizer_name"]
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    train_loader, test_loader, _ = generate_loaders_from_raw_data()

    for epoch in range(num_epochs):
        _ = train_model(data_loader=train_loader,
                        model=model,
                        loss_fn=criterion,
                        optimizer=optimizer,
                        accuracy_fn=accuracy_fn,
                        device=device,
                        verbose=False)
        
        test_loss, test_acc = test_model(data_loader=test_loader,
                                        model=model,
                                        loss_fn=criterion,
                                        accuracy_fn=accuracy_fn,
                                        device=device,
                                        verbose=False)
    tune.report({"loss": test_loss, "accuracy": test_acc})
        
    print("Finished Training")

In [10]:
config = {
    'num_epochs': tune.randint(5, 10),
    'layer_dims': tune.choice([[15,30,15],
                               [15,90,10]]),
    'act_name': tune.choice(["ReLU", "LeakyReLU", "RReLU"]),
    'weight_init_name': tune.choice(["kaiming_normal_", "kaiming_uniform_", "xavier_normal_", "xavier_uniform_"]),
    'learning_rate': tune.loguniform(1e-4, 1e-1),
    'optimizer_name': tune.choice(["Adam", "SGD", "RMSprop"])
}

ray.shutdown()  # Clean any previous Ray instances
# Start Ray with specific CPU/GPU allocation
ray.init(num_cpus=12, num_gpus=0)

scheduler = ASHAScheduler(grace_period=5, # Run at least 5 epochs before stopping trials
                          reduction_factor=2 # # Reduce number of trials by 0x per iteration
                        )
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_classifier),
        resources={"cpu": 5} #, "gpu": 0, "num_workers": 2}
    ),
    tune_config=tune.TuneConfig(
        metric="accuracy",
        mode="max",
        scheduler=scheduler,
        num_samples=3, # equivalent to parameter 'n_trials' of optuna
        max_concurrent_trials=2 # Run 2 trials in parallel
    ),
    param_space=config,
)
ray_tuner_results = tuner.fit()

0,1
Current time:,2025-03-08 17:12:17
Running for:,00:02:09.88
Memory:,45.4/64.0 GiB

Trial name,status,loc,act_name,layer_dims,learning_rate,num_epochs,optimizer_name,weight_init_name
train_classifier_0a580_00000,PENDING,,RReLU,"[15, 30, 15]",0.00293318,5,SGD,kaiming_uniform_
train_classifier_0a580_00001,PENDING,,ReLU,"[15, 90, 10]",0.000342652,7,RMSprop,kaiming_uniform_


[33m(raylet)[0m bash: /Users/toygunkarabas/Development/NLP: No such file or directory
[33m(raylet)[0m bash: line 0: exec: /Users/toygunkarabas/Development/NLP: cannot execute: No such file or directory
[33m(raylet)[0m bash: SLP/nlp_slp_env/bin/python: No such file or directory
[33m(raylet)[0m [2025-03-08 17:11:08,039 E 40153 28468980] (raylet) worker_pool.cc:581: Some workers of the worker process(40169) have not registered within the timeout. The process is dead, probably it crashed during start.
[33m(raylet)[0m bash: /Users/toygunkarabas/Development/NLP: No such file or directory
[33m(raylet)[0m bash: line 0: exec: /Users/toygunkarabas/Development/NLP: cannot execute: No such file or directory
[33m(raylet)[0m bash: SLP/nlp_slp_env/bin/python: No such file or directory
[33m(raylet)[0m bash: /Users/toygunkarabas/Development/NLP: No such file or directory
[33m(raylet)[0m bash: line 0: exec: /Users/toygunkarabas/Development/NLP: cannot execute: No such file or directory

In above implementation, something went wrong. I will check it later.

## Allegro

In [None]:
# This will be implemented later!

# Observations

Hyperparameter tuning improves base models performance in all three sets. However, there is an overfitting issue as previous models (i.e. sklearn classifiers). To overcome this one can reduce number of epochs or reduce the number of parameters of the model. Moreover, one can add droput layers to improve generalization.

In [44]:
base_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,84.714,68.293,74.627
1,en-US,88.113,70.621,60.563
2,es-ES,84.217,65.537,69.014
3,fr-FR,84.678,72.561,63.077
4,de-DE,84.245,68.0,68.116


In [45]:
optuna_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,100.0,84.146,92.537
1,en-US,100.0,81.356,90.141
2,es-ES,100.0,83.616,90.141
3,fr-FR,100.0,82.317,87.692
4,de-DE,100.0,81.714,86.957


In [48]:
skorch_model_perf_by_lang

Unnamed: 0,lang,train_accuracy,test_accuracy,val_accuracy
0,tr-TR,94.509,84.756,82.09
1,en-US,99.864,84.181,83.099
2,es-ES,99.704,81.921,83.099
3,fr-FR,99.867,81.098,83.077
4,de-DE,99.728,80.0,82.609


# Resources

### Optuna
- https://www.geeksforgeeks.org/hyperparameter-tuning-with-optuna-in-pytorch/

### Skorch
- https://machinelearningmastery.com/how-to-grid-search-hyperparameters-for-pytorch-models/

- https://memudualimatou.medium.com/skorch-hyper-parameter-tuning-with-pytorch-b5af0ba8d45c

- https://debuggercafe.com/hyperparameter-search-with-pytorch-and-skorch/

### Ray Tune
- https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

- https://debuggercafe.com/hyperparameter-tuning-with-pytorch-and-ray-tune/

- https://www.geeksforgeeks.org/hyperparameter-tuning-with-ray-tune-in-pytorch/

- https://docs.ray.io/en/latest/tune/index.html

### Allegro
- https://medium.com/pytorch/accelerate-your-hyperparameter-optimization-with-pytorchs-ecosystem-tools-bc17001b9a49

- https://github.com/clearml/clearml/tree/master/examples/frameworks/pytorch/notebooks/image