In [1]:
import os
import copy
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

random_state = 42
preproc = True

In [2]:
def fix_random(seed):
    torch.manual_seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

fix_random(random_state)

## Device

In [3]:
# PyTorch Device
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print("Device: {}".format(device))

Device: cuda


## Data Loading


In [4]:
df = pd.read_csv("train.csv")
num_rows, num_cols = df.shape
print("Rows: ", num_rows)
print("Columns: ", num_cols)

Rows:  252175
Columns:  91


In [5]:
print("Null rows:", df.shape[0] - df.dropna().shape[0])
print("Duplicated rows:", df.duplicated().sum())
df.drop_duplicates(inplace=True)

Null rows: 0
Duplicated rows: 52


### Train Test Split

In [6]:
# Separate indices in train/val/set
# "stratify=y" makes sure to keep the classes proportions on the dataset (useful on imbalanced classes)
train, test = train_test_split(df, stratify=df["Year"], test_size=0.3, random_state=random_state)
val, test = train_test_split(test, stratify=test["Year"], test_size=(1 / 3), random_state=random_state)

X_train = train.drop(columns=["Year"])
y_train = train["Year"]

X_val = val.drop(columns=["Year"])
y_val = val["Year"]

X_test = test.drop(columns=["Year"])
y_test = test["Year"]

### Preprocessing 

In [7]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    steps=[
        ("std", preprocessing.StandardScaler()),
        ("l2", preprocessing.Normalizer(norm="l2")),
    ]
)

if preproc==True:
    # Fit the pipeline to the data
    pipeline.fit(X_train, y_train)

    # Transform the data using the pipeline
    X_train = pipeline.transform(X_train)
    X_test = pipeline.transform(X_test)
    X_val = pipeline.transform(X_val)

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_val = pd.DataFrame(X_val)

    train = pd.DataFrame(X_train)
    test = pd.DataFrame(X_test)
    val = pd.DataFrame(X_val)
    
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True) 
    y_val.reset_index(drop=True, inplace=True)

    train["Year"] = y_train
    test["Year"] = y_test
    val["Year"] = y_val

## Config


In [8]:
target = ["Year"]
continous_cols = list(train.columns)[:-1]

In [9]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabTransformerConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [10]:
data_config = DataConfig(target=target, continuous_cols=continous_cols, num_workers=0)

optimizer_config = OptimizerConfig(
    optimizer="AdamW",
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"patience": 9, "threshold": 1, "threshold_mode": "abs"},
)

head_config = LinearHeadConfig(
    layers="",  # No additional layer in head, just a mapping layer to output_dim
    # dropout=0.2,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

if preproc == False:
    experiment_config = ExperimentConfig(
        project_name="TabTransformer",
        run_name="TabTransformer-raw",
        log_target="tensorboard",
    )
else:
    experiment_config = ExperimentConfig(
        project_name="TabTransformer",
        run_name="TabTransformer-preproc",
        log_target="tensorboard",
    )

## Train

### TabTransformer

In [11]:
virtual_batch_sizes = [64, 128]
batch_sizes = [256, 512]
n_epochs = [100]
learning_rates = [0.01]
num_heads = [8]  # default is 8
num_attn_blocks = [6]  # default is 6
transformer_activation = ['ReGLU','GEGLU', "SwiGLU"]  #'ReLU', 'LeakyReLU'

params = list(
    product(
        learning_rates, batch_sizes, n_epochs, virtual_batch_sizes, num_heads, num_attn_blocks, transformer_activation
    )
)

comb = (
    len(learning_rates)
    * len(batch_sizes)
    * len(n_epochs)
    * len(virtual_batch_sizes)
    * len(num_heads)
    * len(num_attn_blocks)
    * len(transformer_activation)
)


print("Number of combinations: ", comb)

Number of combinations:  12


In [12]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


best_mse_tt = float("inf")
best_model_tt = None
best_params_tt = None
iter = 0
results_tt = pd.DataFrame(
    columns=[
        "loss",
        "r2",
        "learning_rate",
        "epochs",
        "batch_size",
        "virtual_batch_size",
        "num_heads",
        "num_attn_blocks",
        "transformer_activation",
    ]
)

for learning_rate, batch_size, epochs, virtual_batch_size, num_heads, num_attn_blocks, transformer_activation in params:
    iter += 1
    print(f"\nIteration: {iter} of {comb}")
    trainer_config = TrainerConfig(batch_size=batch_size, max_epochs=epochs, early_stopping_patience=10, load_best=True)

    model_config = TabTransformerConfig(
        task="regression",
        head="LinearHead",  # Linear Head
        head_config=head_config,  # Linear Head Config
        loss="MSELoss",
        seed=random_state,
        learning_rate=learning_rate,
        virtual_batch_size=virtual_batch_size,
        num_heads=num_heads,
        num_attn_blocks=num_attn_blocks,
        ff_hidden_multiplier=64,
        transformer_activation=transformer_activation,
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        experiment_config=experiment_config,
    )

    tabular_model.fit(train=train, validation=val)
    tabular_model.evaluate(test)

    y_pred = tabular_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    config = [
        mse,
        r2,
        learning_rate,
        batch_size,
        epochs,
        virtual_batch_size,
        num_heads,
        num_attn_blocks,
        transformer_activation,
    ]

    print("MSE: ", mse)
    print("MAE: ", mae)
    print("R2: ", r2)

    if mse < best_mse_tt:
        best_mse_tt = mse
        best_model_tt = copy.deepcopy(tabular_model)
        best_params_tt = (
            learning_rate,
            batch_size,
            epochs,
            virtual_batch_size,
            num_heads,
            num_attn_blocks,
            transformer_activation,
        )
        print("Best model updated")

    results_tt.loc[len(results_tt)] = config

Seed set to 42



Iteration: 1 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs





c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.06022291307843
MAE:  6.3308908199794
R2:  0.29083390607470294
Best model updated

Iteration: 2 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Seed set to 42


MSE:  78.06022291307843
MAE:  6.3308908199794
R2:  0.29083390607470294

Iteration: 3 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.06022291307843
MAE:  6.3308908199794
R2:  0.29083390607470294

Iteration: 4 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.047297711548
MAE:  6.331870650534013
R2:  0.2909513297040557
Best model updated

Iteration: 5 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.047297711548
MAE:  6.331870650534013
R2:  0.2909513297040557

Iteration: 6 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.047297711548
MAE:  6.331870650534013
R2:  0.2909513297040557

Iteration: 7 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.06285538662539
MAE:  6.341368585549556
R2:  0.29080999042454236

Iteration: 8 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.06285538662539
MAE:  6.341368585549556
R2:  0.29080999042454236

Iteration: 9 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.06285538662539
MAE:  6.341368585549556
R2:  0.29080999042454236

Iteration: 10 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.04362090993776
MAE:  6.329411388601924
R2:  0.29098473292709004
Best model updated

Iteration: 11 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Seed set to 42


MSE:  78.04362090993776
MAE:  6.329411388601924
R2:  0.29098473292709004

Iteration: 12 of 12


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\evang.HOMEEVANGELISTI\git\Data-Analytics-Project\Train_Module\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

c:\Users\evang.HOMEEVANGELISTI\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


MSE:  78.04362090993776
MAE:  6.329411388601924
R2:  0.29098473292709004


In [13]:
results_tt.sort_values(by="r2", ascending=False).head()

Unnamed: 0,loss,r2,learning_rate,epochs,batch_size,virtual_batch_size,num_heads,num_attn_blocks,transformer_activation
9,78.043621,0.290985,0.01,512,100,128,8,6,ReGLU
10,78.043621,0.290985,0.01,512,100,128,8,6,GEGLU
11,78.043621,0.290985,0.01,512,100,128,8,6,SwiGLU
3,78.047298,0.290951,0.01,256,100,128,8,6,ReGLU
4,78.047298,0.290951,0.01,256,100,128,8,6,GEGLU


In [14]:
if preproc == True:
    results_tt.sort_values(by="r2", ascending=False).to_csv("3_TF-preproc_output.csv")
else:
    results_tt.sort_values(by="r2", ascending=False).to_csv("3_TF-raw_output.csv")

In [15]:
import pickle

file = open("3_" + "TF" + "_preproc_" + ".save", "wb")
pickle.dump(pipeline, file)

file = open("3_" + "TF" + "_model_" + ".save", "wb")
pickle.dump(best_model_tt, file)
file.close()