In [1]:
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter

# Single seed shared by the RNG-seeding helper and by both train_test_split calls,
# so the whole notebook is driven by one value.
random_state = 42

In [2]:
def fix_random(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus every CUDA device), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NumPy's global generator must be seeded too; otherwise any NumPy-based
    # randomness that is not given an explicit random_state stays unseeded.
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all CUDA devices; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Disable the cuDNN auto-tuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed the generators once, up front, before any splitting or training cell runs.
fix_random(random_state)

## Device

In [3]:
# Pick the PyTorch backend: CUDA when present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Device: {}".format(device))

Device: cpu


## Data Loading

In [4]:
# Load the raw dataset and report its dimensions.
df = pd.read_csv("train.csv")
num_rows = len(df)
num_cols = len(df.columns)
print("Rows: ", num_rows)
print("Columns: ", num_cols)

Rows:  252175
Columns:  91


In [5]:
# Report rows containing at least one missing value, and exact duplicate rows.
print("Null rows:", len(df) - len(df.dropna()))
print("Duplicated rows:", df.duplicated().sum())
# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

Null rows: 0
Duplicated rows: 52


## Train / Validation / Test Split

In [6]:
# Split the rows into train / validation / test.
# First hold out 30% of the data, then cut a third of that hold-out off as the
# final test set: 70% train, 20% validation, 10% test.
# Both splits are stratified on "Year" so each subset keeps the same
# distribution of target values as the full dataset.
train, test = train_test_split(df, test_size=0.3, stratify=df["Year"], random_state=random_state)
val, test = train_test_split(test, test_size=(1 / 3), stratify=test["Year"], random_state=random_state)

# Separate the features from the "Year" target for every subset.
X_train, y_train = train.drop(columns=["Year"]), train["Year"]
X_val, y_val = val.drop(columns=["Year"]), val["Year"]
X_test, y_test = test.drop(columns=["Year"]), test["Year"]

## Preprocessing

In [7]:
from sklearn import preprocessing
from sklearn.covariance import OAS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline

# Preprocessing chain, applied in this order:
#   1. "min-max": rescale each feature column with MinMaxScaler
#   2. "lmax":    divide each sample row by its max-norm
#   3. "lda":     supervised LDA projection (eigen solver, OAS covariance estimate)
pipeline = Pipeline(
    [
        ("min-max", preprocessing.MinMaxScaler()),
        ("lmax", preprocessing.Normalizer(norm="max")),
        (
            "lda",
            LinearDiscriminantAnalysis(
                solver="eigen",
                shrinkage=None,
                covariance_estimator=OAS(),
            ),
        ),
    ]
)

# pipeline = Pipeline(
#     steps=[
#         ("std", preprocessing.StandardScaler()),
#         ("l2", preprocessing.Normalizer(norm="l2")),
#     ]
# )

# Fit the pipeline on the training split only; the validation and test splits
# are merely transformed with the already-fitted steps.
pipeline.fit(X_train, y_train)

# Project every split and wrap the result in a DataFrame.
# Assumes transform() returns a plain array (sklearn's default output), so each
# frame gets a fresh positional index and integer column labels.
X_train_t = pd.DataFrame(pipeline.transform(X_train))
X_test_t = pd.DataFrame(pipeline.transform(X_test))
X_val_t = pd.DataFrame(pipeline.transform(X_val))

# Frames that will later receive the target column. Take explicit copies:
# pd.DataFrame(existing_frame) does not copy by default, so on some pandas
# versions a column added to the wrapper can also appear on the feature-only
# frame it was built from.
train_t = X_train_t.copy()
test_t = X_test_t.copy()
val_t = X_val_t.copy()

# X_train_t = pd.DataFrame(X_train).reset_index()
# X_test_t = pd.DataFrame(X_test).reset_index()

# train_t = pd.DataFrame(X_train).reset_index()
# test_t = pd.DataFrame(X_test).reset_index()

In [8]:
# Give the targets a fresh 0..n-1 index (discarding the old one) so they line up
# positionally with the transformed feature frames.
y_train, y_test, y_val = (
    labels.reset_index(drop=True) for labels in (y_train, y_test, y_val)
)

In [9]:
# Append the target as the last column of each transformed frame.
for _frame, _labels in ((train_t, y_train), (test_t, y_test), (val_t, y_val)):
    _frame["Year"] = _labels

## Config

In [10]:
# Name of the regression target column.
target = ["Year"]
# Every column except the last one, which holds the target.
continous_cols = train_t.columns[:-1].tolist()

In [11]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig, TabTransformerConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [12]:
# Data: a single target, all remaining columns treated as continuous features.
data_config = DataConfig(target=target, continuous_cols=continous_cols, num_workers=15)

# Training loop settings.
trainer_config = TrainerConfig(
    batch_size=256,
    max_epochs=150,
    early_stopping_patience=10,
    load_best=True,
)

# Learning-rate schedule: ReduceLROnPlateau with an absolute threshold of 1.
optimizer_config = OptimizerConfig(
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params=dict(patience=10, threshold=1, threshold_mode="abs"),
)

# Output head: no extra hidden layers, just the mapping layer to output_dim.
# Converted to a plain dict because OmegaConf doesn't accept objects.
head_config = vars(
    LinearHeadConfig(
        layers="",
        # dropout=0.2,
        initialization="kaiming",
    )
)

## Train
### TabNet

In [13]:
# TabNet regressor using the linear head configured earlier.
model_config = TabNetModelConfig(
    task="regression",
    head="LinearHead",
    head_config=head_config,
    learning_rate=0.005,
    n_d=64,
    n_a=16,
    gamma=1.5,
)

# Bundle data, model, optimizer and trainer settings into one model object.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train on the training frame, with the validation frame passed for validation.
tabular_model.fit(train=train_t, validation=val_t)

# Last expression of the cell: the test-set metrics become the cell output.
tabular_model.evaluate(test_t)

Seed set to 42


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/home/riccardo/.local/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/riccardo/Data-Analytics-Project/Train_Module/saved_models exists and is not empty.


Output()

Output()

[{'test_loss': 81.02619934082031,
  'test_mean_squared_error': 81.02619934082031}]

In [14]:
# Predict on the transformed test features and score against the true targets.
y_pred = tabular_model.predict(X_test_t)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("MSE: ", mse)
print("R2: ", r2)

MSE:  81.02620947750587
R2:  0.263888337281336


### TabTransformer