In [None]:
import os
import copy
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter

random_state = 42

In [None]:
def fix_random(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # NumPy has its own global generator; without this call any NumPy-based
    # randomness stays unseeded and runs are not reproducible.
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU (not only the current one); silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and force deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all generators once, before any data splitting or model training below.
fix_random(random_state)

## Device

In [None]:
# Choose the PyTorch backend by priority: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Device: {}".format(device))

## Data Loading


In [None]:
# Read the training CSV from the working directory and report its dimensions.
df = pd.read_csv("train.csv")
num_rows = df.shape[0]
num_cols = df.shape[1]
print("Rows: ", num_rows)
print("Columns: ", num_cols)

In [None]:
# Data-quality report: rows containing at least one missing value, and exact
# duplicate rows. Only the duplicates are removed; rows with nulls are kept.
rows_with_nulls = df.isna().any(axis=1).sum()
duplicate_count = df.duplicated().sum()
print("Null rows:", rows_with_nulls)
print("Duplicated rows:", duplicate_count)
df.drop_duplicates(inplace=True)

In [None]:
# Kernel density estimate of the "Year" column over the de-duplicated dataset.
fig, ax = plt.subplots()
sns.kdeplot(data=df["Year"], fill=True, color="b", ax=ax)
ax.set_title("Year distribution")
plt.show()

### Train Test Split

In [None]:
# Hold out 30% of the rows as the test set. Stratifying on "Year" keeps the
# per-year proportions the same in both partitions (useful when some years are
# much more frequent than others).
feature_frame = df.drop(columns=["Year"])
year_labels = df["Year"]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    year_labels,
    test_size=0.3,
    stratify=year_labels,
    random_state=random_state,
)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training split with the 'majority' strategy,
# seeded with the notebook-wide random_state.
# NOTE(review): X_resampled / y_resampled are only inspected in the cells that
# follow; the preprocessing pipeline and the models are fitted on
# X_train / y_train — confirm that this is intended.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [None]:
# Compare the shape after under-sampling with the original training shape.
print(X_resampled.shape, X_train.shape)

In [None]:
# Kernel density estimate of the under-sampled training targets.
fig, ax = plt.subplots()
sns.kdeplot(y_resampled, fill=True, color="b", ax=ax)
ax.set_title("Year distribution after Under Sampling")
plt.show()

### Preprocessing 

In [None]:

from sklearn import preprocessing
from sklearn.covariance import OAS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline

# Preprocessing chain: min-max scale each feature, rescale every sample by its
# largest absolute value, then project with LDA (eigen solver, OAS covariance
# estimate) using "Year" as the class label.
pipeline = Pipeline(steps=[
    ("min-max", preprocessing.MinMaxScaler()),
    ("lmax", preprocessing.Normalizer(norm="max")),
    ("lda", LinearDiscriminantAnalysis(solver="eigen", shrinkage=None, covariance_estimator=OAS()))
])

# Fit on the training split only, so no test-set statistics leak into the
# transformation.
pipeline.fit(X_train, y_train)

# Transform both splits with the fitted pipeline.
X_train_arr = pipeline.transform(X_train)
X_test_arr = pipeline.transform(X_test)

# Give the projected components explicit string names: the frames later gain a
# string-labelled "Year" column, and mixed int/str labels are fragile when the
# column list is handed to the model's data configuration.
feature_names = [f"lda_{i}" for i in range(X_train_arr.shape[1])]

X_train_t = pd.DataFrame(X_train_arr, columns=feature_names)
X_test_t = pd.DataFrame(X_test_arr, columns=feature_names)

# Independent copies: the target column is added to train_t / test_t in a later
# cell, and that must never show up in the feature-only frames used for
# prediction. Wrapping a DataFrame in pd.DataFrame(...) does not guarantee
# separate storage on every pandas version.
train_t = X_train_t.copy()
test_t = X_test_t.copy()

In [None]:
# Re-number both target series from 0 so that they line up row-by-row with the
# transformed feature frames (which carry a fresh default index) when they are
# assigned as a column.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Attach the target to each frame so train_t / test_t hold features + "Year".
# Assignment aligns on the index, which is why y_train / y_test were reset.
train_t['Year'] = y_train
test_t['Year'] = y_test

## Config


In [None]:
# Regression target; every other column of train_t is a continuous input.
target = ['Year']
# Select the features by name rather than by position, so the list does not
# depend on where (or whether) the target column sits in train_t.
continous_cols = [col for col in train_t.columns if col not in target]

In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    TabNetModelConfig, 
    TabTransformerConfig
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [None]:
# Data schema: a single regression target and continuous inputs only.
data_config = DataConfig(
    target=target,
    continuous_cols=continous_cols,
    num_workers=0,
)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# Head without extra hidden layers (layers="" -> just the mapping to the
# output dimension), Kaiming initialisation.
linear_head = LinearHeadConfig(layers="", initialization="kaiming")
# The model config is given the head settings as a plain dict rather than the
# config object itself (OmegaConf doesn't accept objects).
head_config = vars(linear_head)

## Train

In [None]:
# Hyper-parameter grid for the TabNet search.
learning_rates = [0.005, 0.05]
batch_sizes = [256, 512]
n_epochs = [50, 100]

# Dimension of the prediction layer
n_d = [16]
# Dimension of the attention layer
n_a = [16]
# Number of successive steps in the network
n_steps = [3, 5]
# Coefficient for feature reusage in the masks. A value close to 1 makes mask
# selection least correlated between layers. Values range from 1.0 to 2.0.
gamma = [1.3, 1.5]

# Cartesian product of all axes, in the order the training loop unpacks them.
search_space = (learning_rates, batch_sizes, n_epochs, n_d, n_a, n_steps, gamma)
params = list(product(*search_space))

# The number of combinations is simply the size of the expanded grid.
comb = len(params)
print("Number of combinations: ", comb)

### TabNet

In [None]:
# Grid search over every TabNet configuration in `params`. Each run trains a
# fresh model on train_t, scores it on the test split, and appends one row to
# `results`; the run with the lowest MSE is kept in best_model / best_params.
# NOTE(review): the winning configuration is selected on the test split, so the
# reported test metrics are optimistic — consider a separate validation split.
best_mse = float("inf")
best_model = None
best_params = None
results = pd.DataFrame(columns=['loss', 'r2', 'learning_rate', 'batch_size', 'epochs', 'n_d', 'n_a', 'n_steps', 'gamma'])

# predict() returns a DataFrame; the regression output lives in the
# "<target>_prediction" column. Selecting it explicitly keeps the metrics
# correct even when the returned frame also carries the input features.
prediction_col = f"{target[0]}_prediction"

# enumerate() supplies the run counter, and the per-run values carry an `_i`
# suffix so they do not overwrite the grid lists n_d / n_a / n_steps / gamma
# defined together with the search space.
for run_idx, (learning_rate, batch_size, epochs, n_d_i, n_a_i, n_steps_i, gamma_i) in enumerate(params, start=1):
    print(f"\nIteration: {run_idx} of {len(params)}")
    print(
        f"Configuration: learning_rate={learning_rate}, batch_size={batch_size}, n_epochs={epochs}, n_d={n_d_i}, n_a={n_a_i}, n_steps={n_steps_i}, gamma={gamma_i}"
    )
    trainer_config = TrainerConfig(
        batch_size=batch_size,
        max_epochs=epochs,
        early_stopping_patience=10,
    )

    model_config = TabNetModelConfig(
        task="regression",
        learning_rate=learning_rate,
        head="LinearHead",  # Linear Head
        head_config=head_config,  # Linear Head Config
        n_d=n_d_i,
        n_a=n_a_i,
        n_steps=n_steps_i,
        gamma=gamma_i,
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )

    tabular_model.fit(train=train_t)
    tabular_model.evaluate(test_t)

    # Score on the prediction column only, as a 1-D series aligned with y_test.
    predictions = tabular_model.predict(X_test_t)
    y_pred = predictions[prediction_col]
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("MSE: ", mse)
    print("R2: ", r2)

    if mse < best_mse:
        best_mse = mse
        # A new TabularModel is built on every iteration, so keeping a
        # reference is enough; no deep copy of the model/trainer is needed.
        best_model = tabular_model
        best_params = (learning_rate, batch_size, epochs, n_d_i, n_a_i, n_steps_i, gamma_i)
        print("Best model updated")

    # Record the run immediately so partial results survive an interruption.
    results.loc[len(results)] = [mse, r2, learning_rate, batch_size, epochs, n_d_i, n_a_i, n_steps_i, gamma_i]

In [None]:
# Show the five TabNet runs with the highest R2.
results.sort_values(by='r2', ascending=False).head()

In [None]:
# Persist all TabNet runs, best R2 first, to out.csv in the working directory.
results.sort_values(by='r2', ascending=False).to_csv('out.csv')

### TabTransformer

In [None]:
# model_config = TabTransformerConfig(
#     task="regression",
#     learning_rate = 1e-3,
#     head = "LinearHead", #Linear Head
#     head_config = head_config, # Linear Head Config
# )

# tabular_model = TabularModel(
#     data_config=data_config,
#     model_config=model_config,
#     optimizer_config=optimizer_config,
#     trainer_config=trainer_config,
# )

# tabular_model.fit(train=train_t)
# tabular_model.evaluate(test_t)

In [None]:
# y_pred= tabular_model.predict(X_test_t)
# mse= mean_squared_error(y_test, y_pred)
# r2= r2_score(y_test, y_pred)

# print("MSE: ", mse)
# print("R2: ", r2)

In [None]:
# Hyper-parameter grid for the TabTransformer search.
learning_rates = [0.001, 0.005]
batch_sizes = [256, 512]
n_epochs = [50, 100]
# NOTE(review): the TabTransformer training loop in this notebook only records
# virtual_batch_size in its results table; it is never passed to a config, so
# every (learning_rate, batch_size, epochs) setting is trained twice — confirm
# whether this axis is needed.
virtual_batch_sizes = [128, 256]

# Cartesian product of all axes, in the order the training loop unpacks them.
params = list(product(learning_rates, batch_sizes, n_epochs, virtual_batch_sizes))
comb = len(params)
print("Number of combinations: ", comb)

In [None]:
# Grid search over every TabTransformer configuration in `params`. Each run
# trains a fresh model on train_t, scores it on the test split, and appends one
# row to `results_tt`; the run with the lowest MSE is kept in
# best_model_tt / best_params_tt.
# NOTE(review): the winning configuration is selected on the test split, so the
# reported test metrics are optimistic — consider a separate validation split.
best_mse_tt = float("inf")
best_model_tt = None
best_params_tt = None
results_tt = pd.DataFrame(columns=['loss', 'r2', 'learning_rate', 'epochs', 'batch_size', 'virtual_batch_size'])

# predict() returns a DataFrame; the regression output lives in the
# "<target>_prediction" column. Selecting it explicitly keeps the metrics
# correct even when the returned frame also carries the input features.
prediction_col = f"{target[0]}_prediction"

# enumerate() supplies the run counter (no global that shadows a builtin).
# NOTE(review): virtual_batch_size is unpacked and logged but not passed to
# TrainerConfig or TabTransformerConfig, so runs that differ only in this value
# train the same configuration — confirm whether it should be dropped.
for run_idx, (learning_rate, batch_size, epochs, virtual_batch_size) in enumerate(params, start=1):
    print(f"\nIteration: {run_idx} of {len(params)}")
    trainer_config = TrainerConfig(
        batch_size=batch_size,
        max_epochs=epochs,
        early_stopping_patience=5,
        load_best=True
    )

    model_config = TabTransformerConfig(
        task="regression",
        learning_rate=learning_rate,
        head="LinearHead",  # Linear Head
        head_config=head_config,  # Linear Head Config
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        verbose=False
    )

    tabular_model.fit(train=train_t)
    tabular_model.evaluate(test_t)

    # Score on the prediction column only, as a 1-D series aligned with y_test.
    predictions = tabular_model.predict(X_test_t)
    y_pred = predictions[prediction_col]
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("MSE: ", mse)
    print("R2: ", r2)

    if mse < best_mse_tt:
        best_mse_tt = mse
        # A new TabularModel is built on every iteration, so keeping a
        # reference is enough; no deep copy of the model/trainer is needed.
        best_model_tt = tabular_model
        best_params_tt = (learning_rate, batch_size, virtual_batch_size, epochs)
        print("Best model updated")

    # Record the run immediately so partial results survive an interruption.
    results_tt.loc[len(results_tt)] = [mse, r2, learning_rate, epochs, batch_size, virtual_batch_size]

In [None]:
# Show the five TabTransformer runs with the highest R2.
results_tt.sort_values(by='r2', ascending=False).head()

In [None]:
# Persist all TabTransformer runs, best R2 first, to out_tab_transformers.csv.
results_tt.sort_values(by='r2', ascending=False).to_csv('out_tab_transformers.csv')

In [20]:
# NOTE(review): scratch computation with a hard-coded value — presumably the
# RMSE corresponding to an MSE of about 70; confirm, derive it from the results
# table, or remove the cell.
np.sqrt(70)

8.366600265340756