In [None]:
import torch
import copy
import os
import shutil
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter  # log writer to visualize the loss functions

# Seed shared by the train/val/test splits and by fix_random()
random_state = 42

# Matplotlib style sheet applied to the figures drawn in this notebook
plt.style.use("seaborn-v0_8")

### Set seeds


In [None]:
# For reproducibility, fix all the seeds

def fix_random(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device) and switches cuDNN to deterministic mode.

    Args:
        seed: the seed to use.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU (manual_seed only seeds the current one);
    # it is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

    # No cuDNN autotuning + deterministic kernels: slower, but repeatable.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

Da valutare come vogliamo disporre le directory in base al preprocess che facciamo. Ho lasciato una versione che differenzia in base al preproc per averla già pronta da modificare nel caso serva


In [None]:
# Rick's version: one output tree holding checkpoints and TensorBoard runs
base_dir = "NN_outputs/"
model_dir = base_dir + "models/"
runs_dir = base_dir + "runs/"

# Remove whatever a previous execution left behind ...
for stale_dir in (model_dir, runs_dir):
    shutil.rmtree(stale_dir, ignore_errors=True)
# ... then recreate both folders empty
for fresh_dir in (model_dir, runs_dir):
    os.makedirs(fresh_dir, exist_ok=True)

######################
# Alternative layout split by preprocessing (disabled, kept for reference):
# if not os.path.exists('results'):
#     os.mkdir('results')

# if not os.path.exists('results/pca'):
#     os.mkdir('results/pca')

# if not os.path.exists('results/no_pca'):
#     os.mkdir('results/no_pca')

#### Data Layer


In [None]:
class MyDataset(Dataset):
    """In-memory dataset holding a feature matrix and integer labels.

    ``X`` and ``y`` may be NumPy arrays, nested sequences or pandas objects.
    They are converted with ``np.asarray`` first, so a DataFrame/Series is
    stored by row position and its index is ignored.

    Attributes:
        X: float32 tensor built from ``X`` (expected to be 2-D).
        y: int64 tensor built from ``y``.
        num_features: size of the second dimension of ``X``.
        num_classes: number of distinct values found in ``y``.
    """

    # Tensor dtypes reference: https://pytorch.org/docs/stable/tensors.html
    def __init__(self, X, y):
        # Go through NumPy so that pandas inputs are handled positionally
        # instead of relying on the legacy FloatTensor/LongTensor constructors.
        features = np.asarray(X, dtype=np.float32)
        labels = np.asarray(y)

        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

        # Useful attributes
        self.num_features = self.X.shape[1]
        self.num_classes = len(np.unique(labels))

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for a single index or a batch of indices."""
        return self.X[idx, :], self.y[idx]

### Defining the neural network architecture


In [None]:
# Neural Network class
# Extend the abstract class "Module"

class NeuralNetwork(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()

        # Kept as attributes so the layer sizes can be inspected later
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Layers, created in input -> output order
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

        # Non-linearity applied between the two linear layers
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the second linear layer for input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

    def _get_name(self):
        """Return the model name; the notebook also uses it for run and checkpoint names."""
        return "FeedForward"

### Training Function

In [None]:
# Function for the training process

def train_model(
    model: nn.Module,  # instance of class to train
    criterion,  # instance of loss function
    optimizer,  # instance of optimizer
    epochs,  # maximum number of epochs
    train_loader: DataLoader,
    val_loader: DataLoader,
    device,  # to train on
    log_writer,
    log_name,
    patience=None,  # early-stopping patience in epochs; None disables it
):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    After every epoch the model is evaluated on ``val_loader`` through
    ``test_model``; whenever the validation loss improves, the state dict is
    saved to ``model_dir + log_name``. The per-batch training loss
    ("Loss/train") and per-epoch validation loss ("Loss/val") are written to
    ``log_writer``.

    Args:
        model: network to train (updated in place).
        criterion: loss callable invoked as ``criterion(predictions, targets)``.
        optimizer: optimizer bound to ``model``'s parameters.
        epochs: maximum number of epochs to run.
        train_loader: batches used for the gradient updates.
        val_loader: batches used for the per-epoch validation.
        device: device the batches are moved to.
        log_writer: object exposing ``add_scalar(tag, value, step)``.
        log_name: checkpoint file name, appended to ``model_dir``.
        patience: if given, stop after this many consecutive epochs without a
            new best validation loss. ``None`` (default) never stops early.

    Returns:
        The best validation loss observed, or ``float("inf")`` if no epoch
        produced a finite improvement (in which case no checkpoint exists).
    """
    n_iter = 0
    best_valid_loss = float("inf")  # initialized to worst possible value
    epochs_no_improve = 0

    # EPOCHS
    for epoch in range(epochs):
        model.train()  # activate training mode (for BatchNorm or Dropout)

        # BATCHES
        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)  # move batch to cpu/gpu

            optimizer.zero_grad()  # reset the gradients of the previous step

            # Forward pass
            y_pred = model(data)

            # Compute Loss
            loss = criterion(y_pred, targets)
            log_writer.add_scalar("Loss/train", loss.item(), n_iter)  # one point per batch

            # Backward pass
            loss.backward()
            optimizer.step()

            n_iter += 1

        # Validation: no gradients are needed, so avoid building autograd graphs
        with torch.no_grad():
            y_test, _, y_pred = test_model(model, val_loader, device)
            loss_val = criterion(y_pred, y_test).item()
        log_writer.add_scalar("Loss/val", loss_val, epoch)  # one point per epoch

        # Save the model with best loss through the epochs
        if loss_val < best_valid_loss:
            best_valid_loss = loss_val
            epochs_no_improve = 0
            torch.save(model.state_dict(), model_dir + log_name)
        elif patience is not None:
            # Early stopping (opt-in): count consecutive epochs without improvement
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping!")
                break

    return best_valid_loss
            
### best model me lo salvo in qualche modo

### Test Function

In [None]:
def test_model(model: nn.Module, data_loader: DataLoader, device) -> tuple[Tensor, Tensor, Tensor]:
    """return:
    - y_test - true lables
    - y_pred_c - has 1 column, where each element is the predicted lable with bigger probability among the "c" predicted
    - y_pred - has "c" columns as the number of classes of the test set
    """
    model.eval()  # activate evaluation mode (for BatchNorm or Dropout)

    y_pred = []
    y_test = []

    # ## oppure
    # y_pred = torch.tensor([]).to(device)
    # y_true = torch.tensor([]).to(device)

    for data, targets in data_loader:
        data, targets = data.to(device), targets.to(device)  # move data and targets to cpu/gpu
        
        y_pred.append(model(data))  # accumulate predictions
        y_test.append(targets)  # accumulate labels

    y_test = torch.stack(y_test).squeeze()  # it's one column (each row is a different sample)
    y_pred = torch.stack(
        y_pred
    ).squeeze()  # there are "c" columns as the number of classes. Each column is the probability (as float number) to that class (each row is a different sample)
    y_pred_c = y_pred.argmax(
        dim=1, keepdim=True
    ).squeeze()  # return max position of prediction array: that is the class I will associate with the sample
    return y_test, y_pred_c, y_pred

#### Device

In [None]:
# PyTorch device: use CUDA when available, otherwise MPS, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Using {} device".format(device))

---

## Train

### Hyperparameters

In [None]:
# Hidden layer sizes to evaluate
hidden_size = [128, 256, 512]  # 128
# alternative:
# hidden_sizes =  [256, 512, 1024]

# Batch sizes to evaluate
batch_size = [32, 64]  # 32
# alternative:
# batch_size = [8,16,32]

# NOTE: we said we are not going to use dropout
dropout_p = [0.2, 0.3]  # 0.2

depth = [3, 4, 5]

# Learning rates to evaluate
learning_rate = [0.001, 0.1]
# alternatives:
# learning_rate = [0.01, 0.001]
# learning_rate = [0.1, 0.01, 0.001]

num_epochs = 200

# product() returns a one-shot iterator: materialise it once, so that counting
# the combinations does not leave an exhausted iterator for the grid search.
hyperparams = list(product(hidden_size, batch_size, dropout_p, depth, learning_rate))
print("Number of combinations:", len(hyperparams))

############# what is the difference? #######################
# The variant below gets the same count by multiplying the list lengths,
# without consuming the iterator returned by product().
# hyperparameters = itertools.product(hidden_sizes, depth, nums_epochs, batch_sizes, learning_rate, step_size_lr_decay, momentum)
# n_comb = len(hidden_sizes)*len(depth)*len(nums_epochs)*len(batch_sizes)*len(learning_rate)*len(step_size_lr_decay)*len(momentum)
# print (f'Number of hyperparameter combinations: {n_comb}')

### DATA LOADING


In [None]:
# Read the training table from the working directory
df = pd.read_csv("train.csv")

# Report its size
num_rows = df.shape[0]
num_cols = df.shape[1]
print("Rows: ", num_rows)
print("Columns: ", num_cols)

In [None]:
# Report rows containing nulls and exact duplicate rows
print("Null rows:", df.shape[0] - df.dropna().shape[0])
print("Duplicated rows:", df.duplicated().sum())

# Drop the duplicates and rebuild a gap-free 0..n-1 index: rows are later
# selected with positional indices, and an index with holes would make
# label-based lookups miss (or mismatch) rows.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

# First column is the target, the remaining columns are the features
df_X = df.iloc[:, 1:]
df_y = df.iloc[:, 0]
indices = np.arange(df_X.shape[0])  # positional row ids, used to split the data in train/val/test

# Separate indices into train/val/test
# "stratify=y" makes sure to keep the classes proportions on the dataset (useful on imbalanced classes)
train_idx, test_idx = train_test_split(indices, test_size=0.2, stratify=df_y, random_state=random_state)
# train_idx holds row positions, so the matching labels must be taken with
# .iloc; plain df_y[train_idx] would look the values up as index labels.
train_idx, val_idx = train_test_split(
    train_idx, test_size=0.2, stratify=df_y.iloc[train_idx], random_state=random_state
)

In [None]:
# TODO: DECIDE HOW TO PREPROCESS
# since they are all Gaussian
# we can normalise, batch-normalise,
# standardise + use a scaler that removes the outliers
# anything else?? to be decided

In [None]:
## TODO: loop over the batch sizes further down
# `batch_size` may still be the list of grid candidates, while DataLoader needs
# a single integer: use the first candidate until the grid loop is written.
train_batch_size = batch_size[0] if isinstance(batch_size, (list, tuple)) else batch_size

# DataLoaders
# Plain arrays are passed so that rows are addressed by position, which is
# what the train/val/test index arrays contain.
my_dataset = MyDataset(df_X.to_numpy(), df_y.to_numpy())

train_subset = Subset(my_dataset, train_idx)
train_loader = DataLoader(train_subset, batch_size=train_batch_size, shuffle=True)

val_subset = Subset(my_dataset, val_idx)
val_loader = DataLoader(val_subset, batch_size=1)

test_subset = Subset(my_dataset, test_idx)
test_loader = DataLoader(test_subset, batch_size=1)

#### Model, Criterion, Optimizer

In [None]:
fix_random(random_state)

# Settings for this single run. Dedicated names are used so that the grid
# lists (`hidden_size`, `learning_rate`) are neither overwritten nor handed to
# APIs that expect one number.
single_run_hidden_size = 32
single_run_lr = learning_rate[0] if isinstance(learning_rate, (list, tuple)) else learning_rate

model = NeuralNetwork(my_dataset.num_features, single_run_hidden_size, my_dataset.num_classes)
model.to(device)  # move the NN to device

# NOTE(review): CrossEntropyLoss expects integer class indices in
# [0, num_classes); confirm the labels in the first CSV column are encoded so.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=single_run_lr)

log_writer = SummaryWriter(runs_dir + model._get_name())  # Start tensorboard

### Run

Run Tensorboard from the command line:

> tensorboard --logdir NN_outputs/runs/


In [None]:
### TODO: loop over the hyperparameters and put the code of the cell below inside the loop

In [None]:
# Baseline: accuracy of the untrained network on the test split
y_test, y_pred_c, _ = test_model(model, test_loader, device)
acc = torch.eq(y_test, y_pred_c).float().sum() / y_test.shape[0]
print("Accuracy before training:", acc.cpu().numpy())


# Train; the best checkpoint is saved under the model's name
run_name = model._get_name()
train_model(
    model,
    criterion,
    optimizer,
    num_epochs,
    train_loader,
    val_loader,
    device,
    log_writer,
    run_name,
)


# Restore the best weights found during training
checkpoint_path = model_dir + run_name
model.load_state_dict(torch.load(checkpoint_path))
model.to(device)


# Accuracy of the restored model on the test split
y_test, y_pred_c, _ = test_model(model, test_loader, device)
acc = torch.eq(y_test, y_pred_c).float().sum() / y_test.shape[0]
print("Accuracy after training:", acc.cpu().numpy())

print(model)

# Flush pending events and close the tensorboard writer once the run is over
log_writer.flush()
log_writer.close()

### Plot results ?????

In [None]:
# `y_true` / `y_pred` were never defined at notebook level: take the test
# labels and predicted classes from the evaluation cell and move them to NumPy.
# NOTE(review): y_pred_c holds argmax class indices; confirm they are on the
# same scale as the labels before reading these plots as "years".
y_true_np = y_test.detach().cpu().numpy()
y_pred_np = y_pred_c.detach().cpu().numpy()

# Predicted vs true values
plt.figure(figsize=(10, 10))
# alpha = opacity of the scatter points, lowered so dense regions stay readable
sns.regplot(x=y_true_np, y=y_pred_np, scatter_kws={"alpha": 0.1})
plt.xlabel("True year")
plt.ylabel("Predicted year")
plt.title("Predicted vs True year")
plt.show()

# Distribution of predicted years
plt.figure(figsize=(10, 10))
# Label each curve explicitly so the legend cannot get out of sync with the
# plotting order (the positional legend had the two names swapped).
sns.kdeplot(y_true_np, label="True years")
sns.kdeplot(y_pred_np.flatten(), label="Predicted years")
plt.legend()
plt.xlabel("Predicted years")
plt.title("Distribution of predicted years")
plt.show()