# Pytorch classifier notebook

V1 : only 1 split. First implementation  
All folds V1 : with all folds  
All folds V2 : add activation stats plot  
All folds V2.1 : back to  best MLP found so far, and backport fix of activation layers stats. Add weight decay and scheduler (fit one cycle) code

All folds autoencoder MLP V1

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.utils as utils

from torch.utils.tensorboard import SummaryWriter
import io
import PIL.Image

import datetime

torch.set_printoptions(edgeitems=2)
torch.manual_seed(42)

DATASET_INPUT_FILE = 'train.csv'

#FEATURES_LIST_TOTRAIN = ['feature_'+str(i) for i in range(130)] + ['cross_41_42_43', 'cross_1_2']
FEATURES_LIST_TOTRAIN = ['feature_'+str(i) for i in range(130)]

# For custom non-overlaped folds generation
TRAIN_PERCENT = 0.70  
TEST_PERCENT = 0.30

# If subsplit of training set : percentage of second training set  
TRAIN1_PERCENT = 0.20  

pd.set_option('display.max_rows', 500)

In [2]:
# Deterministic Behavior
seed = 42
#os.environ['PYTHONHASHSEED'] = str(seed)
# Torch RNG
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Python RNG
np.random.seed(seed)
#random.seed(seed)
# CuDA Determinism
torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [65]:
#BATCH_SIZE = 50000
#BATCH_SIZE = 4096 # Gave once better results than 50000
#BATCH_SIZE = 2048

#BATCH_SIZE = 300000

#BATCH_SIZE = 4096
#BATCH_SIZE = 8192
#BATCH_SIZE = 32768
BATCH_SIZE = 8192
WEIGHT_DECAY = 1e-4 # Remettre à 1e-5
LEARNING_RATE = 1e-4

EARLY_STOPPING = True

NUM_EPOCHS = 1000
#NUM_EPOCHS = 36

MODEL_FILE = f'model_NN_allfolds_V1.pt'

BATCH_SIZE_AE = 40960
NUM_EPOCHS_AE = 1000
LEARNING_RATE_AE = 1e-3
WEIGHT_DECAY_AE = 1e-4
MODEL_FILE_AE = f'model_NN_AE_allfolds_V1.pt'

RETRAIN_MODEl_AE = False


MODEL_COMMENT_AE = f'All folds MLP autoenc, 2 layers 64 32, good model reloaded, batch size {BATCH_SIZE_AE}, lr={LEARNING_RATE_AE}, patience 5, standard scale, weight decay {WEIGHT_DECAY_AE}, dropout 0.5, with cross features, no scheduler, no std scale'
MODEL_COMMENT = f'All folds MLP with autoenc (decoded features), 3 layers 130, 200 and 100, good model reloaded, batch size {BATCH_SIZE}, lr={LEARNING_RATE}, patience 5, standard scale, weight decay {WEIGHT_DECAY}, 0.7 dropout, without cross features, no scheduler, no std scale'

In [4]:
def pyStandardScale(tensor, mean, std):
    return((tensor - mean) / std)

In [5]:
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

from matplotlib.colors import ListedColormap
    
# this is code slightly modified from the sklearn docs here:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""
    
    cmap_cv = plt.cm.coolwarm

    jet = plt.cm.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    _ = np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax


def plot_cv_indices_custom(cv_custom, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""
    
    cmap_cv = plt.cm.coolwarm

    jet = plt.cm.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    _ = np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv_custom):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [6]:
# This function accounts for variable instance counts in each split by dividing utility_pi by number of instances (but this has been removed)
# It also does some copy of dataframe to prevent memory overwrite
def utility_function(df_test, df_test_predictions):
    df_test_copy = df_test.copy(deep=True)
    df_test_copy.loc[:, 'utility_pj'] = df_test_copy['weight'] * df_test_copy['resp'] * df_test_predictions
    #df_test_utility_pi = df_test_copy.groupby('date')['utility_pj'].sum() / df_test_copy.groupby('date')['utility_pj'].count()
    df_test_utility_pi = df_test_copy.groupby('date')['utility_pj'].sum()

    nb_unique_dates = df_test_utility_pi.shape[0]
    t = (df_test_utility_pi.sum() / np.sqrt(df_test_utility_pi.pow(2).sum())) * (np.sqrt(250 / np.abs(nb_unique_dates)))
    u = min(max(t, 0), 6) * df_test_utility_pi.sum()
    del df_test_copy
    
    return(u)

In [7]:
from bisect import bisect_left

# The aim of this function is to return closest date from an index
# So that split indices correspond to start or end of a new day
# myList contains list of instances that correspond to start of a new da

def take_closest(myList, myNumber):
    """
    Assumes myList is sorted. Returns closest value to myNumber.

    If two numbers are equally close, return the smallest number.
    """
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]
    before = myList[pos - 1]
    after = myList[pos]
    if after - myNumber < myNumber - before:
       return after
    else:
       return before

In [8]:
class SaveOutputActivationStats:
    def __init__(self):
        self.outputs = []
        
    def __call__(self, module, module_in, module_out):
        #self.outputs.append(module_out)
        #print('Save output callback :')
        #print(module)
        #print({'mean': module_out.mean().item(), 'std': module_out.std().item(),'near_zero': (module_out<=0.05).long().sum().item()/module_out.numel()})
        self.outputs.append({'mean': module_out.mean().item(), 'std': module_out.std().item(),'near_zero': (module_out<=0.05).long().sum().item()/module_out.numel()})
        
    def clear(self):
        self.outputs = []

In [9]:
#fig, ax = plt.subplots(1, 1)
#
#plot_cv_indices(cv, df.loc[:, FEATURES_LIST_TOTRAIN], (df['resp'] > 0), df['date'], 
#                         ax, 5, lw=20);

In [10]:
torch.device

torch.device

In [11]:
torch.cuda.is_available()

True

In [12]:
torch.cuda.current_device()

0

In [13]:
torch.cuda.get_device_name(0)

'GeForce RTX 3090'

# Load data

In [14]:
# Load data
    
df = pd.read_csv(DATASET_INPUT_FILE)
df['resp_positive'] = ((df['resp'])>0)*1  # Target to predict

print('Data loaded')


Data loaded


# Feature engineering

In [15]:
#df['cross_41_42_43'] = df['feature_41'] + df['feature_42'] + df['feature_43']

In [16]:
#df['cross_1_2'] = df['feature_1'] / (df['feature_2'] + 1e-5)

# Non overlap fold generation

In [17]:
date_indexes_list = df.groupby('date')['ts_id'].first().to_list()

In [18]:
base_train_split_size = int((df.shape[0] // 5) * TRAIN_PERCENT)

In [19]:
base_test_split_size = int((df.shape[0] // 5) * TEST_PERCENT)

In [20]:
train_split_start_indexes = [take_closest(date_indexes_list, (base_train_split_size + base_test_split_size)*fold_indice) for fold_indice in range(5)]

In [21]:
#test_split_start_indexes = [take_closest(date_indexes_list, (base_train_split_size + base_test_split_size)*fold_indice) for fold_indice in range(5)]

In [22]:
train_split_start_indexes

[0, 477711, 958233, 1435933, 1913985]

In [23]:
df.shape[0] - 1

2390490

In [24]:
# We'll have 5 folds of 3 subsets each (2 training sets and 1 test set per fold)
# (1st training set of each fold will be used for 1st model, ie auto encoder)

NB_FOLDS = 5
last_index = df.shape[0] - 1

cv_table = []

for fold_indice in range(NB_FOLDS):
    fold_train_start_index = train_split_start_indexes[fold_indice]
    
    if (fold_indice == NB_FOLDS - 1):    
        nextfold_train_start_index = last_index
        
    else:
        nextfold_train_start_index = train_split_start_indexes[fold_indice + 1]
    
    fold_test_start_index = take_closest(date_indexes_list, int(TRAIN_PERCENT * (nextfold_train_start_index - fold_train_start_index) + fold_train_start_index  ))
    fold_train2_start_index = take_closest(date_indexes_list, int(TRAIN1_PERCENT * (fold_test_start_index - fold_train_start_index) + fold_train_start_index  ))
    
    cv_table.append(fold_train_start_index)
    cv_table.append(fold_train2_start_index)
    cv_table.append(fold_test_start_index)

In [25]:
cv_table.append(last_index)

In [26]:
cv_table

[0,
 66091,
 336609,
 477711,
 546983,
 815783,
 958233,
 1024471,
 1290282,
 1435933,
 1505171,
 1771833,
 1913985,
 1980610,
 2248510,
 2390490]

In [27]:
cv_table

[0,
 66091,
 336609,
 477711,
 546983,
 815783,
 958233,
 1024471,
 1290282,
 1435933,
 1505171,
 1771833,
 1913985,
 1980610,
 2248510,
 2390490]

In [28]:
NB_FOLDS

5

In [29]:
cv_tuples = []

for i in range(0, NB_FOLDS*3, 3):
    cv_tuples.append([df.loc[cv_table[i]:cv_table[i+1]-1, :].index.to_list(), df.loc[cv_table[i+1]:cv_table[i+2]-1, :].index.to_list(),
                      df.loc[cv_table[i+2]:cv_table[i+3]-1, :].index.to_list()])

In [30]:
len(cv_tuples[0][2])

141102

In [31]:
cv_tuples_generator = iter(cv_tuples)

In [32]:
#fig, ax = plt.subplots(1, 1)

#plot_cv_indices_custom(cv_tuples_generator, df.loc[:, FEATURES_LIST_TOTRAIN], (df['resp'] > 0), df['date'], 
#                         ax, 5, lw=20); 

#cv_tuples_generator = iter(cv_tuples)

In [33]:
# Size of training set :
#train_sets_table =  [cv_tuples[i][0] for i in range(5)]
#sum([len(train_set_table) for train_set_table in train_sets_table])

In [34]:
# Our old time series split (with overlap : required 1 neural network trained per split)
# But in this script it's not needed because we're training 1 unique network, with a different fold strategy (non overlaped)
#cv = PurgedGroupTimeSeriesSplit(
#    n_splits=5,
#    max_train_group_size=180,
#    group_gap=20,
#    max_test_group_size=60
#)

In [35]:
#train_index, test_index = next(cv.split(df, (df['resp'] > 0)*1, df['date']))

In [36]:
#(df.loc[train_index, 'resp'] > 0).astype(np.byte)

In [37]:
f_mean = df.loc[:, FEATURES_LIST_TOTRAIN].mean(axis=0)

In [38]:
f_mean.shape

(130,)

In [39]:
df.fillna(f_mean, inplace=True)

In [40]:
#print('Sum of model parameters:')
#[print(p.sum()) for p in model.parameters()]

In [41]:
#writer = SummaryWriter()

#writer.add_text('test', 'test:'  + str(model).replace('\n', '<BR>'))

#writer.flush()
#writer.close()

In [42]:
#str([p.numel() for p in model.parameters()])

In [43]:
folds_list = []

for fold, (train1_index, train2_index, test_index) in enumerate(cv_tuples_generator):
    folds_list.append((train1_index, train2_index, test_index))

In [44]:
folds_list_train1 = [folds_list[i][0] for i in range(5)]
folds_list_train1_flat = [folds_list_train1_item for sublist in folds_list_train1 for folds_list_train1_item in sublist]
folds_list_train1_unique = list(set(folds_list_train1_flat))

folds_list_train2 = [folds_list[i][1] for i in range(5)]
folds_list_train2_flat = [folds_list_train2_item for sublist in folds_list_train2 for folds_list_train2_item in sublist]
folds_list_train2_unique = list(set(folds_list_train2_flat))

In [45]:
len(folds_list_train1_unique)

337464

In [46]:
len(folds_list_train2_unique)

1339691

In [47]:
np.sum([len(folds_list_train1_item) for folds_list_train1_item in folds_list_train1])

337464

In [48]:
np.sum([len(folds_list_train2_item) for folds_list_train2_item in folds_list_train2])

1339691

In [49]:
len(folds_list_train1_flat)

337464

In [50]:
folds_list_test = [folds_list[i][2] for i in range(5)]
folds_list_test_flat = [folds_list_test_item for sublist in folds_list_test for folds_list_test_item in sublist]
folds_list_test_unique = set(folds_list_test_flat)

In [51]:
np.sum([len(folds_list_test_item) for folds_list_test_item in folds_list_test])

713335

In [52]:
len(folds_list_test_flat)

713335

In [53]:
len(folds_list_train1_flat) + len(folds_list_train2_flat) + len(folds_list_test_flat)

2390490

In [54]:
df.loc[folds_list_test[4], FEATURES_LIST_TOTRAIN].to_numpy().shape

(141980, 130)

In [55]:
df.loc[(folds_list_train1_unique + folds_list_train2_unique), FEATURES_LIST_TOTRAIN].to_numpy().mean(axis=0)

array([ 0.00880718,  0.39574469,  0.33059838,  0.00919269,  0.00341737,
       -0.00498373, -0.01455459,  0.05534631,  0.02511896,  0.2646538 ,
        0.16705702,  0.09489698,  0.04450428,  0.15251293,  0.07996651,
        0.22166532,  0.12827658,  0.12181565,  0.10958852,  0.29772963,
        0.26463247,  0.1881408 ,  0.17251055,  0.25474009,  0.23267903,
        0.29794049,  0.2685417 ,  0.13985131,  0.16285107,  0.33060734,
        0.34385913,  0.22684687,  0.25190658,  0.31637359,  0.3359838 ,
        0.35284181,  0.36773315,  0.02650339,  0.0186391 ,  0.04320553,
        0.05298663,  0.45417433,  0.37762691,  0.41617323,  0.43927675,
        0.48651095,  0.49207956,  0.36839975,  0.50144387,  0.54379067,
        0.53074971,  0.45673965,  0.05646874,  0.38900233,  0.37690587,
        0.77549302,  0.92466193,  0.78590429,  0.80847667,  0.89895923,
        0.55335406,  0.55554392,  0.55922873,  0.56139559,  0.44231975,
        0.61884351,  0.61715568,  0.59770334,  0.59814018,  0.37

In [56]:
len(list(set(folds_list_train1_unique + folds_list_train2_unique)))

1677155

In [57]:
torch.mean(torch.tensor(df.loc[(folds_list_train1_unique + folds_list_train2_unique), FEATURES_LIST_TOTRAIN].to_numpy(), device='cpu'), axis=0)

tensor([ 0.0088,  0.3957,  0.3306,  0.0092,  0.0034, -0.0050, -0.0146,  0.0553,
         0.0251,  0.2647,  0.1671,  0.0949,  0.0445,  0.1525,  0.0800,  0.2217,
         0.1283,  0.1218,  0.1096,  0.2977,  0.2646,  0.1881,  0.1725,  0.2547,
         0.2327,  0.2979,  0.2685,  0.1399,  0.1629,  0.3306,  0.3439,  0.2268,
         0.2519,  0.3164,  0.3360,  0.3528,  0.3677,  0.0265,  0.0186,  0.0432,
         0.0530,  0.4542,  0.3776,  0.4162,  0.4393,  0.4865,  0.4921,  0.3684,
         0.5014,  0.5438,  0.5307,  0.4567,  0.0565,  0.3890,  0.3769,  0.7755,
         0.9247,  0.7859,  0.8085,  0.8990,  0.5534,  0.5555,  0.5592,  0.5614,
         0.4423,  0.6188,  0.6172,  0.5977,  0.5981,  0.3774,  0.2389,  0.3080,
         0.0041, -0.0322, -0.0016, -0.0199, -0.0316, -0.0932, -0.0081, -0.0358,
        -0.0025, -0.0149, -0.0350, -0.1015,  0.3934,  0.5416,  0.3924,  0.4281,
         0.4976,  0.3994,  0.4332,  0.5235,  0.4224,  0.4221,  0.4348,  0.4547,
         0.3984,  0.5422,  0.3973,  0.42

In [58]:
torch.tensor(f_mean)

tensor([ 9.8386e-03,  3.8558e-01,  3.5769e-01,  8.9192e-03,  4.1501e-03,
        -3.7146e-03, -1.2589e-02,  5.1777e-02,  2.6828e-02,  2.4881e-01,
         1.8235e-01,  8.9122e-02,  4.9486e-02,  1.4311e-01,  8.9027e-02,
         2.1168e-01,  1.4630e-01,  1.2122e-01,  1.1358e-01,  2.9381e-01,
         2.6877e-01,  1.8691e-01,  1.7698e-01,  2.5244e-01,  2.3856e-01,
         2.9407e-01,  2.7318e-01,  1.3548e-01,  1.6088e-01,  3.2189e-01,
         3.4253e-01,  2.2056e-01,  2.5013e-01,  3.0822e-01,  3.3535e-01,
         3.4145e-01,  3.6583e-01,  2.9320e-02,  2.2892e-02,  4.0022e-02,
         5.0750e-02,  4.4505e-01,  3.6018e-01,  3.4603e-01,  4.1153e-01,
         4.3803e-01,  4.7612e-01,  3.4787e-01,  4.9963e-01,  5.6400e-01,
         5.1226e-01,  4.5739e-01,  4.5744e-02,  3.6270e-01,  3.5887e-01,
         6.5260e-01,  8.0495e-01,  6.6135e-01,  6.7981e-01,  7.6259e-01,
         5.5640e-01,  5.5817e-01,  5.4554e-01,  5.4678e-01,  4.3506e-01,
         6.0757e-01,  6.0850e-01,  5.9519e-01,  5.9

# Training auto encoder

In [80]:
torch.manual_seed(seed)  
    
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder,self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(len(FEATURES_LIST_TOTRAIN), 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        
        self.decoder = nn.Sequential(             
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, len(FEATURES_LIST_TOTRAIN)),
        )
        
    def forward(self,x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x        

model = AutoEncoder().double().to('cuda')
    
#print('Number of model parameters :')
#numel_list = [p.numel() for p in model.parameters()]
#sum(numel_list), numel_list
    
if (RETRAIN_MODEl_AE == False):
    model_AE = model
    model_AE.load_state_dict(torch.load(MODEL_FILE_AE,map_location=torch.device('cuda')))
    print('Model AE loaded')

else:    
    print('Training started')
    patience=5

    utility_scores = [None] * 5
    accuracy_scores = [None] * 5

    today = datetime.datetime.now()
    now_str = today.strftime("%b%d_%H-%M-%S")
    tensorboard_dir_AE = 'runs_AE/' + now_str
    writer = SummaryWriter(log_dir=tensorboard_dir_AE)

    ts_train = torch.tensor(df.loc[folds_list_train1_unique, FEATURES_LIST_TOTRAIN].to_numpy(), device='cuda')
    #ts_train_y = torch.tensor((df.loc[folds_list_train_unique, 'resp'] > 0).astype(np.byte).to_numpy(), device='cuda')

    # Normalize data
    ts_train_mean = torch.mean(ts_train, axis=0)
    ts_train_std = torch.std(ts_train, axis=0)
    #ts_train_mean = torch.tensor(f_mean)
    # If you want to use Standard scale : calculate mean from f_mean and std scale from whole dataset
    #ts_train = pyStandardScale(ts_train, ts_train_mean, ts_train_std)

    train_dataset = torch.utils.data.TensorDataset(ts_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE_AE, shuffle=True)

    ts_test = [None] * 5
    #ts_test_y = [None] * 5    
    test_dataset = [None] * 5
    test_loader = [None] * 5

    for fold_indice in range(5):
        ts_test[fold_indice] = torch.tensor(df.loc[folds_list_test[fold_indice], FEATURES_LIST_TOTRAIN].to_numpy(), device='cuda')
        #ts_test_y[fold_indice] = torch.tensor((df.loc[folds_list_test[fold_indice], 'resp'] > 0).astype(np.byte).to_numpy(), device='cuda')

        # Normalize
        #ts_test[fold_indice] = pyStandardScale(ts_test[fold_indice], ts_train_mean, ts_train_std)

        test_dataset[fold_indice] = torch.utils.data.TensorDataset(ts_test[fold_indice])
        test_loader[fold_indice] = torch.utils.data.DataLoader(test_dataset[fold_indice], batch_size=BATCH_SIZE_AE)

    loss_fn = nn.MSELoss().to('cuda')
    #optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) 
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE_AE, weight_decay=WEIGHT_DECAY_AE) 

    scheduler = None
    #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
    #                                                         max_lr=1e-4, epochs=NUM_EPOCHS, steps_per_epoch=len(train_loader))

    #model.eval()
    #start_accuracy = accuracy_score(ts_test_y.cpu().numpy(), (model(ts_test).squeeze() > 0.5).cpu().numpy())
    #start_utility_score = utility_function(df.loc[test_index], (model(ts_test).squeeze() > 0.5).cpu().numpy())
    #print('Start Validation Accuracy: {:.4f}'.format(start_accuracy))
    #print('Start Validation Utility: {:.4f}'.format(start_utility_score))

    Val_Loss = 0
    N_Samples = 0

    the_last_loss = 10000
    the_last_utility_score = 0
    the_last_accuracy = 0
    trigger_times=0
    early_stopping_met = False

    for epoch in range(NUM_EPOCHS_AE): 
        running_loss = 0.0        

        ### Call back to save activation stats (mean, std dev and near 0 values after activation functions)
        # Setting hook for activation layers stats

        hook_handles = []
        save_output_activation_stats = []

        for layer in model.modules():
            if ('activation' in str(type(layer))):
                save_output_activation_stats_1layer = SaveOutputActivationStats()
                handle = layer.register_forward_hook(save_output_activation_stats_1layer)
                save_output_activation_stats.append(save_output_activation_stats_1layer)
                hook_handles.append(handle)    

        model.train()

        for batch in train_loader:
            #inputs, labels = batch[0], batch[1]
            inputs = batch[0].to('cuda')
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                loss = loss_fn(outputs, inputs.double())
                loss.backward()
                optimizer.step()

                if scheduler:
                    scheduler.step()

        # update local train loss
            running_loss += loss.item() * inputs.size(0)

        # update global train loss
        epoch_loss = running_loss / len(train_loader.dataset)
        print('Epoch({}) - Training Loss: {:.4f}'.format(epoch, epoch_loss))

        writer.add_scalar(f"Global train/loss", epoch_loss, epoch)

        # Write activation stats graphs
        for layer_number,save_output_activation_stats_layer in enumerate(save_output_activation_stats):
            df_stats_layer = pd.DataFrame(save_output_activation_stats_layer.outputs)

            fig, ax = plt.subplots(1, 3, figsize=(25, 4))

            ax[0].set_title(f'Layer {layer_number} : Mean activation value', fontsize=16)
            ax[0].set_xlabel('Batch instances')
            ax[0].set_ylabel('Mean')
            ax[0].plot(range(df_stats_layer.shape[0]), df_stats_layer['mean'])

            ax[1].set_title(f'Layer {layer_number} : Std deviation activation value', fontsize=16)
            ax[1].set_xlabel('Batch instances')
            ax[1].set_ylabel('Standard deviation')
            ax[1].plot(range(df_stats_layer.shape[0]), df_stats_layer['std'])

            ax[2].set_title(f'Layer {layer_number} : Percentage of activation values near zero', fontsize=16)
            ax[2].set_xlabel('Batch instances')
            ax[2].set_ylabel('Percentage')
            ax[2].plot(range(df_stats_layer.shape[0]), df_stats_layer['near_zero']);

            plot_buf = io.BytesIO()
            plt.savefig(plot_buf, format='jpeg')
            plt.close()

            plot_buf.seek(0)
            image = PIL.Image.open(plot_buf)
            image = transforms.ToTensor()(image)
            writer.add_image("Train activation stats/Activation stats layer " + str(layer_number), image, epoch)

        # Validation 
        model.eval()

        vrunning_loss = [None] * 5
        num_samples = [None] * 5
        vepoch_loss_folds = [None] * 5
        vepoch_accuracy_folds = [None] * 5
        vepoch_utility_score_folds = [None] * 5

        for fold_indice in range(5):    
            vrunning_loss[fold_indice] = 0.0
            num_samples[fold_indice] = 0

            for batch in test_loader[fold_indice]:
                inputs = batch[0].to('cuda')

                optimizer.zero_grad()
                with torch.no_grad():
                    outputs = model(inputs)
                    loss = loss_fn(outputs, inputs.double())

                vrunning_loss[fold_indice] += loss.item() * inputs.size(0)
                num_samples[fold_indice] += inputs.size(0)

                vepoch_loss_folds[fold_indice] = vrunning_loss[fold_indice] / num_samples[fold_indice]

            print('Epoch({}) - Fold {} - Validation Loss : {:.4f}'.format(epoch, fold_indice, vepoch_loss_folds[fold_indice]))        

        # update epoch loss
        vepoch_loss = sum(vepoch_loss_folds) / len(vepoch_loss_folds)
        print('Epoch({}) - GLOBAL - Validation Loss: {:.4f}'.format(epoch, vepoch_loss))

        #print(f'Sum of model parameters ({epoch}):')
        #[print(p.sum()) for p in model.parameters()]

        writer.add_scalar("Global valid/Loss", vepoch_loss, epoch)

        for fold_indice in range(5):
            writer.add_scalar("Fold valid Loss/Loss fold "+str(fold_indice), vepoch_loss_folds[fold_indice], epoch)        

        writer.flush()

        # Check if Early Stopping

        if (vepoch_loss > the_last_loss):
            if (EARLY_STOPPING == True):
                trigger_times += 1

                print(f'Intermediate early stopping : vepoch_loss = {vepoch_loss:.4f}, the_last_loss={the_last_loss:.4f}')

                if trigger_times >= patience:
                    print('Meet Early stopping!')
                    early_stopping_met = True
                    ##torch.save(model.state_dict(), f'model_{fold}.pt')
                    break
        else:
            trigger_times = 0
            the_last_loss = vepoch_loss

            the_best_epoch = epoch

            # Save model for the best version so far
            print(f'Saving model corresponding to last_loss == {the_last_loss}')
            torch.save(model.state_dict(), MODEL_FILE_AE)

        print('\n')

    if (early_stopping_met == False):
        print("Didn't meet early stopping : saving final model")
        # Save model if don't meet early stopping
        torch.save(model.state_dict(), MODEL_FILE_AE)

    writer.add_text(f"Global valid/Loss", f"Best loss: {the_last_loss}", the_best_epoch)

    scores_results = {'Loss': the_last_loss, 'Loss folds': vepoch_loss_folds, 'Loss_std': np.std(vepoch_loss_folds)}

    writer.add_text('Final score', str(scores_results))
    writer.add_text('Batch size', str(BATCH_SIZE_AE))
    writer.add_text('Patience', str(patience))
    writer.add_text('Number of epochs', str(NUM_EPOCHS_AE))
    writer.add_text('Best epoch', str(the_best_epoch))
    writer.add_text('Number of parameters per layer', str([p.numel() for p in model.parameters()]))
    writer.add_text('Model architecture', str(model).replace('\n', '<BR>'))
    writer.add_text('Comment', MODEL_COMMENT_AE)

    writer.close()

    print('Training summary:')
    print(scores_results)

    model_AE = model
    model_AE.eval()
    print('Training ended')

Training started
Epoch(0) - Training Loss: 8.4552
Epoch(0) - Fold 0 - Validation Loss : 7.4755
Epoch(0) - Fold 1 - Validation Loss : 5.9368
Epoch(0) - Fold 2 - Validation Loss : 5.1554
Epoch(0) - Fold 3 - Validation Loss : 5.0550
Epoch(0) - Fold 4 - Validation Loss : 4.6616
Epoch(0) - GLOBAL - Validation Loss: 5.6569
Saving model corresponding to last_loss == 5.656853811882923


Epoch(1) - Training Loss: 8.1171
Epoch(1) - Fold 0 - Validation Loss : 6.9563
Epoch(1) - Fold 1 - Validation Loss : 5.5597
Epoch(1) - Fold 2 - Validation Loss : 4.7793
Epoch(1) - Fold 3 - Validation Loss : 4.6929
Epoch(1) - Fold 4 - Validation Loss : 4.3494
Epoch(1) - GLOBAL - Validation Loss: 5.2675
Saving model corresponding to last_loss == 5.267520049354945


Epoch(2) - Training Loss: 7.5170
Epoch(2) - Fold 0 - Validation Loss : 6.2390
Epoch(2) - Fold 1 - Validation Loss : 5.0236
Epoch(2) - Fold 2 - Validation Loss : 4.3018
Epoch(2) - Fold 3 - Validation Loss : 4.2382
Epoch(2) - Fold 4 - Validation Loss : 3.

In [63]:
df[FEATURES_LIST_TOTRAIN].mean().mean()

0.3105054402961592

In [64]:
1.08 / df[FEATURES_LIST_TOTRAIN].mean().mean()

3.4781999277368514

In [62]:
#ts_train.mean(axis=0)

In [63]:
#loss_fn(ts_train[0:5, :], model_AE(ts_train[0:5, :]))

In [64]:
#model_AE(ts_train[0:5, :])[0, 5]

In [65]:
#ts_train[0, 5]

In [66]:
#loss_fn(ts_train[50, 5], model_AE(ts_train[:, :])[50, 5])

# Training main model

In [83]:
print('Training started')
patience=5

utility_scores = [None] * 5
accuracy_scores = [None] * 5

writer = SummaryWriter()

ts_train = torch.tensor(df.loc[folds_list_train2_unique, FEATURES_LIST_TOTRAIN].to_numpy(), device='cuda')
ts_train_y = torch.tensor((df.loc[folds_list_train2_unique, 'resp'] > 0).astype(np.byte).to_numpy(), device='cuda')

# Normalize data
ts_train_mean = torch.mean(ts_train, axis=0)
ts_train_std = torch.std(ts_train, axis=0)
#ts_train = pyStandardScale(ts_train, ts_train_mean, ts_train_std)

train_dataset = torch.utils.data.TensorDataset(ts_train, ts_train_y)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) # pin_memory : VOIR RESULTAT

ts_test = [None] * 5
ts_test_y = [None] * 5    
test_dataset = [None] * 5
test_loader = [None] * 5

for fold_indice in range(5):
    ts_test[fold_indice] = torch.tensor(df.loc[folds_list_test[fold_indice], FEATURES_LIST_TOTRAIN].to_numpy(), device='cuda')
    ts_test_y[fold_indice] = torch.tensor((df.loc[folds_list_test[fold_indice], 'resp'] > 0).astype(np.byte).to_numpy(), device='cuda')

    # Normalize
    #ts_test[fold_indice] = pyStandardScale(ts_test[fold_indice], ts_train_mean, ts_train_std)
    
    test_dataset[fold_indice] = torch.utils.data.TensorDataset(ts_test[fold_indice], ts_test_y[fold_indice])
    test_loader[fold_indice] = torch.utils.data.DataLoader(test_dataset[fold_indice], batch_size=BATCH_SIZE)

torch.manual_seed(seed)
    
class MLP(nn.Module):
    def __init__(self, AEncoder):
    #def __init__(self):
        super(MLP,self).__init__()
        
        self.AEncoder = AEncoder
        
        #self.layer1 = nn.Linear(len(FEATURES_LIST_TOTRAIN) + self.AEncoder.decoder[0].in_features, 200)
        self.layer1 = nn.Linear(len(FEATURES_LIST_TOTRAIN) * 2, 200) # <= % near 0 élevé
        self.act1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.7)

        self.layer2 = nn.Linear(200, 100)
        self.act2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.7)
        
        self.layer3 = nn.Linear(100, 1)
        self.act3 = nn.Sigmoid()
        
    def encoder(self, x):
        self.AEncoder.eval()
        
        encoded = self.AEncoder.encoder(x)
        
        return encoded
    
    def encoder_decoder(self, x):
        self.AEncoder.eval()
        
        encoded_decoded = self.AEncoder(x)
        
        return encoded_decoded
    
    def forward(self,x):
        #x_encoded = self.encoder(x)
        x_decoded = self.encoder_decoder(x)
        
        #x = torch.cat((x, x_encoded), dim=1)
        x = torch.cat((x, x_decoded), dim=1)
        
        x = self.dropout1(self.act1(self.layer1(x)))
        x = self.dropout2(self.act2(self.layer2(x)))
        
        x = self.act3(self.layer3(x))
        
        return x        

#model = MLP(model_AE)
model = MLP(model_AE).double().to('cuda')

'''
model = nn.Sequential(
        #nn.Dropout(0.2),
        nn.Linear(len(FEATURES_LIST_TOTRAIN), 200),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.7),

        nn.Linear(200, 100),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.7),
    
        nn.Linear(100, 1),
        nn.Sigmoid(),
    ).double().to('cuda')
'''
   
print('Number of model parameters :')
numel_list = [p.numel() for p in model.parameters()]
sum(numel_list), numel_list

loss_fn = nn.BCELoss().to('cuda')
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) 

scheduler = None
#scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
#                                                         max_lr=1e-4, epochs=NUM_EPOCHS, steps_per_epoch=len(train_loader))

model.eval()
#start_accuracy = accuracy_score(ts_test_y.cpu().numpy(), (model(ts_test).squeeze() > 0.5).cpu().numpy())
#start_utility_score = utility_function(df.loc[test_index], (model(ts_test).squeeze() > 0.5).cpu().numpy())
#print('Start Validation Accuracy: {:.4f}'.format(start_accuracy))
#print('Start Validation Utility: {:.4f}'.format(start_utility_score))


Val_Loss = 0
N_Samples = 0

the_last_loss = 100
the_last_utility_score = 0
the_last_accuracy = 0
trigger_times=0
early_stopping_met = False

for epoch in range(NUM_EPOCHS): 
    running_loss = 0.0        
    
    ### Call back to save activation stats (mean, std dev and near 0 values after activation functions)
    # Setting hook for activation layers stats

    hook_handles = []
    save_output_activation_stats = []

    for layer in model.modules():
        if ('activation' in str(type(layer))):
            save_output_activation_stats_1layer = SaveOutputActivationStats()
            handle = layer.register_forward_hook(save_output_activation_stats_1layer)
            save_output_activation_stats.append(save_output_activation_stats_1layer)
            hook_handles.append(handle)    
            
    model.train()

    for batch in train_loader:
        #inputs, labels = batch[0], batch[1]
        inputs, labels = batch[0].to('cuda'), batch[1].to('cuda')
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = loss_fn(outputs, labels.unsqueeze(-1).double())
            loss.backward()
            optimizer.step()
            
            if scheduler:
                scheduler.step()

    # update local train loss
        running_loss += loss.item() * inputs.size(0)

    # update global train loss
    epoch_loss = running_loss / len(train_loader.dataset)
    print('Epoch({}) - Training Loss: {:.4f}'.format(epoch, epoch_loss))

    writer.add_scalar(f"Global train/loss", epoch_loss, epoch)

    # Write activation stats graphs
    for layer_number,save_output_activation_stats_layer in enumerate(save_output_activation_stats):
        df_stats_layer = pd.DataFrame(save_output_activation_stats_layer.outputs)
        
        if ((df_stats_layer.shape[0] == 0) and (df_stats_layer.shape[1] == 0)):
            print(f'Activation stats: No data returned for stats at layer {layer_number}')

        else:
            fig, ax = plt.subplots(1, 3, figsize=(25, 4))

            ax[0].set_title(f'Layer {layer_number} : Mean activation value', fontsize=16)
            ax[0].set_xlabel('Batch instances')
            ax[0].set_ylabel('Mean')
            ax[0].plot(range(df_stats_layer.shape[0]), df_stats_layer['mean'])

            ax[1].set_title(f'Layer {layer_number} : Std deviation activation value', fontsize=16)
            ax[1].set_xlabel('Batch instances')
            ax[1].set_ylabel('Standard deviation')
            ax[1].plot(range(df_stats_layer.shape[0]), df_stats_layer['std'])

            ax[2].set_title(f'Layer {layer_number} : Percentage of activation values near zero', fontsize=16)
            ax[2].set_xlabel('Batch instances')
            ax[2].set_ylabel('Percentage')
            ax[2].plot(range(df_stats_layer.shape[0]), df_stats_layer['near_zero']);

            plot_buf = io.BytesIO()
            plt.savefig(plot_buf, format='jpeg')
            plt.close()

            plot_buf.seek(0)
            image = PIL.Image.open(plot_buf)
            image = transforms.ToTensor()(image)
            writer.add_image("Train activation stats/Activation stats layer " + str(layer_number), image, epoch)
    
    # Validation 
    model.eval()

    vrunning_loss = [None] * 5
    num_samples = [None] * 5
    vepoch_loss_folds = [None] * 5
    vepoch_accuracy_folds = [None] * 5
    vepoch_utility_score_folds = [None] * 5
    
    for fold_indice in range(5):    
        vrunning_loss[fold_indice] = 0.0
        num_samples[fold_indice] = 0

        for batch in test_loader[fold_indice]:
            inputs, labels = batch[0].to('cuda'), batch[1].to('cuda')

            optimizer.zero_grad()
            with torch.no_grad():
                outputs = model(inputs)
                loss = loss_fn(outputs, labels.unsqueeze(-1).double())

            vrunning_loss[fold_indice] += loss.item() * inputs.size(0)
            num_samples[fold_indice] += labels.size(0)
            
            vepoch_loss_folds[fold_indice] = vrunning_loss[fold_indice] / num_samples[fold_indice]

        print('Epoch({}) - Fold {} - Validation Loss : {:.4f}'.format(epoch, fold_indice, vepoch_loss_folds[fold_indice]))

        model.eval()
        with torch.no_grad():
            vepoch_accuracy_folds[fold_indice] = accuracy_score(ts_test_y[fold_indice].cpu().numpy(), (model(ts_test[fold_indice]).squeeze() > 0.5).cpu().numpy())
            vepoch_utility_score_folds[fold_indice] = utility_function(df.loc[folds_list_test[fold_indice]], (model(ts_test[fold_indice]).squeeze() > 0.5).cpu().numpy())
        print('Epoch({}) - Fold {} - Validation Accuracy : {:.4f}'.format(epoch, fold_indice, vepoch_accuracy_folds[fold_indice]))
        print('Epoch({}) - Fold {} - Validation Utility score : {:.4f}'.format(epoch, fold_indice, vepoch_utility_score_folds[fold_indice]))
        
            
    # update epoch loss
    vepoch_loss = sum(vepoch_loss_folds) / len(vepoch_loss_folds)
    vepoch_accuracy = sum(vepoch_accuracy_folds) / len(vepoch_accuracy_folds)
    vepoch_utility_score = sum(vepoch_utility_score_folds) #/ len(vepoch_utility_score_folds)
    print('Epoch({}) - GLOBAL - Validation Loss: {:.4f}'.format(epoch, vepoch_loss))
    print('Epoch({}) - GLOBAL - Validation Accuracy: {:.4f}'.format(epoch, vepoch_accuracy))
    print('Epoch({}) - GLOBAL - Validation Utility score: {:.4f}'.format(epoch, vepoch_utility_score))

    #print(f'Sum of model parameters ({epoch}):')
    #[print(p.sum()) for p in model.parameters()]

    writer.add_scalar("Global valid/Loss", vepoch_loss, epoch)
    writer.add_scalar("Global valid/Accuracy", vepoch_accuracy, epoch)
    writer.add_scalar("Global valid/Utility", vepoch_utility_score, epoch)

    for fold_indice in range(5):
        writer.add_scalar("Fold valid Loss/Loss fold "+str(fold_indice), vepoch_loss_folds[fold_indice], epoch)
        writer.add_scalar("Fold valid Accuracy/Accuracy fold "+str(fold_indice), vepoch_accuracy_folds[fold_indice], epoch)
        writer.add_scalar("Fold valid Utility/Utility fold "+str(fold_indice), vepoch_utility_score_folds[fold_indice], epoch)
        
    
    writer.flush()

    # Check if Early Stopping
    #if vepoch_loss > the_last_loss:
    #if (vepoch_utility_score < the_last_utility_score) and (vepoch_loss > the_last_loss) and (vepoch_accuracy < the_last_accuracy):
    
    if (vepoch_loss > the_last_loss):
        if (EARLY_STOPPING == True):
            trigger_times += 1

            print(f'Intermediate early stopping : vepoch_loss = {vepoch_loss:.4f}, the_last_loss={the_last_loss:.4f}')
            #print(f'Intermediate early stopping : vepoch_accuracy = {vepoch_accuracy:.4f}, the_last_utility_score={the_last_accuracy:.4f}')
            #print(f'Intermediate early stopping : vepoch_utility_score = {vepoch_utility_score:.4f}, the_last_utility_score={the_last_utility_score:.4f}')

            if trigger_times >= patience:
                print('Meet Early stopping!')
                early_stopping_met = True
                ##torch.save(model.state_dict(), f'model_{fold}.pt')
                break
    else:
        trigger_times = 0
        the_last_loss = vepoch_loss
        the_last_utility_score = vepoch_utility_score
        the_last_accuracy = vepoch_accuracy
        
        the_last_utility_score_folds = vepoch_utility_score_folds
        the_last_accuracy_folds = vepoch_accuracy_folds
        
        the_best_epoch = epoch

        # Save model for the best version so far
        print(f'Saving model corresponding to last_utility_score == {the_last_utility_score}')
        torch.save(model.state_dict(), MODEL_FILE)

    print('\n')

if (early_stopping_met == False):
    print("Didn't meet early stopping : saving final model")
    # Save model if don't meet early stopping
    torch.save(model.state_dict(), MODEL_FILE)

#utility_scores.append(the_last_utility_score)
#accuracy_scores.append(the_last_accuracy)
writer.add_text(f"Global valid/Utility", f"Best utility: {the_last_utility_score}", the_best_epoch)
        
scores_results = {'utility_score': the_last_utility_score, 'utility_scores': the_last_utility_score_folds, 'utility_score_std': np.std(the_last_utility_score_folds), 'accuracy_scores': the_last_accuracy_folds}

writer.add_text('Final utility score', str(scores_results))
writer.add_text('Batch size', str(BATCH_SIZE))
writer.add_text('Patience', str(patience))
writer.add_text('Number of epochs', str(NUM_EPOCHS))
writer.add_text('Best epoch', str(the_best_epoch))
writer.add_text('Number of parameters per layer', str([p.numel() for p in model.parameters()]))
writer.add_text('Model architecture', str(model).replace('\n', '<BR>'))
writer.add_text('Comment', MODEL_COMMENT)

writer.close()

print('Training summary:')
print(scores_results)

Training started
Number of model parameters :
Epoch(0) - Training Loss: 0.7025
Epoch(0) - Fold 0 - Validation Loss : 0.6930
Epoch(0) - Fold 0 - Validation Accuracy : 0.5049
Epoch(0) - Fold 0 - Validation Utility score : -0.0000
Epoch(0) - Fold 1 - Validation Loss : 0.6927
Epoch(0) - Fold 1 - Validation Accuracy : 0.5092
Epoch(0) - Fold 1 - Validation Utility score : -0.0000
Epoch(0) - Fold 2 - Validation Loss : 0.6933
Epoch(0) - Fold 2 - Validation Accuracy : 0.5025
Epoch(0) - Fold 2 - Validation Utility score : -0.0000
Epoch(0) - Fold 3 - Validation Loss : 0.6934
Epoch(0) - Fold 3 - Validation Accuracy : 0.5012
Epoch(0) - Fold 3 - Validation Utility score : -0.0000
Epoch(0) - Fold 4 - Validation Loss : 0.6933
Epoch(0) - Fold 4 - Validation Accuracy : 0.5006
Epoch(0) - Fold 4 - Validation Utility score : 48.2034
Epoch(0) - GLOBAL - Validation Loss: 0.6931
Epoch(0) - GLOBAL - Validation Accuracy: 0.5037
Epoch(0) - GLOBAL - Validation Utility score: 48.2034
Saving model corresponding to 

### Note

Note utility score précédent :  

{'utility_score': 2699.3290911247423, 'utility_scores': [433.21587581983806, 842.8579165327393, -0.0, -0.0, 1423.255298772165], 'utility_score_std': 541.5654345633293, 'accuracy_scores': [0.5243157432212159, 0.5228992628992629, 0.5123754728769456, 0.5104606336878834, 0.5277785603606142]}

(avec std scale)


Training summary:
{'utility_score': 2697.374406045479, 'utility_scores': [448.22515142892547, 938.181355255796, -0.0, -0.0, 1310.9678993607574], 'utility_score_std': 518.5674763422022, 'accuracy_scores': [0.5212753894345934, 0.5206107406107406, 0.5085924573123425, 0.5062820079914457, 0.525750105648683]}


Essayer : 
> avec features supplémentaires  
> augmenter le dropout de la couche avec bcp de 0, ou supprimer la couche  
> bouger le weight decay : essayer 1e-5, et 1e-3  
> augmenter la taille du batch  
> label smoothing   (voir loss_fn = SmoothBCEwLogits(smoothing=0.005)  dans janestreet_kaggle....)  
> différents triggers

In [None]:
'''
model_load = nn.Sequential(
        #nn.Dropout(0.2),
        nn.Linear(len(FEATURES_LIST_TOTRAIN), 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),

        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    

        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    

        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    

        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    

        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    
    
        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    
    
        nn.Linear(130, 130),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    

        nn.Linear(130, 60),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    
    
        nn.Linear(60, 30),
        #nn.BatchNorm1d(130),
        nn.ReLU(),
        nn.Dropout(0.3),    
       
        nn.Linear(30, 1),
        nn.Sigmoid(),
    ).double().to('cuda')
    
model_load.load_state_dict(torch.load(f'model_NN_allfolds_V1.pt',map_location=torch.device('cuda')))
'''

#model_load.eval()
#print(accuracy_score(ts_test_y.cpu().numpy(), (model_load(ts_test).squeeze() > 0.5).cpu().numpy()))
#
#model_load.eval()
#print(utility_function(df.loc[test_index], (model_load(ts_test).squeeze() > 0.5).cpu().numpy()))


# Data for model inference

## First, fill NA with :

In [56]:
list(f_mean)

[0.009838564545944745,
 0.38557755173112973,
 0.35768747744650975,
 0.00891916614596665,
 0.0041500560373424495,
 -0.0037146189993207766,
 -0.012589244366156346,
 0.051776552018932685,
 0.026828095990947754,
 0.24881331937245502,
 0.18234851148590248,
 0.08912156181298928,
 0.049485535154017296,
 0.14310535183278583,
 0.08902722352210106,
 0.21167757450938102,
 0.146300650876364,
 0.12121931699105562,
 0.11358210894993667,
 0.2938148492026861,
 0.26876788737703755,
 0.18691131282167164,
 0.1769785779830002,
 0.25244128771902047,
 0.23856075429165213,
 0.29407078502434514,
 0.273177703966479,
 0.13548298050171867,
 0.16087630644126466,
 0.32189235718153003,
 0.3425343272966006,
 0.2205604165416505,
 0.25013120792951216,
 0.3082216783372597,
 0.3353533199754549,
 0.34145307300650396,
 0.36582532760649067,
 0.029320465264380657,
 0.02289177995103487,
 0.04002162079212139,
 0.05074972651124518,
 0.4450543980970518,
 0.36018357114469624,
 0.34602868865463743,
 0.4115306048129169,
 0.4380310

## Then, normalize with :

In [362]:
ts_train_mean

tensor([ 0.0088,  0.3957,  0.3306,  0.0092,  0.0034, -0.0050, -0.0146,  0.0553,
         0.0251,  0.2647,  0.1671,  0.0949,  0.0445,  0.1525,  0.0800,  0.2217,
         0.1283,  0.1218,  0.1096,  0.2977,  0.2646,  0.1881,  0.1725,  0.2547,
         0.2327,  0.2979,  0.2685,  0.1399,  0.1629,  0.3306,  0.3439,  0.2268,
         0.2519,  0.3164,  0.3360,  0.3528,  0.3677,  0.0265,  0.0186,  0.0432,
         0.0530,  0.4542,  0.3776,  0.4162,  0.4393,  0.4865,  0.4921,  0.3684,
         0.5014,  0.5438,  0.5307,  0.4567,  0.0565,  0.3890,  0.3769,  0.7755,
         0.9247,  0.7859,  0.8085,  0.8990,  0.5534,  0.5555,  0.5592,  0.5614,
         0.4423,  0.6188,  0.6172,  0.5977,  0.5981,  0.3774,  0.2389,  0.3080,
         0.0041, -0.0322, -0.0016, -0.0199, -0.0316, -0.0932, -0.0081, -0.0358,
        -0.0025, -0.0149, -0.0350, -0.1015,  0.3934,  0.5416,  0.3924,  0.4281,
         0.4976,  0.3994,  0.4332,  0.5235,  0.4224,  0.4221,  0.4348,  0.4547,
         0.3984,  0.5422,  0.3973,  0.42

In [363]:
ts_train_std

tensor([ 1.0000,  2.5724,  2.4543,  1.9501,  1.7327,  1.7503,  1.6707,  1.6297,
         1.8147,  2.4146,  1.7088,  1.6546,  2.3538,  2.2777,  2.0171,  1.8925,
         2.2236,  1.5821,  1.9931,  1.7196,  1.9315,  2.4339,  1.7864,  2.1404,
         2.6117,  2.0852,  2.2918,  1.4115,  1.8827,  1.7403,  2.0817,  1.6850,
         2.4817,  1.9481,  2.0704,  2.4316,  2.2853,  2.0566,  2.1225,  1.6162,
         2.3138,  1.9912,  2.4127,  2.3259,  2.7887,  1.9650,  2.8462,  2.2005,
         3.0387,  3.5713,  3.7193,  2.8348,  1.8877,  2.2007,  1.9352,  7.1295,
        11.0556,  7.5957,  8.1579,  9.9465,  2.2164,  1.9947,  2.1712,  2.2567,
         2.3735,  2.1915,  1.7553,  2.5883,  2.5263,  2.2991,  2.4406,  1.8076,
         1.8125,  2.1476,  1.7792,  1.9738,  2.2283,  2.6795,  2.1807,  1.7949,
         1.7688,  2.2805,  1.9767,  2.4821,  1.9748,  2.4948,  1.9679,  2.6387,
         2.5943,  2.6160,  2.0864,  2.6942,  2.0670,  2.2104,  2.1212,  2.5054,
         2.1243,  2.5619,  2.3017,  2.11