# Pytorch classifier notebook

V1 : only 1 split. First implementation  
All folds V1 : with all folds  
All folds V2 : add activation stats plot  
All folds V2.1 : back to  best MLP found so far, and backport fix of activation layers stats. Add weight decay and scheduler (fit one cycle) code

All folds autoencoder MLP V1  
All folds autoencoder MLP V2 : with weights and biases  
All folds autoencoder MLP V3 : replace MLP with xgboost
All folds XGB resp N1 fold predict: start from code of V3 but without NN code  

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import io
import PIL.Image

import datetime

import faiss

DATASET_INPUT_FILE = 'train.csv'

#FEATURES_LIST_TOTRAIN = ['feature_'+str(i) for i in range(130)] + ['cross_41_42_43', 'cross_1_2']
FEATURES_LIST_TOTRAIN = ['feature_'+str(i) for i in range(130)]

# For custom non-overlaped folds generation
TRAIN_PERCENT = 0.70  
TEST_PERCENT = 0.30

# If subsplit of training set : percentage of second training set  
TRAIN1_PERCENT = 0.20  

ACT_N = False  # Add N previous predictions to input of MLP <= Does not work, logic is not right
ACT_N_SIZE = 5

CLUSTERING = False

MODEL_FILE_META = 'model_XGB_meta_respn1_fold.bin'
MODEL_FILE_RESPN1 = 'model_XGB_respn1.bin'
MODEL_FILE_FOLD = 'model_XGB_fold.bin'

pd.set_option('display.max_rows', 500)

In [3]:
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [4]:
# Deterministic Behavior
seed = 42
#os.environ['PYTHONHASHSEED'] = str(seed)
# Python RNG
np.random.seed(seed)
#random.seed(seed)

In [5]:
'''
sweep_config = {
    'method': 'bayes', #grid, random, bayes
    'metric': {
      'name': 'Best utility',
      'goal': 'maximize'   
    },
    'parameters': {
        'batch_size': {
            'values': [524288, 262144, 131072, 65536, 32768, 16384, 8192, 4096, 2048, 1024, 512]
        },
        'dropout': {
            'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        },
        'learning_rate': {
            #'values': [1e-2, 1e-3, 1e-4, 3e-4, 1e-5]
            #'values': [1e-2, 1e-3, 1e-4]
            'values': [1e-2, 1e-3]
        },

        'weight_decay': {
            'values': [1e-2, 1e-3, 1e-4, 1e-5]
        },
    
        'use_autoenc': {
            'values': ['encoder-decoder', 'encoder', 'encoder-only', 'None']
            #'values': ['encoder-decoder', 'None']
        },
        
        'activation_function': {
            'values': ['relu', 'leakyrelu']
        },
        
    }
}
'''

sweep_config = {
    'method': 'bayes', #grid, random, bayes
    'metric': {
      'name': 'Best utility',
      'goal': 'maximize'   
    },
    'parameters': {
        'batch_size': {
            'min': 4096,
            'max': 65536,
            'distribution': 'int_uniform',
        },
        'dropout': {
            'min': 0.3,
            'max': 0.5,
            'distribution': 'uniform',
        },
        'learning_rate': {
            'min': 0.0005,
            'max': 0.002,
            'distribution': 'uniform',
        },

        'weight_decay': {
            'min': 0.00001,
            'max': 0.0002,
            'distribution': 'uniform',

        },
    
        'use_autoenc': {
            'values': ['encoder', 'encoder-only']
            #'values': ['encoder-decoder', 'None']
        },
        
        'activation_function': {
            'values': ['leakyrelu']
        },
        
    }
}

In [6]:
def pyStandardScale(tensor, mean, std):
    return((tensor - mean) / std)

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

from matplotlib.colors import ListedColormap
    
# this is code slightly modified from the sklearn docs here:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""
    
    cmap_cv = plt.cm.coolwarm

    jet = plt.cm.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    _ = np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax


def plot_cv_indices_custom(cv_custom, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""
    
    cmap_cv = plt.cm.coolwarm

    jet = plt.cm.get_cmap('jet', 256)
    seq = np.linspace(0, 1, 256)
    _ = np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv_custom):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [8]:
# This function accounts for variable instance counts in each split by dividing utility_pi by number of instances (but this has been removed)
# It also does some copy of dataframe to prevent memory overwrite
def utility_function(df_test, df_test_predictions):
    df_test_copy = df_test.copy(deep=True)
    df_test_copy.loc[:, 'utility_pj'] = df_test_copy['weight'] * df_test_copy['resp'] * df_test_predictions
    #df_test_utility_pi = df_test_copy.groupby('date')['utility_pj'].sum() / df_test_copy.groupby('date')['utility_pj'].count()
    df_test_utility_pi = df_test_copy.groupby('date')['utility_pj'].sum()

    nb_unique_dates = df_test_utility_pi.shape[0]

    if (np.sqrt(df_test_utility_pi.pow(2).sum()) == 0):
        t = 0

    else:
        t = (df_test_utility_pi.sum() / np.sqrt(df_test_utility_pi.pow(2).sum())) * (np.sqrt(250 / np.abs(nb_unique_dates)))

    u = min(max(t, 0), 6) * df_test_utility_pi.sum()
    del df_test_copy
    
    return(u)

In [9]:
from bisect import bisect_left

# The aim of this function is to return closest date from an index
# So that split indices correspond to start or end of a new day
# myList contains list of instances that correspond to start of a new da

def take_closest(myList, myNumber):
    """
    Assumes myList is sorted. Returns closest value to myNumber.

    If two numbers are equally close, return the smallest number.
    """
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]
    before = myList[pos - 1]
    after = myList[pos]
    if after - myNumber < myNumber - before:
       return after
    else:
       return before

In [10]:
class SaveOutputActivationStats:
    def __init__(self):
        self.outputs = []
        
    def __call__(self, module, module_in, module_out):
        #self.outputs.append(module_out)
        #print('Save output callback :')
        #print(module)
        #print({'mean': module_out.mean().item(), 'std': module_out.std().item(),'near_zero': (module_out<=0.05).long().sum().item()/module_out.numel()})
        self.outputs.append({'mean': module_out.mean().item(), 'std': module_out.std().item(),'near_zero': (module_out<=0.05).long().sum().item()/module_out.numel()})
        
    def clear(self):
        self.outputs = []

In [11]:
#fig, ax = plt.subplots(1, 1)
#
#plot_cv_indices(cv, df.loc[:, FEATURES_LIST_TOTRAIN], (df['resp'] > 0), df['date'], 
#                         ax, 5, lw=20);

# Load data

In [12]:
# Load data
    
df = pd.read_csv(DATASET_INPUT_FILE)
df['resp_positive'] = ((df['resp'])>0)*1  # Target to predict

print('Data loaded')


Data loaded


# Feature engineering

In [13]:
#df['cross_41_42_43'] = df['feature_41'] + df['feature_42'] + df['feature_43']

In [14]:
#df['cross_1_2'] = df['feature_1'] / (df['feature_2'] + 1e-5)

# Non overlap fold generation

In [15]:
date_indexes_list = df.groupby('date')['ts_id'].first().to_list()

In [16]:
base_train_split_size = int((df.shape[0] // 5) * TRAIN_PERCENT)

In [17]:
base_test_split_size = int((df.shape[0] // 5) * TEST_PERCENT)

In [18]:
train_split_start_indexes = [take_closest(date_indexes_list, (base_train_split_size + base_test_split_size)*fold_indice) for fold_indice in range(5)]

In [19]:
#test_split_start_indexes = [take_closest(date_indexes_list, (base_train_split_size + base_test_split_size)*fold_indice) for fold_indice in range(5)]

In [20]:
train_split_start_indexes

[0, 477711, 958233, 1435933, 1913985]

In [21]:
df.shape[0] - 1

2390490

In [22]:
# We'll have 5 folds of 3 subsets each (2 training sets and 1 test set per fold)
# (1st training set of each fold will be used for 1st model, ie auto encoder)

NB_FOLDS = 5
last_index = df.shape[0] - 1

cv_table = []

for fold_indice in range(NB_FOLDS):
    fold_train_start_index = train_split_start_indexes[fold_indice]
    
    if (fold_indice == NB_FOLDS - 1):    
        nextfold_train_start_index = last_index
        
    else:
        nextfold_train_start_index = train_split_start_indexes[fold_indice + 1]
    
    fold_test_start_index = take_closest(date_indexes_list, int(TRAIN_PERCENT * (nextfold_train_start_index - fold_train_start_index) + fold_train_start_index  ))
    fold_train2_start_index = take_closest(date_indexes_list, int(TRAIN1_PERCENT * (fold_test_start_index - fold_train_start_index) + fold_train_start_index  ))
    
    cv_table.append(fold_train_start_index)
    cv_table.append(fold_train2_start_index)
    cv_table.append(fold_test_start_index)

In [23]:
cv_table.append(last_index)

In [24]:
cv_table

[0,
 66091,
 336609,
 477711,
 546983,
 815783,
 958233,
 1024471,
 1290282,
 1435933,
 1505171,
 1771833,
 1913985,
 1980610,
 2248510,
 2390490]

In [25]:
cv_table

[0,
 66091,
 336609,
 477711,
 546983,
 815783,
 958233,
 1024471,
 1290282,
 1435933,
 1505171,
 1771833,
 1913985,
 1980610,
 2248510,
 2390490]

In [26]:
NB_FOLDS

5

In [27]:
cv_tuples = []

for i in range(0, NB_FOLDS*3, 3):
    cv_tuples.append([df.loc[cv_table[i]:cv_table[i+1]-1, :].index.to_list(), df.loc[cv_table[i+1]:cv_table[i+2]-1, :].index.to_list(),
                      df.loc[cv_table[i+2]:cv_table[i+3]-1, :].index.to_list()])

In [28]:
len(cv_tuples[0][2])

141102

In [29]:
cv_tuples_generator = iter(cv_tuples)

In [30]:
#fig, ax = plt.subplots(1, 1)

#plot_cv_indices_custom(cv_tuples_generator, df.loc[:, FEATURES_LIST_TOTRAIN], (df['resp'] > 0), df['date'], 
#                         ax, 5, lw=20); 

#cv_tuples_generator = iter(cv_tuples)

In [31]:
# Size of training set :
#train_sets_table =  [cv_tuples[i][0] for i in range(5)]
#sum([len(train_set_table) for train_set_table in train_sets_table])

In [32]:
# Our old time series split (with overlap : required 1 neural network trained per split)
# But in this script it's not needed because we're training 1 unique network, with a different fold strategy (non overlaped)
#cv = PurgedGroupTimeSeriesSplit(
#    n_splits=5,
#    max_train_group_size=180,
#    group_gap=20,
#    max_test_group_size=60
#)

In [33]:
#train_index, test_index = next(cv.split(df, (df['resp'] > 0)*1, df['date']))

In [34]:
#(df.loc[train_index, 'resp'] > 0).astype(np.byte)

In [35]:
#f_mean = df.loc[:, FEATURES_LIST_TOTRAIN].mean(axis=0)

In [36]:
#f_mean.shape

In [37]:
# No fill na with our XGB models.
# df.fillna(f_mean, inplace=True)

In [38]:
#str([p.numel() for p in model.parameters()])

In [39]:
folds_list = []

for fold, (train1_index, train2_index, test_index) in enumerate(cv_tuples_generator):
    folds_list.append((train1_index, train2_index, test_index))

In [40]:
folds_list_train1 = [folds_list[i][0] for i in range(5)]
folds_list_train1_flat = [folds_list_train1_item for sublist in folds_list_train1 for folds_list_train1_item in sublist]
folds_list_train1_unique = list(set(folds_list_train1_flat))

folds_list_train2 = [folds_list[i][1] for i in range(5)]
folds_list_train2_flat = [folds_list_train2_item for sublist in folds_list_train2 for folds_list_train2_item in sublist]
folds_list_train2_unique = list(set(folds_list_train2_flat))

In [41]:
len(folds_list_train1_unique)

337464

In [42]:
len(folds_list_train2_unique)

1339691

In [43]:
np.sum([len(folds_list_train1_item) for folds_list_train1_item in folds_list_train1])

337464

In [44]:
np.sum([len(folds_list_train2_item) for folds_list_train2_item in folds_list_train2])

1339691

In [45]:
len(folds_list_train1_flat)

337464

In [46]:
folds_list_test = [folds_list[i][2] for i in range(5)]
folds_list_test_flat = [folds_list_test_item for sublist in folds_list_test for folds_list_test_item in sublist]
folds_list_test_unique = set(folds_list_test_flat)

In [47]:
np.sum([len(folds_list_test_item) for folds_list_test_item in folds_list_test])

713335

In [48]:
len(folds_list_test_flat)

713335

In [49]:
len(folds_list_train1_flat) + len(folds_list_train2_flat) + len(folds_list_test_flat)

2390490

In [50]:
df.loc[folds_list_test[4], FEATURES_LIST_TOTRAIN].to_numpy().shape

(141980, 130)

In [51]:
df.loc[(folds_list_train1_unique + folds_list_train2_unique), FEATURES_LIST_TOTRAIN].to_numpy().mean(axis=0)

array([ 0.00880718,  0.39574469,  0.33059838,         nan,         nan,
       -0.00498373, -0.01455459,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,         nan,         nan,         nan,
               nan,         nan,  0.02650339,  0.0186391 ,  0.04320553,
        0.05298663,  0.45417433,  0.37762691,  0.41617323,         nan,
               nan,  0.49207956,  0.36839975,  0.50144387,  0.54379067,
        0.53074971,  0.45673965,  0.05646874,  0.38900233,  0.37690587,
               nan,         nan,  0.78590429,         nan,         nan,
        0.55335406,  0.55554392,  0.55922873,  0.56139559,  0.44231975,
        0.61884351,  0.61715568,  0.59770334,  0.59814018,  0.37

In [52]:
len(list(set(folds_list_train1_unique + folds_list_train2_unique)))

1677155

# Training XGB model

## Training XGB model that predicts resp n-1

In [53]:
class XGBClassifier_wrapper(BaseEstimator, ClassifierMixin):  
    ''' Params passed as dictionnary to __init__, for example :
        params_space = {
       'features': FEATURES_LIST_TOTRAIN, 
        'random_state': 42,
        'max_depth': 12,
        'n_estimators': 500,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'colsample_bytree': 0.3,
        'tree_method': 'gpu_hist'
        }
    '''
    def __init__(self, params):
        self.fitted = False
        
        self.features = list(params['features'])
        self.random_state = params['random_state']
        self.max_depth = params['max_depth']
        self.n_estimators = params['n_estimators']
        self.learning_rate = params['learning_rate']
        self.subsample = params['subsample']
        self.colsample_bytree = params['colsample_bytree']
        self.gamma = params['gamma']
        self.tree_method = params['tree_method']  
        
        #print('Features assigned :')
        #print(self.features)

        self.model_internal = XGBClassifier(
            random_state= self.random_state,
            max_depth= self.max_depth,
            n_estimators= self.n_estimators,
            learning_rate= self.learning_rate,
            subsample= self.subsample,
            colsample_bytree= self.colsample_bytree,
            tree_method= self.tree_method,
            gamma = self.gamma,
            #objective= 'binary:logistic',
            #disable_default_eval_metric=True,
            )

    def fit(self, X, y=None):
        print('Model used for fitting:')
        print(self.model_internal)
        self.model_internal.fit(X[self.features], y)
        
        self.fitted = True
        return self

    def predict(self, X, y=None):
        if (self.fitted == True):
            print('predict called')
            return(self.model_internal.predict(X[self.features]))
        
        else:
            print('You must fit model first')
            return(None)

    def predict_proba(self, X, y=None):
        if (self.fitted == True):
            print('predict proba called')
            return(self.model_internal.predict_proba(X[self.features]))
        
        else:
            print('You must fit model first')
            return(None)
        

    #def set_params(self, **parameters):
    #    for parameter, value in parameters.items():
    #        setattr(self, parameter, value)

        
    def score(self, X, y=None):        
        print('Type of X:')
        print(type(X))
        
        print('Shape of X:')
        print(X.shape)
        
        print('Type of y:')
        print(type(y))
        
        print('model fitted ?')
        print(self.fitted) # Usually returns yes at this point when called by cross_val_score
        
        if y is None:
            print('y is None')
            y_preds = pd.Series(self.model_internal.predict(X.reset_index(drop=True)[self.features]))
            
        else: # cross_val_score goes there
            print('y is not None')
            y_preds = pd.Series(y)
        
        return(utility_function(X.reset_index(drop=True), y_preds)) 
    
    def accuracy_score(self, X, y=None):
        if y is None:
            print('y is None in accuracy_score method : pass predictions as y to avoid launching predict')
            y_preds = pd.Series(self.model_internal.predict(X.reset_index(drop=True)[self.features]))
            
        else: # cross_val_score goes there
            #print('y is not None')
            y_preds = pd.Series(y)
            
        return(accuracy_score(X['resp_positive'], y_preds))

In [54]:
# Calculate label of current step
y_train1_resp_positive = (df.loc[folds_list_train1_unique, 'resp'] > 0).astype(np.byte)

# Shift values of resp to get resp of step n-1
y_train1_resp_n1_positive = y_train1_resp_positive.shift(1, fill_value=0)


model_n1 = XGBClassifier(
    random_state= 42,
    max_depth= 12,
    n_estimators= 500,
    learning_rate= 0.01,
    subsample= 0.9,
    colsample_bytree= 0.2,
    tree_method= 'gpu_hist',
    gamma = None,
    )

model_n1.fit(df.loc[folds_list_train1_unique, FEATURES_LIST_TOTRAIN], y_train1_resp_n1_positive, verbose=True)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.2, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=24, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

# Train model that predicts original fold

## Generate labels

In [55]:
fold_indexes = []

In [56]:
for fold_indice in range(NB_FOLDS):
    fold_indexes.append([item for sublist in folds_list[fold_indice] for item in sublist])

In [57]:
for fold_number, fold_indexes_1fold in enumerate(fold_indexes):
    df.loc[fold_indexes_1fold, 'fold_number'] = str(int(fold_number))

In [58]:
df.loc[df.shape[0] - 1, 'fold_number'] = str(int(NB_FOLDS - 1))

In [59]:
df['fold_number'].value_counts()

1    480522
3    478052
0    477711
2    477700
4    476506
Name: fold_number, dtype: int64

In [60]:
df.shape

(2390491, 140)

In [61]:
#pd.get_dummies(df['fold_number'], prefix = 'fold')

In [62]:
#df = pd.concat([df, pd.get_dummies(df['fold_number'], prefix = 'fold')], axis=1)

In [63]:
#df.drop(columns=['fold_number'], inplace=True)

## Train fold predictor

In [64]:
model_xgb = XGBClassifier(
    random_state= 42,
    max_depth= 10,
    n_estimators= 500,
    learning_rate= 0.02,
    subsample= 0.5,
    colsample_bytree= 0.6,
    tree_method= 'gpu_hist',
    gamma = None,
    #objective= 'binary:logistic',
    #disable_default_eval_metric=True,
    )

#model_xgb.fit(df.loc[folds_list_train1_unique, FEATURES_LIST_TOTRAIN], df.loc[folds_list_train1_unique, ['fold_'+str(i) for i in range(NB_FOLDS)]])
model_xgb.fit(df.loc[folds_list_train1_unique, FEATURES_LIST_TOTRAIN], df.loc[folds_list_train1_unique, 'fold_number'])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=24, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.5,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [65]:
df.loc[folds_list_test[1]]['fold_number'].value_counts()

1    142450
Name: fold_number, dtype: int64

In [66]:
pd.Series(model_xgb.predict(df.loc[folds_list_test[0], FEATURES_LIST_TOTRAIN])).value_counts()

0    38608
1    31708
3    25560
2    23863
4    21363
dtype: int64

In [67]:
pd.Series(model_xgb.predict(df.loc[folds_list_test[1], FEATURES_LIST_TOTRAIN])).value_counts()

2    35380
3    34525
4    28870
1    27587
0    16088
dtype: int64

In [68]:
pd.Series(model_xgb.predict(df.loc[folds_list_test[2], FEATURES_LIST_TOTRAIN])).value_counts()

4    43197
3    41715
2    30115
1    15469
0    15155
dtype: int64

In [69]:
pd.Series(model_xgb.predict(df.loc[folds_list_test[3], FEATURES_LIST_TOTRAIN])).value_counts()

4    55001
3    42224
2    20747
1    12120
0    12060
dtype: int64

In [70]:
pd.Series(model_xgb.predict(df.loc[folds_list_test[4], FEATURES_LIST_TOTRAIN])).value_counts()

4    56798
3    38220
2    19590
0    13698
1    13674
dtype: int64

In [71]:
model_xgb.predict_proba(df.loc[folds_list_test[1], FEATURES_LIST_TOTRAIN])

array([[0.03349742, 0.44114932, 0.2217906 , 0.24658968, 0.05697299],
       [0.2493427 , 0.6202549 , 0.02858509, 0.032284  , 0.06953336],
       [0.12408023, 0.64893204, 0.07998472, 0.05955048, 0.08745254],
       ...,
       [0.32993764, 0.14755715, 0.17608246, 0.1451546 , 0.20126821],
       [0.15299153, 0.11114156, 0.28277832, 0.28438374, 0.16870484],
       [0.06557676, 0.4378262 , 0.22874515, 0.11568245, 0.15216942]],
      dtype=float32)

In [72]:
scores = []
accuracy_scores = []
precision_scores = []
recall_scores = []

for fold_indice in range(NB_FOLDS): 
    test_predictions = model_xgb.predict(df.loc[folds_list_test[fold_indice], FEATURES_LIST_TOTRAIN])

    accuracy_scores.append(accuracy_score(df.loc[folds_list_test[fold_indice], 'fold_number'], test_predictions))  
    precision_scores.append(precision_score(df.loc[folds_list_test[fold_indice], 'fold_number'], test_predictions, average='micro'))  
    recall_scores.append(recall_score(df.loc[folds_list_test[fold_indice], 'fold_number'], test_predictions, average='micro'))  

    df_featimportance = pd.DataFrame(model_xgb.feature_importances_, index=df[FEATURES_LIST_TOTRAIN].columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
    df_featimportance_cumulated = pd.concat([df_featimportance, pd.DataFrame({'% feat importance cumulé' : (df_featimportance['Importance'] / df_featimportance['Importance'].sum()).cumsum()})], axis=1)
    #print(f'Feature importances for split {fold_indice}:')
    #print(df_featimportance_cumulated)

print({'accuracy_scores': accuracy_scores})
print({'precision_scores': precision_scores})
print({'recall_scores': recall_scores})

{'accuracy_scores': [0.27361766665249254, 0.19366093366093365, 0.20676136792744298, 0.29703416061680454, 0.40004225947316524]}
{'precision_scores': [0.27361766665249254, 0.19366093366093365, 0.20676136792744298, 0.29703416061680454, 0.40004225947316524]}
{'recall_scores': [0.27361766665249254, 0.19366093366093365, 0.20676136792744298, 0.29703416061680454, 0.40004225947316524]}


In [73]:
df.shape

(2390491, 140)

# Train XGBoost with fold prediction as input AND resp n-1

In [74]:
model_wrapped = XGBClassifier_wrapper({
   #'features': ['feature_'+str(i) for i in range(130)] + [0,1,2,3,4] + ['resp_n1_predict'], 
    'features': ['feature_'+str(i) for i in range(130)] + [0,3,4] + ['resp_n1_predict'], 
    'random_state': 42,
    'max_depth': 10,
    'n_estimators': 500,
    'learning_rate': 0.02,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
    'gamma': None,
    'tree_method': 'gpu_hist'        
    #'tree_method': 'hist' # CPU
    })

In [75]:
df = pd.concat(
    [df, 
     pd.DataFrame(model_xgb.predict_proba(df.loc[:, FEATURES_LIST_TOTRAIN]))], 
    axis=1)

In [76]:
df.shape

(2390491, 145)

In [77]:
df.loc[:, 'resp_n1_predict'] = model_n1.predict_proba(df.loc[:, FEATURES_LIST_TOTRAIN])[:, 1]

In [78]:
model_wrapped.fit(
    df.loc[folds_list_train2_unique], 
    (df.loc[folds_list_train2_unique]['resp'] > 0).astype(np.byte)
)

Model used for fitting:
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.6, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.02, max_delta_step=None, max_depth=10,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              random_state=42, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=0.5, tree_method='gpu_hist',
              validate_parameters=None, verbosity=None)








XGBClassifier_wrapper(params=None)

In [79]:
scores = []
accuracy_scores = []
xgb_test_predictions_folds = []

for fold_indice in range(NB_FOLDS):     
    test_predictions = model_wrapped.predict(df.loc[folds_list_test[fold_indice], :])
    test_predictions_probas = model_wrapped.predict_proba(df.loc[folds_list_test[fold_indice], :])[:, 1]
    xgb_test_predictions_folds.append(test_predictions_probas)

    scores.append(model_wrapped.score(df.loc[folds_list_test[fold_indice]], test_predictions))
    accuracy_scores.append(model_wrapped.accuracy_score(df.loc[folds_list_test[fold_indice]], test_predictions))  

    df_featimportance = pd.DataFrame(model_wrapped.model_internal.feature_importances_, index=FEATURES_LIST_TOTRAIN + [0,3,4] + ['resp_n1_predict'], columns=['Importance']).sort_values(by='Importance', ascending=False)
    df_featimportance_cumulated = pd.concat([df_featimportance, pd.DataFrame({'% feat importance cumulé' : (df_featimportance['Importance'] / df_featimportance['Importance'].sum()).cumsum()})], axis=1)
    #print(f'Feature importances for split {fold_indice}:')
    #print(df_featimportance_cumulated)

print({'utility_score': sum(scores), 'utility_scores': scores, 'utility_score_std': np.std(scores), 'accuracy_scores': accuracy_scores})

predict called
predict proba called
Type of X:
<class 'pandas.core.frame.DataFrame'>
Shape of X:
(141102, 146)
Type of y:
<class 'numpy.ndarray'>
model fitted ?
True
y is not None
predict called
predict proba called
Type of X:
<class 'pandas.core.frame.DataFrame'>
Shape of X:
(142450, 146)
Type of y:
<class 'numpy.ndarray'>
model fitted ?
True
y is not None
predict called
predict proba called
Type of X:
<class 'pandas.core.frame.DataFrame'>
Shape of X:
(145651, 146)
Type of y:
<class 'numpy.ndarray'>
model fitted ?
True
y is not None
predict called
predict proba called
Type of X:
<class 'pandas.core.frame.DataFrame'>
Shape of X:
(142152, 146)
Type of y:
<class 'numpy.ndarray'>
model fitted ?
True
y is not None
predict called
predict proba called
Type of X:
<class 'pandas.core.frame.DataFrame'>
Shape of X:
(141980, 146)
Type of y:
<class 'numpy.ndarray'>
model fitted ?
True
y is not None
{'utility_score': 2201.1077946132314, 'utility_scores': [282.6849057991371, 660.7564349123575, -0.0,

# Load public models

# Notebook 1 load : Using TensorFlow and PyTorch
https://www.kaggle.com/yonikremer/using-tensorflow-and-pytorch/data?scriptVersionId=54551772  
V1

In [80]:
from sklearn.metrics import log_loss, roc_auc_score

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

#DATA_PATH = '../input/jane-street-market-prediction/'

NFOLDS = 5

TRAIN = False
CACHE_PATH = '/home/francois/coding/OC/PJ9/blending/notebook1'

def save_pickle(dic, save_path):
    with open(save_path, 'wb') as f:
        pickle.dump(dic, f)

def load_pickle(load_path):
    with open(load_path, 'rb') as f:
        message_dict = pickle.load(f)
    return message_dict

feat_cols = [f'feature_{i}' for i in range(130)]

target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']

f_mean = np.load(f'{CACHE_PATH}/f_mean_online.npy')

#     Making features
all_feat_cols = [col for col in feat_cols]
all_feat_cols.extend(['cross_41_42_43', 'cross_1_2'])

#     Model&Data fnc
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.batch_norm0 = nn.BatchNorm1d(len(all_feat_cols))
        self.dropout0 = nn.Dropout(0.2)

        dropout_rate = 0.2
        hidden_size = 256
        self.dense1 = nn.Linear(len(all_feat_cols), hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)

        self.dense2 = nn.Linear(hidden_size+len(all_feat_cols), hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.dense3 = nn.Linear(hidden_size+hidden_size, hidden_size)
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout_rate)

        self.dense4 = nn.Linear(hidden_size+hidden_size, hidden_size)
        self.batch_norm4 = nn.BatchNorm1d(hidden_size)
        self.dropout4 = nn.Dropout(dropout_rate)

        self.dense5 = nn.Linear(hidden_size+hidden_size, len(target_cols))

        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.RReLU = nn.RReLU()

    def forward(self, x):
        x = self.batch_norm0(x)
        x = self.dropout0(x)

        x1 = self.dense1(x)
        x1 = self.batch_norm1(x1)
        x1 = self.LeakyReLU(x1)
        x1 = self.dropout1(x1)

        x = torch.cat([x, x1], 1)

        x2 = self.dense2(x)
        x2 = self.batch_norm2(x2)
        x2 = self.LeakyReLU(x2)
        x2 = self.dropout2(x2)

        x = torch.cat([x1, x2], 1)

        x3 = self.dense3(x)
        x3 = self.batch_norm3(x3)
        x3 = self.LeakyReLU(x3)
        x3 = self.dropout3(x3)

        x = torch.cat([x2, x3], 1)

        x4 = self.dense4(x)
        x4 = self.batch_norm4(x4)
        x4 = self.LeakyReLU(x4)
        x4 = self.dropout4(x4)

        x = torch.cat([x3, x4], 1)

        x = self.dense5(x)

        return x

if True:
    device = torch.device("cpu")

    model_list = []
    tmp = np.zeros(len(feat_cols))
    for _fold in range(NFOLDS):
        torch.cuda.empty_cache()
        model = Model()
        model.to(device)
        model_weights = f"{CACHE_PATH}/online_model{_fold}.pth"
        model.load_state_dict(torch.load(model_weights, map_location=torch.device('cpu')))
        model.eval()
        model_list.append(model)

## tensorflow part

In [81]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
#import tensorflow_addons as tfa

import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices


SEED = 1111

np.random.seed(SEED)

# fit
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)
    
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    '''
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )
    '''

    return model

epochs = 200
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

tf.keras.backend.clear_session()
tf.random.set_seed(SEED)
clf = create_mlp(
    len(feat_cols), 5, hidden_units, dropout_rates, label_smoothing, learning_rate
    )
clf.load_weights(f'{CACHE_PATH}/model.h5')

tf_models = [clf]

2021-02-19 19:50:32.926745: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-02-19 19:50:33.711343: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-02-19 19:50:33.711395: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-02-19 19:50:33.711509: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-02-19 19:50:33.712098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:09:00.0 name: GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 82 deviceMemorySize: 23.69GiB deviceMemoryBandwidth: 871.81GiB/s
2021-02-19 19:50:33.712118: I tensorflow/stream_executor/platform/default/ds

In [82]:
f_mean_NB1 = f_mean

# Notebook 2 : load CNN keras
https://www.kaggle.com/hyperbeam/cnn-using-keras/data  
V1

In [83]:
feat_cols_NB1_NB2 = [f'feature_{i}' for i in range(130)]
f_mean_NB2 = f_mean[1:]

model_NB2 = tf.keras.models.load_model('/home/francois/coding/OC/PJ9/blending/notebook2/cnn_model')

# Calculate base models predictions

In [85]:
# df_filled for public models. df (without fill NA) for xgboost
f_mean_df = df.loc[:, FEATURES_LIST_TOTRAIN].mean(axis=0)
df_filled = df.copy(deep=True)
df_filled.fillna(f_mean_df, inplace=True)

In [86]:
from sklearn.linear_model import LogisticRegression

In [101]:
logreg_coefs = []
logreg_intercepts = []

for fold_indice in range(5):
    df_np = df_filled.loc[folds_list_test[fold_indice], ['feature_' + str(i) for i in range(130)]].values 

    feature_inp = np.concatenate((
        df_np,
        (df_np[:, 41] + df_np[:, 42] + df_np[:, 43]).reshape(df_np.shape[0], 1), # cross_41_42_43
        (df_np[:, 1] / (df_np[:, 2] + 1e-5)).reshape(df_np.shape[0], 1), # cross_1_2
    ), axis=1)

    # Predictions model 1
    with torch.no_grad():
        preds_nb1 = np.median(
                        np.stack(
                            [model(torch.tensor(feature_inp, dtype=torch.float).to(device)).sigmoid().numpy() for model in model_list]
                        ).sum(axis=0) / 5, axis=1)

    del feature_inp

    # Predictions model 2
    preds_nb2 = np.median(np.mean([model(df_np, training = False).numpy() for model in tf_models],axis=0), axis=1)

    batch_size = 20000

    # Predictions model 3
    preds_nb3 = np.empty([0, ])

    for min_bound in range(0, df_np.shape[0], batch_size):
        nb_elems = min(batch_size, df_np.shape[0] - min_bound)
        max_bound = min_bound + nb_elems - 1

        #print(f'min_bound = {min_bound}, max_bound = {max_bound}')

        preds_nb3 = np.append(preds_nb3, np.median(np.mean([model_NB2(df_np[min_bound:max_bound+1, :].reshape(-1, 130, 1), training = False).numpy()],axis=0), axis=1), axis=0)

    preds_xgb = xgb_test_predictions_folds[fold_indice]
    
    #preds = np.stack([preds_nb1, preds_nb2, preds_nb3, preds_xgb], axis=1)
    preds = np.stack([preds_nb1, preds_nb3, preds_xgb], axis=1) # Remove preds_nb2 for submit speed constraint :(

    # Train logistic regression
    
    logreg = LogisticRegression()
    logreg.fit(preds, df.loc[folds_list_test[fold_indice], 'resp_positive'])

    print(f'Coefficients of logistic regression for fold {fold_indice}:')
    print(logreg.coef_)
    logreg_coefs.append(logreg.coef_[0])
    
    print(f'Intercept for fold {fold_indice}')
    print(logreg.intercept_)
    logreg_intercepts.append(logreg.intercept_[0])

    print('Manually calculating proba for instance 0:')
    print(1/(1 + np.exp(-(np.dot(logreg.coef_[0], preds[0]) + logreg.intercept_))) )

    print('Predict proba by scikit learn for instance 0:')
    print(logreg.predict_proba(preds[0:1]))
    print('\n')


Coefficients of logistic regression for fold 0:
[[0.73505422 1.14404314 0.88368141]]
Intercept for fold 0
[-1.40017756]
Manually calculating proba for instance 0:
[0.47032119]
Predict proba by scikit learn for instance 0:
[[0.52967881 0.47032119]]


Coefficients of logistic regression for fold 1:
[[ 9.28878195 -1.66280907 -1.49119587]]
Intercept for fold 1
[-3.067424]
Manually calculating proba for instance 0:
[0.48399739]
Predict proba by scikit learn for instance 0:
[[0.51600261 0.48399739]]


Coefficients of logistic regression for fold 2:
[[11.08829042 -2.55277271 -3.14534251]]
Intercept for fold 2
[-2.70574782]
Manually calculating proba for instance 0:
[0.48547541]
Predict proba by scikit learn for instance 0:
[[0.51452459 0.48547541]]


Coefficients of logistic regression for fold 3:
[[11.01947756 -1.93777247 -3.87512962]]
Intercept for fold 3
[-2.61857389]
Manually calculating proba for instance 0:
[0.55365945]
Predict proba by scikit learn for instance 0:
[[0.44634055 0.553659

In [102]:
np.stack(logreg_coefs)

array([[ 0.73505422,  1.14404314,  0.88368141],
       [ 9.28878195, -1.66280907, -1.49119587],
       [11.08829042, -2.55277271, -3.14534251],
       [11.01947756, -1.93777247, -3.87512962],
       [ 9.69298325, -2.21555271, -1.55129055]])

In [103]:
logreg_intercepts

[-1.400177560491695,
 -3.067424002194023,
 -2.705747816224323,
 -2.6185738946627026,
 -2.995802423049205]

In [104]:
np.mean(np.stack(logreg_coefs), axis=0)

array([ 8.36491748, -1.44497276, -1.83585543])

In [105]:
np.mean(logreg_intercepts)  

-2.5575451393243895

# Model retrain on maximum data possible

We're going to retrain meta model on all data except data used to train base model (to avoid base model providing overfitted predictions),  
and retrain base models (resp n-1 prediction and fold prediction) on all data  

## Retrain meta model (final)

In [117]:
model_wrapped_final = XGBClassifier_wrapper({
   #'features': ['feature_'+str(i) for i in range(130)] + [0,1,2,3,4] + ['resp_n1_predict'], 
    'features': ['feature_'+str(i) for i in range(130)] + [0,3,4] + ['resp_n1_predict'], 
    'random_state': 42,
    'max_depth': 10,
    'n_estimators': 500,
    'learning_rate': 0.02,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
    'gamma': None,
    'tree_method': 'gpu_hist'        
    #'tree_method': 'hist' # CPU
    })

model_wrapped_final.fit(
    df, 
    (df['resp'] > 0).astype(np.byte)
)

Model used for fitting:
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.6, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.02, max_delta_step=None, max_depth=10,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              random_state=42, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=0.5, tree_method='gpu_hist',
              validate_parameters=None, verbosity=None)








XGBClassifier_wrapper(params=None)

In [123]:
model_wrapped_final.model_internal.save_model(MODEL_FILE_META)

In [126]:
df_featimportance_final = pd.DataFrame(model_wrapped.model_internal.feature_importances_, index=FEATURES_LIST_TOTRAIN + [0,3,4] + ['resp_n1_predict'], columns=['Importance']).sort_values(by='Importance', ascending=False)
df_featimportance_cumulated_final = pd.concat([df_featimportance, pd.DataFrame({'% feat importance cumulé' : (df_featimportance['Importance'] / df_featimportance['Importance'].sum()).cumsum()})], axis=1)

In [127]:
df_featimportance_cumulated_final

Unnamed: 0,Importance,% feat importance cumulé
feature_42,0.013739,0.013739
feature_43,0.013437,0.027176
feature_45,0.013231,0.040407
feature_41,0.013018,0.053426
feature_44,0.012578,0.066004
feature_63,0.011994,0.077999
feature_39,0.011919,0.089918
feature_61,0.011712,0.10163
feature_5,0.011217,0.112847
feature_6,0.011094,0.123941


## Retrain resp n-1 model (final)

In [120]:
# Calculate label of current step
y_train1_resp_positive = (df.loc[:, 'resp'] > 0).astype(np.byte)

# Shift values of resp to get resp of step n-1
y_train1_resp_n1_positive = y_train1_resp_positive.shift(1, fill_value=0)


model_n1_final = XGBClassifier(
    random_state= 42,
    max_depth= 12,
    n_estimators= 500,
    learning_rate= 0.01,
    subsample= 0.9,
    colsample_bytree= 0.2,
    tree_method= 'gpu_hist',
    gamma = None,
    )

model_n1_final.fit(df.loc[:, FEATURES_LIST_TOTRAIN], y_train1_resp_n1_positive, verbose=True)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.2, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=24, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [122]:
model_n1_final.save_model(MODEL_FILE_RESPN1)

## Retrain fold model (final)

In [124]:
model_xgb_final = XGBClassifier(
    random_state= 42,
    max_depth= 10,
    n_estimators= 500,
    learning_rate= 0.02,
    subsample= 0.5,
    colsample_bytree= 0.6,
    tree_method= 'gpu_hist',
    gamma = None,
    #objective= 'binary:logistic',
    #disable_default_eval_metric=True,
    )

#model_xgb.fit(df.loc[folds_list_train1_unique, FEATURES_LIST_TOTRAIN], df.loc[folds_list_train1_unique, ['fold_'+str(i) for i in range(NB_FOLDS)]])
model_xgb_final.fit(df.loc[:, FEATURES_LIST_TOTRAIN], df.loc[:, 'fold_number'])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=24, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.5,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [125]:
model_xgb_final.save_model(MODEL_FILE_FOLD)

Check possible return values for fold 3 that does not perform well :

In [168]:
df_fold3_resp_positive = df.loc[folds_list_test[3]].query('resp > 0')

In [169]:
(df_fold3_resp_positive['weight'] * df_fold3_resp_positive['resp']).sum()

2201.1234249072004