# Predict Gamma so that this model can be stacked with the other base models

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
import xgboost
import optuna
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from copy import deepcopy
from sklearn.metrics import confusion_matrix, log_loss, average_precision_score
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from joblib import parallel_backend

In [None]:
# data directory; set the ICR_DATA_DIR environment variable to run the
# notebook on another machine, otherwise the original location is used
folder = os.environ.get('ICR_DATA_DIR', '/home/olli/Projects/Kaggle/ICR')

In [None]:
# file names of the competition tables inside the data folder
train_csv, greek_csv, test_csv = 'train.csv', 'greeks.csv', 'test.csv'

In [None]:
# full paths of the two tables that are loaded for training
train_csv_path, greek_csv_path = (
    os.path.join(folder, csv_name) for csv_name in (train_csv, greek_csv)
)

In [None]:
# df: training table, df_g: the greeks table read from greeks.csv
df, df_g = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, greek_csv_path)
)

# Preprocess Pipeline

In [None]:
# EJ is the only categorical feature; every remaining column except the id
# and the target is treated as numeric. Filtering (instead of list.remove)
# also works for a frame that lacks one of the excluded columns.
cat_features = ['EJ']
_non_numeric_cols = {'Id', 'Class', *cat_features}
num_features = [col for col in df.columns if col not in _non_numeric_cols]

In [None]:
# missing values: numeric columns get the column median,
# categorical columns get the most frequent category
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

In [None]:
# use a standardscaler due to the outliers
# NOTE(review): StandardScaler centres/scales with mean and standard
# deviation, which are themselves affected by outliers; if outlier robustness
# is the goal, RobustScaler may be what was intended -- confirm
scaler_num = StandardScaler()

In [None]:
# one-hot encode the categorical feature and return a dense array
encoder_cat = OneHotEncoder(sparse_output=False)

In [None]:
# numeric branch: impute first, then scale
num_pipeline = Pipeline(steps=[
    ('Num_Imputer', imputer_num),
    ('Num_Scaler', scaler_num),
])

In [None]:
# categorical branch: impute first, then one-hot encode
cat_pipeline = Pipeline(steps=[
    ('Cat_Imputer', imputer_cat),
    ('Cat_Encoder', encoder_cat),
])

In [None]:
# route the numeric and the categorical columns through their own branch;
# only the columns listed here are passed on to the model
preprocess_pipe = ColumnTransformer(transformers=[
    ('Num_Pipe', num_pipeline, num_features),
    ('Cat_Pipe', cat_pipeline, cat_features),
])

In [None]:
# attach the Alpha and Gamma columns of the greeks table to a copy of the
# training frame so the same split as in the final training can be drawn
# NOTE(review): the columns are aligned on the row index, not on an id
# column -- assumes both files list the samples in the same order, confirm
X_DF = df.assign(Alpha=df_g.Alpha, Gamma=df_g.Gamma)

In [None]:
# quick look at the merged frame (features plus Alpha and Gamma)
X_DF.head()

Recall from Notebook_1: Gamma_M had the highest correlation with the target, and most datapoints were Gamma_M. Gamma_H also had a high correlation, but there were fewer datapoints.

# Function to create the multi-class label

In [None]:
# Problem: the target is a pandas Series and needs to be converted to a multi-class numpy array

def create_gamma_label(series, onehot=True):
    """Encode Gamma categories as integer class ids or one-hot rows.

    Mapping: 'M' -> 0, 'H' -> 1, every other value (including missing
    values) -> 2.

    Parameters
    ----------
    series : pandas Series or any 1-D sequence of Gamma values
        Only read, never modified.
    onehot : bool, default True
        If True, return one-hot rows in the column order
        [gamma_M, gamma_H, gamma_other]; otherwise return the class ids.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_samples, 3) when ``onehot`` is True, else an
        integer array of shape (n_samples,). Empty input gives (0, 3) and
        (0,) respectively.
    """
    class_ids = {'M': 0, 'H': 1}

    # dtype=object keeps the raw Python values, so the lookup below behaves
    # the same for Series, lists and arrays
    values = np.asarray(series, dtype=object)

    # anything that is not M or H falls into the shared "other" class
    class_labels = np.array(
        [class_ids.get(value, 2) for value in values], dtype=int
    )

    if not onehot:
        return class_labels

    # row i of the identity matrix is the one-hot vector of class i; indexing
    # with an integer array keeps the (n, 3) shape even when n == 0
    return np.eye(3)[class_labels]

In [None]:
# check the function on a copy of the Gamma column
exp_gamma = X_DF['Gamma'].copy()

In [None]:
# first 15 raw Gamma values, to compare with the encoded output below
exp_gamma.head(n=15)

In [None]:
# one-hot encoding (default onehot=True)
exp_gamma_encoded = create_gamma_label(exp_gamma)

In [None]:
# first 15 one-hot rows; columns are [gamma_M, gamma_H, gamma_other]
exp_gamma_encoded[:15]

In [None]:
# Also calculate the initial weights to try:
# the class weights are based on the number of datapoints per Gamma category
exp_gamma.value_counts()

In [None]:
# total number of datapoints
len(exp_gamma)

In [None]:
# integer class ids (M==0, H==1, other==2) instead of one-hot rows
exp_gamma_enc_2 = create_gamma_label(exp_gamma, onehot=False)

In [None]:
# first 15 integer class ids
exp_gamma_enc_2[:15]

In [None]:
# number of distinct class ids that actually occur (at most 3)
len(np.unique(exp_gamma_enc_2))

In [None]:
# set weight M to 1 and weight the other classes by how much rarer they are
# than M; the counts are read from the data instead of being typed in by hand
_gamma_counts = exp_gamma.value_counts()
_n_M = int(_gamma_counts['M'])
_n_H = int(_gamma_counts['H'])
_n_other = len(exp_gamma) - _n_M - _n_H
print(f'H: {_n_M / _n_H}; other: {_n_M / _n_other}')

# Optimize Hyperparameters

In [None]:
def objective(trial):
    """Optuna objective for the first, wide hyperparameter search.

    Samples per-class sample weights and XGBoost hyperparameters, then runs
    5-fold cross-validation (stratified on Alpha + Gamma) for 10 different
    shuffling seeds. Each fold fits the preprocessing pipeline and a
    3-class XGBoost classifier on the training part and scores the
    predicted probabilities on the validation part.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean weighted average-precision score over all 50 folds
        (higher is better).
    """
    n_classes = 3

    # every value is sampled exactly once per trial, before the fold loop
    # per-class weights, keyed by the integer class id (M==0, H==1, other==2)
    class_weights = {
        0: trial.suggest_float('weight_M', 0.7, 1.5),
        1: trial.suggest_float('weight_H', 6, 11),
        2: trial.suggest_float('weight_other', 3, 4.5),
    }

    xgb_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 5, 1000),
        max_depth=trial.suggest_int('max_depth', 2, 10),
        learning_rate=trial.suggest_float('lr', 0.01, 1),
        gamma=trial.suggest_float('gamma', 0, 1),
        min_child_weight=trial.suggest_float('min_child_weight', 0, 10),
        max_delta_step=trial.suggest_int('max_delta_step', 0, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1),
        colsample_bynode=trial.suggest_float('colsample_bynode', 0.5, 1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.5, 1),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True),
        objective='multi:softprob',
        num_class=n_classes,
    )
    # passed to the constructor: the fit() keyword is deprecated in xgboost
    early_stop = trial.suggest_int('early_stop', 1, 1000)

    gamma_labels = X_DF['Gamma']
    # use alpha+gamma to draw stratified samples
    strat_labels = X_DF[['Alpha', 'Gamma']].copy()

    # weighted average precision is used for the imbalanced task and to
    # utilize the predicted probabilities
    scores = []

    for seed in range(10):

        cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_index, valid_index in cv.split(X_DF, strat_labels):

            # the split yields positions, so select with iloc throughout;
            # all columns are passed since the ColumnTransformer only
            # selects the defined ones
            X_train = preprocess_pipe.fit_transform(X_DF.iloc[train_index])
            X_valid = preprocess_pipe.transform(X_DF.iloc[valid_index])  # no fit

            # integer class ids for xgboost
            y_train = create_gamma_label(gamma_labels.iloc[train_index], onehot=False)
            y_valid = create_gamma_label(gamma_labels.iloc[valid_index], onehot=False)

            # individual weight for each training point based on its class
            sample_weights = np.array([class_weights[label] for label in y_train])

            xgb = xgboost.XGBClassifier(early_stopping_rounds=early_stop,
                                        **xgb_params)

            # fit exactly once: a second plain fit would replace this model
            # and drop both the sample weights and the early stopping
            xgb.fit(X_train,
                    y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    sample_weight=sample_weights,
                    verbose=0
                    )

            y_val_pred = xgb.predict_proba(X_valid)  # (n_samples, 3) for 3 classes

            # the metric needs one-hot targets; the width is fixed to the
            # number of classes so it always matches the prediction columns
            y_valid_onehot = np.eye(n_classes)[y_valid]

            score = average_precision_score(y_valid_onehot, y_val_pred, average='weighted')
            scores.append(score)

    return float(np.mean(scores))

In [None]:
# the objective returns a score where higher is better, hence 'maximize'
study = optuna.create_study(direction='maximize')

In [None]:
# keep the output readable during the search: ignore all Python warnings
# and only let optuna log messages of level WARNING or above
# NOTE: this also hides deprecation warnings raised by the libraries
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
%%time
# first search: 200 trials, each trial runs the full repeated cross-validation
study.optimize(objective, n_trials=200, show_progress_bar=True)

In [None]:
# best mean score returned by the objective in the first search
study.best_value

0.9138

In [None]:
# hyperparameters of the best trial; used to narrow the ranges for run 2
study.best_params

# Run 2

In [None]:
def objective_2(trial):
    """Optuna objective for the second, narrowed hyperparameter search.

    Same evaluation as the first search but with tighter sampling ranges
    and a fixed tree depth of 3: 5-fold cross-validation (stratified on
    Alpha + Gamma) repeated for 10 shuffling seeds, fitting the
    preprocessing pipeline and a 3-class XGBoost classifier per fold.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean weighted average-precision score over all 50 folds
        (higher is better).
    """
    n_classes = 3

    # every value is sampled exactly once per trial, before the fold loop
    # per-class weights, keyed by the integer class id (M==0, H==1, other==2)
    class_weights = {
        0: trial.suggest_float('weight_M', 1.2, 1.7),
        1: trial.suggest_float('weight_H', 5, 7),
        2: trial.suggest_float('weight_other', 3.25, 4.25),
    }

    xgb_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 850, 910),
        max_depth=3,
        learning_rate=trial.suggest_float('lr', 0.02, 0.1),
        gamma=trial.suggest_float('gamma', 0.05, 0.25),
        min_child_weight=trial.suggest_float('min_child_weight', 0.6, 1),
        max_delta_step=trial.suggest_int('max_delta_step', 6, 10),
        subsample=trial.suggest_float('subsample', 0.5, 0.7),
        colsample_bynode=trial.suggest_float('colsample_bynode', 0.85, 1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.75, 0.95),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.4, 0.6),
        reg_lambda=trial.suggest_float('reg_lambda', 0.1, 10, log=True),
        objective='multi:softprob',
        num_class=n_classes,
    )
    # passed to the constructor: the fit() keyword is deprecated in xgboost
    early_stop = trial.suggest_int('early_stop', 200, 260)

    gamma_labels = X_DF['Gamma']
    # use alpha+gamma to draw stratified samples
    strat_labels = X_DF[['Alpha', 'Gamma']].copy()

    # weighted average precision is used for the imbalanced task and to
    # utilize the predicted probabilities
    scores = []

    for seed in range(10):

        cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        for train_index, valid_index in cv.split(X_DF, strat_labels):

            # the split yields positions, so select with iloc throughout;
            # all columns are passed since the ColumnTransformer only
            # selects the defined ones
            X_train = preprocess_pipe.fit_transform(X_DF.iloc[train_index])
            X_valid = preprocess_pipe.transform(X_DF.iloc[valid_index])  # no fit

            # integer class ids for xgboost
            y_train = create_gamma_label(gamma_labels.iloc[train_index], onehot=False)
            y_valid = create_gamma_label(gamma_labels.iloc[valid_index], onehot=False)

            # individual weight for each training point based on its class
            sample_weights = np.array([class_weights[label] for label in y_train])

            xgb = xgboost.XGBClassifier(early_stopping_rounds=early_stop,
                                        **xgb_params)

            # fit exactly once: a second plain fit would replace this model
            # and drop both the sample weights and the early stopping
            xgb.fit(X_train,
                    y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    sample_weight=sample_weights,
                    verbose=0
                    )

            y_val_pred = xgb.predict_proba(X_valid)  # (n_samples, 3) for 3 classes

            # the metric needs one-hot targets; the width is fixed to the
            # number of classes so it always matches the prediction columns
            y_valid_onehot = np.eye(n_classes)[y_valid]

            score = average_precision_score(y_valid_onehot, y_val_pred, average='weighted')
            scores.append(score)

    return float(np.mean(scores))

In [None]:
# separate study for the narrowed search; higher score is better
study_2 = optuna.create_study(direction='maximize')

In [None]:
%%time
# second search: 40 trials within the narrowed ranges of objective_2
study_2.optimize(objective_2, n_trials=40, show_progress_bar=True)

In [None]:
# best mean score returned by objective_2 in the second search
study_2.best_value

0.91545

In [None]:
# hyperparameters of the best trial of the second search
study_2.best_params

In [None]:
# final settings for the Gamma model
# (presumably the best trial of run 2 plus the fixed settings -- confirm)
# NOTE: the keys follow the optuna parameter names, so 'weight_M',
# 'weight_H', 'weight_other', 'lr' and 'early_stop' are not XGBClassifier
# keyword arguments and need to be mapped before building the model
gamma_xgb_params = dict([
    ('weight_M', 1.6574709291749559),
    ('weight_H', 6.494618704275055),
    ('weight_other', 4.141839934777784),
    ('n_estimators', 904),
    ('max_depth', 3),
    ('lr', 0.025917942386337954),
    ('gamma', 0.23400159239280982),
    ('min_child_weight', 0.8473270431776004),
    ('max_delta_step', 9),
    ('subsample', 0.5309248067433621),
    ('colsample_bynode', 0.9777134591145428),
    ('colsample_bytree', 0.8372737842373374),
    ('colsample_bylevel', 0.5711105969683837),
    ('reg_lambda', 0.10145045118989354),
    ('early_stop', 218),
    ('objective', 'multi:softprob'),
    ('num_class', 3),
])