# Use a Logistic Regression to stack the 4 optimized models

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
import xgboost
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import optuna
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from copy import deepcopy
from sklearn.metrics import confusion_matrix
import seaborn as sns
from tqdm.notebook import tqdm

In [2]:
folder = '/home/olli/Projects/Kaggle/ICR'

In [3]:
train_csv = 'train.csv'
greek_csv = 'greeks.csv'
test_csv = 'test.csv'

In [4]:
train_csv_path = os.path.join(folder, train_csv)
greek_csv_path = os.path.join(folder, greek_csv)

In [5]:
df = pd.read_csv(train_csv_path)
df_g = pd.read_csv(greek_csv_path)

# Build a preprocessing pipeline (imputing, encoding, scaling)

In [6]:
cat_features = ['EJ']
num_features = list(df.columns)
for remove_value in ['Id', 'EJ', 'Class']:
    num_features.remove(remove_value)

In [7]:
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

In [8]:
# use a standardscaler due to the outliers
scaler_num = StandardScaler()

In [9]:
encoder_cat = OneHotEncoder(sparse_output=False)

In [10]:
num_pipeline = Pipeline([
    ('Num_Imputer', imputer_num),
    ('Num_Scaler', scaler_num)
])

In [11]:
cat_pipeline = Pipeline([
    ('Cat_Imputer', imputer_cat),
    ('Cat_Encoder', encoder_cat)
])

In [12]:
preprocess_pipe = ColumnTransformer([
    ('Num_Pipe', num_pipeline, num_features),
    ('Cat_Pipe', cat_pipeline, cat_features)
])

In [13]:
# merge dfs for split
X_DF = df.copy()
X_DF['Alpha'] = df_g.Alpha
X_DF['Gamma'] = df_g.Gamma

In [14]:
# metric
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)  # [num_class_0, num_class_1]
    return log_loss(y_true, y_pred, sample_weight=1/nc[y_true], eps=1e-15)

In [15]:
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Define the optimized hyperparameters from the 4 models

In [16]:
xgb_params = {'n_estimators': 540,
                 'max_depth': 3,
                 'learning_rate': 0.037390264787993115,
                 'gamma': 0.503401096426527,
                 'min_child_weight': 6.0081952605965405,
                 'max_delta_step': 5,
                 'subsample': 0.9675791374036842,
                 'colsample_bynode': 0.5385701026123967,
                 'colsample_bytree': 0.548075108799538,
                 'colsample_bylevel': 0.936147174988883,
                 'reg_lambda': 0.020026716033135116,
                 'scale_pos_weight': 6.362746795831511,
                 'objective': 'binary:logistic'
              }

In [17]:
svm_params = {'C': 5,
              'kernel': 'rbf',
              'gamma': 0.017,
              'coef0': -0.74,
              'class_weight': {0: 1, 1: 7.6},
              'probability': True
             }

In [18]:
extra_tree_params = {'n_estimators': 85,
                     'max_depth': 13,
                     'min_samples_split': 0.03500662766420863,
                     'min_samples_leaf': 0.002061652693906283,
                     'min_weight_fraction_leaf': 0.00616078547424613,
                     'max_features': 0.9999007675837779,
                     'max_leaf_nodes': 31,
                     'min_impurity_decrease': 0.0004818371664378797,
                     'class_weight': {0: 1, 1: 4.9}
                    }

In [19]:
random_forest_params = {'n_estimators': 167,
                        'criterion': 'entropy',
                        'max_depth': 10,
                        'min_samples_split': 0.05,
                        'min_samples_leaf': 0.016,
                        'min_weight_fraction_leaf': 0.0174,
                        'max_features': 0.43112450096106114,
                        'max_leaf_nodes': 28,
                        'min_impurity_decrease': 0.00012,
                        'bootstrap': False,
                        'class_weight': {0: 1, 1: 9.747981332376273}
                        }

# Build a nested cross validation & save the pred / y for each inner fold

In [20]:
scores = []

seeds = list(range(0, 10))

for seed in tqdm(seeds):
    
    # cross-validation
    cv_out = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # use alpha+gamma to draw stratified samples
    for train_index, valid_index in cv_out.split(X_DF, X_DF[['Alpha', 'Gamma']].copy()):

        # take all columns for the features since the ColumnTransformer will only select the defined ones
        X_train, y_train = X_DF.iloc[train_index], X_DF.loc[train_index, 'Class']
        X_valid, y_valid = X_DF.iloc[valid_index], X_DF.loc[valid_index, 'Class']
        
        # now train 20 models (4 models * 5 folds) and use the trained ones on the outer hold-out fold
        xgb = xgboost.XGBClassifier(**xgb_params)
        svm = SVC(**svm_params)
        extra_tree = ExtraTreesClassifier(**extra_tree_params)
        random_forest = RandomForestClassifier(**random_forest_params)
        
        # use defined pipeline
        X_train = preprocess_pipe.fit_transform(X_train)
        X_valid = preprocess_pipe.transform(X_valid)  # no fit

        y_train = np.array(y_train)
        y_valid = np.array(y_valid)        
        
        xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], \
                early_stopping_rounds=196, verbose=0)
        svm.fit(X_train, y_train)
        extra_tree.fit(X_train, y_train)
        random_forest.fit(X_train, y_train)
        
        models = [xgb, svm, extra_tree, random_forest]
        
        # append each predictions from the 4 models, then average the probabilities
        predictions = []
        
        for model in models:
            pred = model.predict_proba(X_valid)
            predictions.append(pred)
        
        #average the predictions
        predictions = np.array(predictions).mean(axis=0)
        
        # calculate score for this validation fold from the averaged predictions
        score = balanced_log_loss(y_valid, predictions)
        scores.append(score)

  0%|          | 0/10 [00:00<?, ?it/s]

#### 10 seeds * 5-fold-cv should make 50 scores again

In [21]:
len(scores)

50

In [22]:
print(f'Final Score: {np.array(scores).mean()}')

Final Score: 0.3085278689596406


#### Final Score did not get better then with the xgb / random forest alone

### Since the SVM had by far the worst score try an ensemble without it

In [23]:
scores_2 = []

seeds = list(range(0, 10))

for seed in tqdm(seeds):
    
    # cross-validation
    cv_out = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # use alpha+gamma to draw stratified samples
    for train_index, valid_index in cv_out.split(X_DF, X_DF[['Alpha', 'Gamma']].copy()):

        # take all columns for the features since the ColumnTransformer will only select the defined ones
        X_train, y_train = X_DF.iloc[train_index], X_DF.loc[train_index, 'Class']
        X_valid, y_valid = X_DF.iloc[valid_index], X_DF.loc[valid_index, 'Class']
        
        # now train 20 models (4 models * 5 folds) and use the trained ones on the outer hold-out fold
        xgb = xgboost.XGBClassifier(**xgb_params)
        extra_tree = ExtraTreesClassifier(**extra_tree_params)
        random_forest = RandomForestClassifier(**random_forest_params)
        
        # use defined pipeline
        X_train = preprocess_pipe.fit_transform(X_train)
        X_valid = preprocess_pipe.transform(X_valid)  # no fit

        y_train = np.array(y_train)
        y_valid = np.array(y_valid)        
        
        xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], \
                early_stopping_rounds=196, verbose=0)
        extra_tree.fit(X_train, y_train)
        random_forest.fit(X_train, y_train)
        
        models = [xgb, extra_tree, random_forest]
        
        # append each predictions from the 4 models, then average the probabilities
        predictions = []
        
        for model in models:
            pred = model.predict_proba(X_valid)
            predictions.append(pred)
        
        #average the predictions
        predictions = np.array(predictions).mean(axis=0)
        
        # calculate score for this validation fold from the averaged predictions
        score = balanced_log_loss(y_valid, predictions)
        scores_2.append(score)

  0%|          | 0/10 [00:00<?, ?it/s]

In [24]:
print(f'Final Score without SVM: {np.array(scores_2).mean()}')

Final Score without SVM: 0.29423840919388783


### Only the XGB and Random Forest

In [25]:
scores_3 = []

seeds = list(range(0, 10))

for seed in tqdm(seeds):
    
    # cross-validation
    cv_out = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # use alpha+gamma to draw stratified samples
    for train_index, valid_index in cv_out.split(X_DF, X_DF[['Alpha', 'Gamma']].copy()):

        # take all columns for the features since the ColumnTransformer will only select the defined ones
        X_train, y_train = X_DF.iloc[train_index], X_DF.loc[train_index, 'Class']
        X_valid, y_valid = X_DF.iloc[valid_index], X_DF.loc[valid_index, 'Class']
        
        # now train 20 models (4 models * 5 folds) and use the trained ones on the outer hold-out fold
        xgb = xgboost.XGBClassifier(**xgb_params)
        random_forest = RandomForestClassifier(**random_forest_params)
        
        # use defined pipeline
        X_train = preprocess_pipe.fit_transform(X_train)
        X_valid = preprocess_pipe.transform(X_valid)  # no fit

        y_train = np.array(y_train)
        y_valid = np.array(y_valid)        
        
        xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], \
                early_stopping_rounds=196, verbose=0)
        random_forest.fit(X_train, y_train)
        
        models = [xgb, random_forest]
        
        # append each predictions from the 4 models, then average the probabilities
        predictions = []
        
        for model in models:
            pred = model.predict_proba(X_valid)
            predictions.append(pred)
        
        #average the predictions
        predictions = np.array(predictions).mean(axis=0)
        
        # calculate score for this validation fold from the averaged predictions
        score = balanced_log_loss(y_valid, predictions)
        scores_3.append(score)

  0%|          | 0/10 [00:00<?, ?it/s]

In [26]:
print(f'Final Score (XGB & Random Forest): {np.array(scores_3).mean()}')

Final Score (XGB & Random Forest): 0.2720216365485635


# Next: Use a Logistic Regression on these individual models (stacking)