In [None]:
%%HTML
<style type="text/css">
div.h1 {
    background-color: #009C37; 
    color: white; padding: 8px; padding-right: 300px; font-size: 35px; max-width: 1500px; margin: auto; margin-top: 50px;}
div.h2 {
    background-color: #002277; 
    color: white; padding: 8px; padding-right: 300px; font-size: 25px; max-width: 1500px; margin: auto; margin-top: 50px;}
div.h3 {
    color: #002277;
    font-size: 16px; margin-top: 20px; margin-bottom:4px;}
div.h4 {
    font-size: 15px; margin-top: 20px; margin-bottom: 8px;}
span.note {
    font-size: 5; color: #002277;
    font-style: italic;}
span.captiona {
    font-size: 5; color: dimgray; font-style: italic; margin-left: 130px; vertical-align: top;}
hr {
    display: block; color: #002277;
    height: 1px; border: 0;  border-top: 1px solid;}
hr.light {
    display: block; color: lightgray; height: 1px; border: 0; border-top: 1px solid;}
</style>

# <div class="h1">Starter Here: StackNetClassifier 📚</div>
![](https://github.com/kaz-Anova/StackNet/raw/master/images/StackNet_Logo.png?raw=true)
by [kazanova](https://www.kaggle.com/kazanova)


<a id='top'></a>
<div class="h1">Contents</div>

1. [Glimpse of Data](#t1)
    1.2 [Read in Data](#t1_2)
    
2. [Modeling - Ensembling With StackNet](#t2)
    2.1 [StackNet Modeling](#t2_1)
    2.2 [Evaluation StackNet ROC_AUC](#t2_2)
    2.3 [Submission](#t2_3)

3. [General Findings](#t3)

4. [End Notebooks](#end)


<a id='t1_1'></a>
# <div class="h3"> Imports</div>
[Next](#t1_2)


###### Import PyStackNet Package

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Suppr warning
import warnings

# Import PyStackNet Package
# Source: https://www.kaggle.com/kirankunapuli/pystacknet
import os
import sys
# The package is imported from a dataset directory, so that directory is
# appended to sys.path before `import pystacknet` is attempted.
package_dir = "../input/py-stack-net/h2oai-pystacknet-af571e0"
sys.path.append(package_dir)
import pystacknet
import joblib
# Register the standalone joblib module under the legacy dotted name so that any
# later import of `sklearn.externals.joblib` resolves to it.
# NOTE(review): presumably needed because pystacknet still imports joblib from
# sklearn.externals -- confirm against the package source.
sys.modules['sklearn.externals.joblib'] = joblib

# Silence all library warnings so the training output below stays readable.
warnings.filterwarnings("ignore")
# Use the fully qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options on newer pandas releases,
# which makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Show which entries are available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)


In [None]:

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics

In [None]:
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import matplotlib.patches as patches

Utils

In [None]:
def evalBinaryClassifier(model, x, y, labels=['Positives','Negatives']):
    '''
    source: https://towardsdatascience.com/how-to-interpret-a-binary-logistic-regressor-with-scikit-learn-6d56c5783b49
    Visualize the performance of a fitted binary classifier.

    Displays a labelled Confusion Matrix, the distributions of the predicted
    positive-class probabilities for both true classes, and the ROC curve with
    the current decision point, then prints precision, recall and F1.
    Hard predictions are obtained by rounding the positive-class probability.
    Author: gregcondit.com/articles/logr-charts

    Parameters
    ----------
    model : fitted model exposing predict_proba and a classes_ attribute.
        Exactly two classes are required and one of them must be labelled 1;
        that class is treated as the positive class.

    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Samples to evaluate.

    y : array-like, shape (n_samples,)
        Target vector relative to x, encoded as 0 (negative) / 1 (positive).

    labels: list, optional
        list of text labels for the two classes, with the positive label first

    Displays
    ----------
    3 Subplots

    Returns
    ----------
    F1: float
        F1 score of the rounded predictions; 0.0 when precision and recall
        are both zero or undefined.

    Raises
    ----------
    ValueError
        If the model does not have exactly two classes, or if neither class
        is labelled 1.
    '''
    classes = list(model.classes_)
    if len(classes) != 2:
        raise ValueError('A binary class problem is required')
    if 1 not in classes:
        # Previously this case left pos_p unbound and crashed later on.
        raise ValueError('The positive class must be labelled 1')

    # Predict once and keep the column that belongs to the positive class,
    # wherever it sits in model.classes_.
    p = model.predict_proba(x)
    pos_p = p[:, classes.index(1)]

    #FIGURE
    plt.figure(figsize=[15,4])

    #1 -- Confusion matrix
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never
    # observed or predicted, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y, pos_p.round(), labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    plt.subplot(131)
    ax = sns.heatmap(cm, annot=True, cmap='Blues', cbar=False,
                annot_kws={"size": 14}, fmt='g')
    cmlabels = ['True Negatives', 'False Positives',
              'False Negatives', 'True Positives']
    for i,t in enumerate(ax.texts):
        t.set_text(t.get_text() + "\n" + cmlabels[i])
    plt.title('Confusion Matrix', size=15)
    plt.xlabel('Predicted Values', size=13)
    plt.ylabel('True Values', size=13)

    #2 -- Distributions of Predicted Probabilities of both classes
    df = pd.DataFrame({'probPos':pos_p, 'target': y})
    plt.subplot(132)
    plt.hist(df[df.target==1].probPos, density=True, bins=25,
             alpha=.5, color='green',  label=labels[0])
    plt.hist(df[df.target==0].probPos, density=True, bins=25,
             alpha=.5, color='red', label=labels[1])
    plt.axvline(.5, color='blue', linestyle='--', label='Boundary')
    plt.xlim([0,1])
    plt.title('Distributions of Predictions', size=15)
    plt.xlabel('Positive Probability (predicted)', size=13)
    plt.ylabel('Samples (normalized scale)', size=13)
    plt.legend(loc="upper right")

    #3 -- ROC curve with annotated decision point
    fp_rates, tp_rates, _ = roc_curve(y, pos_p)
    roc_auc = auc(fp_rates, tp_rates)
    # Guard every ratio against an empty denominator.
    false_positive_rate = fp / (fp + tn) if (fp + tn) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    plt.subplot(133)
    plt.plot(fp_rates, tp_rates, color='green',
             lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], lw=1, linestyle='--', color='grey')
    #plot current decision point:
    plt.plot(false_positive_rate, recall, 'bo', markersize=8, label='Decision Point')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', size=13)
    plt.ylabel('True Positive Rate', size=13)
    plt.title('ROC Curve', size=15)
    plt.legend(loc="lower right")
    plt.subplots_adjust(wspace=.3)
    plt.show()

    #Print and Return the F1 score
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    if precision + recall:
        F1 = 2*(precision * recall) / (precision + recall)
    else:
        F1 = 0.0
    printout = (
        f'Precision: {round(precision,2)} | '
        f'Recall: {round(recall,2)} | '
        f'F1 Score: {round(F1,2)} | '
    )
    print(printout)
    return F1

# source: https://www.kaggle.com/carlolepelaars/ensembling-with-stacknet/notebook
def auc_score(y_true, y_pred):
    """
    Return the area under the ROC curve for the given labels and scores.

    Thin wrapper that forwards both arguments to roc_auc_score.
    """
    area_under_curve = roc_auc_score(y_true, y_pred)
    return area_under_curve
def plot_curve(y_true_train, y_pred_train, y_true_val, y_pred_val, model_name):
    """
    Draw the train and validation ROC curves on one square figure.

    Each curve's legend entry carries its AUC rounded to four decimals, and
    a dashed diagonal is drawn from (0, 0) to (1, 1) for comparison.
    `model_name` is used in the plot title.
    """
    train_fpr, train_tpr, _ = roc_curve(y_true_train, y_pred_train, pos_label=1)
    val_fpr, val_tpr, _ = roc_curve(y_true_val, y_pred_val, pos_label=1)

    plt.figure(figsize=(8, 8))

    # Curves: train in black, validation in orange.
    plt.plot(
        train_fpr, train_tpr,
        color='black', lw=2,
        label=f"ROC train curve (AUC = {round(roc_auc_score(y_true_train, y_pred_train), 4)})",
    )
    plt.plot(
        val_fpr, val_tpr,
        color='darkorange', lw=2,
        label=f"ROC validation curve (AUC = {round(roc_auc_score(y_true_val, y_pred_val), 4)})",
    )
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Same padded range on both axes.
    axis_range = [-0.05, 1.05]
    plt.xlim(axis_range)
    plt.ylim(axis_range)

    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(f'ROC Plot for {model_name}', weight="bold", fontsize=20)
    plt.legend(loc="lower right", fontsize=16)

def dist_target(df, target, msg_title):
    """
    Plot a 100-bin histogram of column `target` of `df`.

    The x axis is limited to [0, 1] and `msg_title` is used as the title.
    """
    axes = plt.figure(figsize=(12, 4)).gca()
    axes.hist(df[target], bins=100)
    axes.set_title(msg_title, weight='bold', fontsize=18)
    axes.set_xlabel("Predictions", fontsize=15)
    axes.set_ylabel("Frequency", fontsize=15)
    axes.set_xlim(0, 1)

<a id='t1_2'></a>
Read data

In [None]:
# Load the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv',
    )
)

<a id='t2'></a>
# <div class="h1">Modeling</div>

[Back to Contents](#top)

[Next](#t2_2)

# <div class="h3">Ensembling With StackNet</div>

[Next](#t2_1)
![](https://github.com/kaz-Anova/StackNet/raw/master/images/StackNet_Logo.png?raw=true)
StackNet was created by Kaggle Grandmaster Marios Michailidis ([kazanova](https://www.kaggle.com/kazanova)) as part of his PhD. Thanks to [Kiran Kunapuli](https://www.kaggle.com/kirankunapuli) for uploading the package as a Kaggle dataset so it can conveniently be used with Kaggle kernels.

#### Introducing StackNet Meta-Modelling Framework

In [None]:
# Suppress warnings
import warnings
from IPython.display import HTML

# Ignore the three warning categories, installed in the same order as before.
for noisy_category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

# Embedded YouTube video; as the last expression of the cell it is rendered as its output.
HTML('<iframe width="1280" height="720" src="https://www.youtube.com/embed/gNPphk98pnI" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')

In [None]:
# Feature column names: cat0..cat18 and cont0..cont10.
cat_cols = [f'cat{idx}' for idx in range(19)]
cont_cols = [f'cont{idx}' for idx in range(11)]

In [None]:
# Integer-encode each categorical column with an encoder fitted on the
# train and test values together, so both frames share one mapping.
for col in cat_cols:
    encoder = LabelEncoder()
    train_values = train[col].tolist()
    test_values = test[col].tolist()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)
print('Labelling done.')

# Standardize the continuous columns: fitted on train, then applied to test.
scaler = StandardScaler()
train[cont_cols] = scaler.fit_transform(train[cont_cols])
test[cont_cols] = scaler.transform(test[cont_cols])

<a id='t2_1'></a>
# <div class="h2">StackNet Modeling</div>

[Back to Contents](#top)

[Next](#t2_2)
 
StackNet allows you to define all kinds of models. For example, Sklearn models, LightGBM, XGBoost, CatBoost and Keras models can all be used with StackNet. For the individual models, you are responsible for not overfitting.

In [None]:
# Level 1 are the base models that take the training dataset as input

# LightGBM regressor trained with the binary objective.
# NOTE(review): feature_fraction/colsample_bytree, bagging_fraction/subsample and
# min_data_in_leaf/min_child_samples look like LightGBM alias pairs given
# different values -- confirm which value is actually used and drop the other.
l1_clf1 = LGBMRegressor(boosting_type='gbdt',
                        objective="binary",
                        metric="AUC",
                        boost_from_average="false",
                        learning_rate=0.05,
                        num_leaves=721,
                        max_depth=21,
                        min_child_weight=0.035,
                        feature_fraction=0.38,
                        bagging_fraction=0.42,
                        min_data_in_leaf=121,
                        max_bin=255,
                        importance_type='split',
                        reg_alpha=0.4,
                        reg_lambda=0.65,
                        bagging_seed=21,
                        random_state=2021,
                        verbosity=-1,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        min_child_samples=79)

# `objective` is an alias of `loss_function` in CatBoost, so only one of them
# may be given. The conflicting loss_function='Logloss' (a classification loss,
# not valid for a regressor) was removed and the regression objective kept.
l1_clf3 = CatBoostRegressor(learning_rate=0.1,
                            bagging_temperature=0.1,
                            l2_leaf_reg=30,
                            depth=12,
                            max_bin=255,
                            iterations=320,
                            objective='RMSE',
                            eval_metric="AUC",
                            bootstrap_type='Bayesian',
                            random_seed=2021,
                            early_stopping_rounds=21)


# Level 2 models will take predictions from level 1 models as input
# Remember to keep level 2 models smaller
# Basic models like Ridge Regression with large regularization or small random forests work well
l2_clf1 = RandomForestRegressor(n_estimators=721,
                                max_depth=5,
                                max_features='sqrt',
                                random_state=2021)

The model is compiled and fitted through a familiar sklearn-like API. The StackNetClassifier will perform cross-validation (CV) and will output the CV scores for each model.

In [None]:
# Hold out 21% of the rows for validation, stratified on the target.
target_col = 'target'
features = cat_cols + cont_cols
target = train[target_col]
X_train = train[features]
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    target,
    test_size=0.21,
    stratify=target,
    random_state=2021,
)

Specify model tree for StackNet

In [None]:
# Model tree for StackNet: one inner list per stacking level.
models = [
    [l1_clf1, l1_clf3],  # Level 1
    [l2_clf1],           # Level 2
]

In [None]:
from pystacknet.pystacknet import StackNetClassifier

# Configure the stacked model over the two-level model tree.
model = StackNetClassifier(
    models,
    folds=5,
    metric="auc",
    use_proba=True,  # To use predict_proba after training
    use_retraining=True,
    restacking=False,
    n_jobs=1,
    random_state=2021,
    verbose=1,
)

# Train the whole model tree on the training split.
model.fit(X_train, y_train)

Get score on training set and validation set for our StackNetClassifier

In [None]:
# Positive-class probabilities (column 1) for the training and validation splits.
train_preds, val_preds = (
    model.predict_proba(split)[:, 1] for split in (X_train, X_val)
)
# AUC of the StackNetClassifier on each split.
train_score, val_score = (
    auc_score(truth, preds)
    for truth, preds in ((y_train, train_preds), (y_val, val_preds))
)

<a id='t2_2'></a>
# <div class="h2">Evaluation StackNet</div>

[Back to Contents](#top)

[Next](#t2_3)


In [None]:
# Report both AUC values, rounded to four decimals, one per line.
print(
    f"StackNet AUC on training set: {round(train_score, 4)}",
    f"StackNet AUC on validation set: {round(val_score, 4)}",
    sep="\n",
)



The dashed navy line marks the baseline (AUC = 0.5). The final validation score is the area under the orange curve, which is shown in the plot legend.

In [None]:
# ROC curves of the StackNet model on the training and validation splits.
plot_curve(
    y_true_train=y_train,
    y_pred_train=train_preds,
    y_true_val=y_val,
    y_pred_val=val_preds,
    model_name="StackNet Baseline",
)

In [None]:
# Diagnostic charts for the StackNet model on the validation split; keeps the returned F1.
eval_metric = evalBinaryClassifier(model=model, x=X_val, y=y_val)

Predictions with a probability above 0.5 are scored as positive.

#### Classification report train

In [None]:
from sklearn.metrics import classification_report

# Hard predictions obtained by rounding the training probabilities.
train_pred_labels = train_preds.round()
train_cm = confusion_matrix(y_train, train_pred_labels)
print('Confusion matrix: \n', train_cm)
train_report = classification_report(y_train, train_pred_labels)
print('Classification report: \n', train_report)

In [None]:
# Annotated heatmap of the training confusion matrix (integer cell format).
sns.heatmap(data=train_cm, annot=True, fmt="d")
plt.show()

### LGBMClassifier

In [None]:
# Standalone LightGBM classifier: parameters collected in one dict, then unpacked.
lgb_params = dict(
    max_bin=72,
    num_leaves=271,
    num_iterations=721,
    learning_rate=0.02,
    tree_learner="serial",
    task="train",
    is_training_metric=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    sparse_threshold=1.0,
    save_binary=True,
    seed=42,
    feature_fraction_seed=42,
    bagging_seed=42,
    drop_seed=42,
    data_random_seed=42,
    objective="binary",
    boosting_type="gbdt",
    verbose=1,
    metric="auc",
    is_unbalance=True,
    boost_from_average=False,
)
clf_lgb = LGBMClassifier(**lgb_params)

# Fit on the training split only.
clf_lgb.fit(X_train, y_train)

In [None]:
# Diagnostic charts for the LightGBM classifier on the validation split; keeps the returned F1.
eval_metric_lgb = evalBinaryClassifier(model=clf_lgb, x=X_val, y=y_val)

### XGBClassifier

In [None]:
# Standalone XGBoost classifier.
# eval_metric is set on the estimator rather than passed to fit(): the fit()
# keyword is deprecated and was removed in XGBoost 2.0, where it raises a TypeError.
clf_xgb = XGBClassifier(
    n_estimators=721,
    max_depth=21,
    learning_rate=0.02,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=21,
    eval_metric=["auc", "logloss"],
)
# Fit on the training split only.
clf_xgb.fit(X_train, y_train, verbose=True)

In [None]:
# Diagnostic charts for the XGBoost classifier on the validation split; keeps the returned F1.
eval_metric_xgb = evalBinaryClassifier(model=clf_xgb, x=X_val, y=y_val)

In [None]:
from catboost import CatBoostClassifier
import catboost as catboost

# Parameters for the standalone CatBoost classifier.
# `objective` is an alias of `loss_function` in CatBoost, so only one of them
# may be given; the conflicting 'loss_function': "Logloss" entry was removed.
# NOTE(review): keeps the `objective` value on the assumption that it was the
# one in effect -- switch to 'Logloss' if that was the intended loss.
param_cb = {
        'learning_rate': 0.2,
        'bagging_temperature': 0.1,
        'l2_leaf_reg': 30,
        'depth': 12,
        'max_bin':255,
        'iterations' : 721,
        'objective':'CrossEntropy',
        'eval_metric' : "AUC",
        'bootstrap_type' : 'Bayesian',
        'random_seed':42,
        'early_stopping_rounds' : 121,
}
clf_ctb = CatBoostClassifier(silent=True, **param_cb)
# Fit on the training split only.
clf_ctb.fit(X_train, y_train)

#### StackNetClassifier

In [None]:
# Second model tree, built from the three standalone classifiers.
models_ = [
    [clf_lgb, clf_xgb, clf_ctb],  # first level
    [clf_lgb],                    # second level
]

# Same StackNet settings as the first stack, except retraining is off and the seed is 42.
model_ = StackNetClassifier(
    models_,
    folds=5,
    metric="auc",
    use_proba=True,
    use_retraining=False,
    restacking=False,
    random_state=42,
    verbose=1,
)

model_.fit(X_train, y_train)

<a id='t2_3'></a>
# <div class="h2">Submission</div>

[Back to Contents](#top)

[Next](#t3)


submission_lgb

In [None]:
# Positive-class probabilities from the LightGBM classifier, written as a submission file.
lgb_test_proba = clf_lgb.predict_proba(test[features])
sub['target'] = lgb_test_proba[:, 1]
sub.to_csv('submission_lgb.csv', index=False)

In [None]:
# Histogram of the predictions currently stored in the submission frame.
dist_target(df=sub, target='target', msg_title="Prediction Distribution for test set")

submission_xgb

In [None]:
# Positive-class probabilities from the XGBoost classifier, written as a submission file.
xgb_test_proba = clf_xgb.predict_proba(test[features])
sub['target'] = xgb_test_proba[:, 1]
sub.to_csv('submission_xgb.csv', index=False)

In [None]:
# Histogram of the predictions currently stored in the submission frame.
dist_target(df=sub, target='target', msg_title="Prediction Distribution for test set")

submission_ctb

In [None]:
# Positive-class probabilities from the CatBoost classifier, written as a submission file.
ctb_test_proba = clf_ctb.predict_proba(test[features])
sub['target'] = ctb_test_proba[:, 1]
sub.to_csv('submission_ctb.csv', index=False)

In [None]:
# Histogram of the predictions currently stored in the submission frame.
dist_target(df=sub, target='target', msg_title="Prediction Distribution for test set")

submission_stacknet

In [None]:
# Positive-class probabilities from the first StackNet model, written as the main submission file.
stacknet_test_proba = model.predict_proba(test[features])
sub['target'] = stacknet_test_proba[:, 1]
sub.to_csv('submission.csv', index=False)

In [None]:
# Histogram of the predictions currently stored in the submission frame.
dist_target(df=sub, target='target', msg_title="Prediction Distribution for test set")

In [None]:
# Positive-class probabilities from the second StackNet model, written as a submission file.
stacknet2_test_proba = model_.predict_proba(test[features])
sub['target'] = stacknet2_test_proba[:, 1]
sub.to_csv('submission_stacknet2.csv', index=False)

In [None]:
# Histogram of the predictions currently stored in the submission frame.
dist_target(df=sub, target='target', msg_title="Prediction Distribution for test set")

<a id='t3'></a>
# <div class="h1">General Findings</div>

[Back to Contents](#top)

[End Notebook](#end)


### Useful links: 
- [StackNet](https://github.com/kaz-Anova/StackNet)
- [StackNet Examples](https://github.com/kaz-Anova/StackNet#examples)
- [Kaggle Stacknet example](https://www.kaggle.com/caesarlupum/brazil-against-the-advance-of-covid-19)
- [StacKNet - Ensembling - How to Win a Data Science Competition: Learn from Top Kagglers](https://www.coursera.org/lecture/competitive-data-science/stacknet-s8RLi)
- [Win Machine Learning (Kaggle) Competitions using StackNet with Marios Michailidis](https://skillsmatter.com/skillscasts/10121-infiniteconf-bytes-with-marios-michailidis)
- [StackingClassifier - An ensemble-learning meta-classifier for stacking](http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/#overview)
<a id='end'></a>
<hr>