In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib 
from matplotlib import gridspec
import seaborn as sns
from scipy.stats import ks_2samp

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve

from matplotlib.patches import Patch
from sklearn.model_selection import (
    KFold,
    GroupKFold,
    StratifiedGroupKFold,
)

import warnings
warnings.filterwarnings("ignore")

# runtime configuration of matplotlib
# These defaults apply to every figure created later in the notebook.
plt.style.use("Solarize_Light2")
# figure.autolayout=True makes matplotlib apply tight_layout automatically;
# individual figures below still pass their own constrained_layout setting.
plt.rc("figure", 
    autolayout=True, 
    figsize=(20, 10)
)
# Default look of axis labels and axes titles (bold labels, bold size-20 titles).
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=20,
    titlepad=10,
)
#!pip install mplcyberpunk
#import mplcyberpunk

In [None]:
%%HTML
<style type="text/css">

div.h2 {
    background-color: #00b050; 
    color: white; 
    padding: 5px; 
    padding-right: 300px; 
    font-size: 25px;  
    margin-top: 2px;
    margin-bottom: 10px;
}

div.h3 {
    background-color: white; 
    color: #fe0000; 
    padding: 5px; 
    padding-right: 300px; 
    font-size: 20px; 
    margin-top: 2px;
    margin-bottom: 10px;
}
</style>

---

<div class="h2">Introduction</div>

In this notebook I have performed an analysis of the sensor and subject data:

   <a id="toc"></a>
   
1. [Covariate Shift](#1)
2. [Subject Analysis](#2)
3. [Cross Validation Strategies](#3)
4. [References](#4)

I will keep adding content; it's a work in progress.

<a id="1"></a>
<div class="h2">Covariate Shift</div>

Covariate shift is the scholarly term for when the distribution of the data (i.e. our input features) changes. It is important to address this issue: if there is a significant shift in the feature distribution, ignoring it can lead to poor predictive performance, because the data the model was trained on is no longer representative of the data it will see in the future.

There are several ways to detect covariate shift like `Kullback–Leibler divergence` or `Kolmogorov-Smirnov test`. I have applied Kolmogorov-Smirnov test (`KS-test`). `KS-test` is a non-parametric test that compares the shape of two empirical distributions. It is sensitive to differences in both location and shape of the distributions.

Under the `null-hypothesis` the covariates come from the same distribution; when the `p-value` is smaller than `0.05` the null-hypothesis is rejected.

What is the solution once the 'drift' is recognised? There are two common solutions:

   1. Feature Removal
   2. Importance reweighting


In [None]:
# Directory holding the competition files; change this single constant to run
# the notebook against a different data location.
DATA_DIR = '../input/tabular-playground-series-apr-2022'

# Sensor readings, indexed by subject (the "sequence" and "step" columns are
# used later to pivot the readings into one row per sequence).
train_raw = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='subject')
test_raw = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='subject')
# Labels; joined on "sequence" further down to obtain the "state" column.
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')

In [None]:
def parse_data(data):
    """Pivot long-format sensor readings into one row per sequence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``sequence`` column, a ``step`` column and one or more
        columns whose name contains ``sensor_``.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``sequence`` with one column per (sensor, step)
        pair, named ``"<sensor>_<step>"`` (for example ``sensor_00_0``).
        The input frame is left untouched.
    """
    sensor_cols = [col for col in data.columns if 'sensor_' in col]
    # pivot() already returns a new frame, so no defensive copy is needed.
    X = data.pivot(index='sequence', columns=['step'], values=sensor_cols)

    # Flatten the (sensor, step) column MultiIndex into plain strings. The
    # builtin str formatting replaces `np.str`, an alias removed in NumPy 1.24.
    X.columns = [f"{sensor}_{step}" for sensor, step in X.columns]

    return X

# Pivot both splits into the wide one-row-per-sequence layout.
# NOTE(review): the cell that runs the KS test repeats these two assignments,
# so one of the two pairs looks redundant (each pivot of the full data is costly).
train = parse_data(train_raw)
test = parse_data(test_raw)


In [None]:
# ks-test
def ks_test(df_train, df_test):
    """Run a two-sample Kolmogorov-Smirnov test for every column of ``df_train``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to compare column by column. Every column of ``df_train`` must
        also exist in ``df_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df_train`` with the columns

        * ``p_value_ks``    - p-value of the KS test,
        * ``statistics_ks`` - KS test statistic,
        * ``category``      - ``"Different distribution"`` when the p-value
          is <= 0.05, ``"Same distribution"`` when it is > 0.05, and
          ``"Undetermined"`` when the p-value is NaN,
        * ``flag``          - constant 1 (kept for backward compatibility).

    Raises
    ------
    KeyError
        If a column of ``df_train`` is missing from ``df_test``.
    """
    # Use the frames that were passed in (not module-level globals) and run
    # the test once per column instead of once per returned value.
    results = [ks_2samp(df_train[col], df_test[col]) for col in df_train.columns]

    df_ks_test = pd.DataFrame(
        {
            "p_value_ks": [res[1] for res in results],
            "statistics_ks": [res[0] for res in results],
        },
        index=df_train.columns,
    )

    filters = [
        (df_ks_test.p_value_ks <= 0.05), (df_ks_test.p_value_ks > 0.05)
    ]
    values = ["Different distribution", "Same distribution"]

    # An explicit string default keeps the result dtype homogeneous; the
    # implicit integer default 0 cannot be combined with string choices on
    # recent NumPy versions.
    df_ks_test["category"] = np.select(filters, values, default="Undetermined")
    df_ks_test["flag"] = 1

    return df_ks_test

In [None]:
# Build the wide train/test frames from the full raw data and compute the
# per-feature KS results that the next cell plots.
train = parse_data(train_raw)
test = parse_data(test_raw)
df_ks_test = ks_test(train, test)

In [None]:
# ks-plot: p-value and test statistic of the train-vs-test KS test per feature
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=2, nrows=2, figure=fig, width_ratios=[3, 1])

# Font weights are passed through `fontweight`. A bare string given as
# `fontproperties` is parsed as a fontconfig pattern, i.e. as a font *family*
# name, so "bold"/"semibold" there never made the text bold.
fig.suptitle("Kolmogorov-Smirnov Test\n", fontsize=25, fontweight="bold")

# First feature (step 0) of each of the 13 sensors; labels are 1-based, so
# sensor_00 is displayed as "sensor 1".
sensors = [f"sensor_{i:02d}_0" for i in range(13)]
sensor_labels = [f"sensor {i + 1}" for i in range(13)]

# Axis limits come from the data instead of hard-coded feature names
# (the previous upper limit "sensor_13_59" is not an existing feature).
first_feature, last_feature = df_ks_test.index[0], df_ks_test.index[-1]


def _mark_sensor_starts(ax, y, **text_kwargs):
    """Draw a vertical line and a label at the first feature of every sensor.

    Must be called after the scatter plot so the categorical x positions exist.
    """
    for sensor, label in zip(sensors, sensor_labels):
        ax.axvline(x=sensor, color='k', alpha=0.3)
        ax.text(sensor, y, label, verticalalignment='top', rotation="horizontal", color="k", **text_kwargs)


# KS p-value
ax1 = fig.add_subplot(spec[0, 0])
sns.scatterplot(y="p_value_ks", x=df_ks_test.index, ax=ax1, data=df_ks_test)
_mark_sensor_starts(ax1, 1.05, fontsize=8, fontweight="normal")
ax1.axhline(0.05, c="r")
# Annotate the significance threshold next to the last sensor boundary.
ax1.text(sensors[-1], 0.1, " p-value = 0.05", fontsize=8, verticalalignment='top', rotation="horizontal", color="k", fontweight="bold")
ax1.xaxis.set_ticks([])
ax1.set(xlabel="Steps(1-60)", ylabel="value")
ax1.set_title('P-Value', fontsize=15, fontweight="semibold")
ax1.set_xlim(first_feature, last_feature)

# text
text = "The Kolmogorov Smirnov test\n statistic quantifies a distance \n between two empirical \n distributions."
ax2 = fig.add_subplot(spec[0, 1])
ax2.xaxis.set_ticks([])
ax2.yaxis.set_ticks([])
ax2.text(0.0, 0.65, text, fontsize=20, verticalalignment='top', rotation="horizontal", color="k", family="cursive")
ax2.set_facecolor('cornsilk')
ax2.set_title('', fontsize=25, fontweight="semibold")

# Kolmogorov test statistics
ax3 = fig.add_subplot(spec[1, 0])
sns.scatterplot(y="statistics_ks", x=df_ks_test.index, ax=ax3, data=df_ks_test)
ax3.xaxis.set_ticks([])
ax3.set(xlabel="Steps(1-60)", ylabel="value")
ax3.set_title('Test statistics', fontsize=15, fontweight="semibold")
ax3.set_xlim(first_feature, last_feature)
_mark_sensor_starts(ax3, 0.0465, fontsize=10, family="cursive", fontweight="bold")
ax3.text(sensors[-1], 0.0165, " Critical value", fontsize=8, verticalalignment='top', rotation="horizontal", color="k", fontweight="bold")
# NOTE(review): 0.0145 is a hard-coded critical value; it presumably depends
# on the train/test sample sizes - confirm if the data changes.
ax3.axhline(0.0145, c="r")

# Count of features
ax4 = fig.add_subplot(spec[1, 1])
sns.countplot(x="category", ax=ax4, data=df_ks_test)
for p in ax4.patches:
    # Bars are 0.8 wide, so x + 0.4 is the bar centre; the leading newline
    # pushes the count just below the top edge of the bar.
    ax4.annotate(f'\n{p.get_height():.0f}', (p.get_x()+0.4, p.get_height()), ha='center', va='top', color='white', size=18)
ax4.set(xlabel="", ylabel="")
ax4.set_title('Feature Distribution Similarity \n (Train vs Test)', fontsize=18, fontweight="semibold")
ax4.yaxis.set_ticks([])

plt.show()

💡 **INSIGHTS**
- From the first plot we can deduce that features of `sensor 5` and `sensor 13` have different distributions for the `train` and `test` set.
- The third plot shows that the test statistics of `sensor 13` are significantly higher than the test statistics of the other sensors. Does this mean that the train and test distributions differ more for the `sensor 13` features than for the other features? I do not know yet. 
- The next step is to fit one model with all the features and another without `sensor 5` and `sensor 13` features and compare the model performances to know if the `drift` is consequential. 
- Based on a discussion with [@ambrosm](https://www.kaggle.com/ambrosm) and [@Pourchot](https://www.kaggle.com/pourchot) I have applied the KS-test on subject level. That is to see if the shift is on train test level or on subject level. So what I have done is that I compare train subjects with test subjects where the number of sequences are greater than 98 so that the test statistics is reliable. See the plot below for the results. 

In [None]:
# ks-plot on subject level: KS p-values for four (train subject, test subject) pairs
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)

# Font weights are passed through `fontweight`; a bare string given as
# `fontproperties` is interpreted as a font family name, not as a weight.
fig.suptitle("Kolmogorov-Smirnov Test\n P-Value\n", fontsize=25, fontweight="bold")

# First feature (step 0) of each of the 13 sensors; labels are 1-based.
sensors = [f"sensor_{i:02d}_0" for i in range(13)]
sensor_labels = [f"sensor {i + 1}" for i in range(13)]


def plot_subject_ks_pvalues(ax, train_subject_id, test_subject_id, label_y):
    """Plot the per-feature KS p-values of one train subject vs one test subject.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    train_subject_id, test_subject_id : int
        Subject ids selected from the index of ``train_raw`` / ``test_raw``.
    label_y : float
        Vertical position of the sensor labels.

    Returns
    -------
    pandas.DataFrame
        The ``ks_test`` result that was plotted.
    """
    # Local names on purpose: the notebook-level `train` / `test` frames are
    # no longer overwritten with single-subject data.
    subject_train = parse_data(train_raw[train_raw.index == train_subject_id])
    subject_test = parse_data(test_raw[test_raw.index == test_subject_id])
    ks_result = ks_test(subject_train, subject_test)

    sns.scatterplot(y="p_value_ks", x=ks_result.index, ax=ax, data=ks_result)
    for sensor, label in zip(sensors, sensor_labels):
        ax.axvline(x=sensor, color='k', alpha=0.3)
        ax.text(sensor, label_y, label, fontsize=8, verticalalignment='top', rotation="horizontal", color="k", fontweight="normal")
    ax.axhline(0.05, c="r")  # significance threshold
    ax.xaxis.set_ticks([])
    ax.set(xlabel="Steps(1-60)", ylabel="value")
    ax.set_title(f'(train_subject = {train_subject_id}, test_subject = {test_subject_id})', fontsize=15, fontweight="semibold")
    # Limits come from the data (the old upper limit "sensor_13_59" does not exist).
    ax.set_xlim(ks_result.index[0], ks_result.index[-1])
    return ks_result


# (train subject, test subject, y position of the sensor labels) per panel.
# NOTE(review): per the accompanying text these subjects were picked because
# they have more than 98 sequences - confirm before changing the ids.
subject_pairs = [
    (647, 748, 1.05),
    (196, 682, 0.85),
    (47, 781, 1.05),
    (125, 865, 1),
]
panel_cells = [spec[0, 0], spec[0, 1], spec[1, 0], spec[1, 1]]

for cell, (train_id, test_id, label_y) in zip(panel_cells, subject_pairs):
    plot_subject_ks_pvalues(fig.add_subplot(cell), train_id, test_id, label_y)

plt.show()

💡 **INSIGHTS**
- From the four plots we can see that `sensor 13` train features experience shift when compared to `sensor 13` test features.
- `Sensor 5` seems to behave differently per subject; therefore we may conclude that the KS-test result for `sensor 5` is not significant.

<a id="2"></a>
<div class="h2">Subject Analysis</div>

Now that we have studied the sensors it is time to take a look at the (disjoint) subjects. [@ambrosm](https://www.kaggle.com/ambrosm) has also studied this, see [notebook](https://www.kaggle.com/code/ambrosm/tpsapr22-eda-which-makes-sense).

In [None]:
def subject(dataframe):
    """Summarise a raw frame per subject.

    Returns one row per subject with the columns ``subject``, ``sequence``
    (number of sequences of that subject), ``state`` (sum of the ``state``
    labels joined from the module-level ``train_labels``) and ``percentage``
    (``state`` divided by ``sequence``). The input frame is not modified.
    """
    # Distinct (subject, sequence) pairs with the label of every sequence attached
    per_sequence = (
        dataframe.reset_index()[["subject", "sequence"]]
        .groupby(["subject", "sequence"])
        .count()
        .reset_index()
        .merge(train_labels, how="left", left_on="sequence", right_on="sequence")
    )

    # Collapse to one row per subject
    per_subject = (
        per_sequence.groupby(["subject"])
        .agg({"sequence": "count", "state": "sum"})
        .sort_values(["subject"])
        .reset_index()
    )
    per_subject["percentage"] = per_subject["state"] / per_subject["sequence"]

    return per_subject

train_subject = subject(train_raw)
test_subject = subject(test_raw)

# plot
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)

# Histogram of the number of sequences per subject
ax1 = fig.add_subplot(spec[0, 0])
sns.histplot(x="sequence", ax=ax1, data=train_subject, color="y", alpha=0.15, label="Train")
sns.histplot(x="sequence", ax=ax1, data=test_subject, color="r", label="Test")
ax1.set(xlabel="Sequence count", ylabel="Number of subjects")
ax1.set_xlim(0, 200)
ax1.set_title('Histogram sequence count \n(Train vs Test)', fontsize=12)
ax1.legend()

# Density of the number of sequences per subject
ax2 = fig.add_subplot(spec[0, 1])
sns.kdeplot(x="sequence", ax=ax2, data=train_subject, color="y", alpha=1, label="Train")
# The test curve was previously labelled "Train", giving two identical legend entries.
sns.kdeplot(x="sequence", ax=ax2, data=test_subject, color="r", label="Test")
ax2.set(xlabel="Sequence count", ylabel="Density")
ax2.set_xlim(0, 200)
ax2.set_title('Density sequence count \n(Train vs Test)', fontsize=12)
ax2.legend()

# Share of state-1 sequences against the number of sequences (train only)
ax3 = fig.add_subplot(spec[1, :])
sns.scatterplot(x="sequence", y="percentage", ax=ax3, data=train_subject, color="g")
ax3.set(xlabel="Sequence count", ylabel="Probability State=1")
ax3.set_title('Sequence count vs Probability State=1 \n (Train)', fontsize=12)

# `fontweight` sets the weight; `fontproperties="bold"` would be read as a font family.
fig.suptitle("Subject Analysis\n", fontsize=25, fontweight="bold")
plt.show()

💡 **INSIGHTS**
- Based on the first plot we can conclude that the number of sequences per subject is right skewed for both `train` and `test`.
- The second plot suggests that the number of sequences for both train and test have identical empirical density function.
- The third plot is very interesting. What it says is: the higher the number of sequences, the higher the probability of state 1. For example `(Number of sequences)>=100` implies `P(state=1)>=0.78`. We can try to assign `probability>= 0.78` to test subjects that have more than 100 sequences and see how it performs.
- This insight also has implications for the cross validation strategy. For reliable cross validation it is wise to apply `GroupKfold` approach. See discussion below with [@ambrose](https://www.kaggle.com/ambrosm).

<a id="3"></a>
<div class="h2">Cross Validation Strategy</div>

In this section I would like to lay out the choice of cross validation strategy. As [@ambrosm](https://www.kaggle.com/ambrosm) rightly mentions in his notebook, applying the `Kfold` strategy introduces a leak in the CV folds; instead it is better to choose the `GroupKFold` strategy, because then no subject appears in both the training and the validation part of a split. 

To demonstrate the difference in strategies I have selected `8 subjects` for which I apply `Kfold`, `GroupKFold` and `StratifiedGroupKFold` with the number of `folds=5`. For example, from the first plot one can observe that in the 4th iteration the second last subject occurs in both the training and the validation set. This introduces a leak in the training set; as a result we will get high CV results and eventually lower performance on the `LB`.

`GroupKFold` and `StratifiedGroupKFold` are `K-fold` variant with non-overlapping groups. The difference between `GroupKFold` and `StratifiedGroupKFold` is that the former attempts to create balanced folds such that the number of distinct groups is approximately the same in each fold, whereas StratifiedGroupKFold attempts to create folds which preserve the percentage of samples for each class as much as possible given the constraint of non-overlapping groups between splits. 

In the next section I have implemented `XGBoost` with `GroupKFold` and `StratifiedGroupKFold` and compared the results. See below.

In [None]:
# Colour maps and fold count for the cross-validation illustration
cmap_data = plt.cm.summer
cmap_cv = plt.cm.coolwarm
n_splits = 5

# Subjects with fewer than 13 sequences and at least one state-1 sequence
has_few_sequences = train_subject["sequence"] < 13
has_positive_state = train_subject["percentage"] > 0
subjects = train_subject.loc[has_few_sequences & has_positive_state, "subject"].values

# One labelled row per (subject, sequence) pair, restricted to those subjects
X = (
    train_raw.reset_index()[["subject", "sequence"]]
    .groupby(["subject", "sequence"])
    .count()
    .reset_index()
    .merge(train_labels, how="left", left_on="sequence", right_on="sequence")
)
X = X[X["subject"].isin(list(subjects))].sort_values(by=["subject", "sequence"])

def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Visualise which samples each split of ``cv`` uses for training and validation.

    One row is drawn per split (0 = training sample, 1 = validation sample,
    coloured with the module-level ``cmap_cv``), followed by one row showing
    ``y`` and one row showing ``group``. Returns ``ax``.
    """
    n_samples = len(X)
    positions = range(n_samples)

    # One row of markers per CV split
    for split_no, (train_idx, val_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        membership = np.full(n_samples, np.nan)
        membership[val_idx] = 1
        membership[train_idx] = 0

        ax.scatter(
            positions,
            [split_no + 0.5] * n_samples,
            c=membership,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Two extra rows below the last split: the classes, then the groups
    class_row = [split_no + 1.5] * n_samples
    group_row = [split_no + 2.5] * n_samples
    ax.scatter(positions, class_row, c=y, marker="_", lw=lw, cmap=plt.cm.summer)
    ax.scatter(positions, group_row, c=group, marker="_", lw=lw, cmap=plt.cm.prism_r)

    # Axis cosmetics: one tick per row; the y axis is inverted so split 0 is on top
    ax.set_yticks(np.arange(n_splits + 2) + 0.5)
    ax.set_yticklabels(list(range(n_splits)) + ["state", "subject"])
    ax.set_xlabel("Sample index")
    ax.set_ylabel("CV iteration")
    ax.set_ylim([n_splits + 2.2, -0.2])
    ax.set_xlim([0, 75])
    ax.set_title(type(cv).__name__, fontsize=15)
    return ax

n_splits = 5
# plot: one panel per cross-validation strategy on the small demo frame X
fig = plt.figure(constrained_layout=True, figsize=(20,5))
spec = gridspec.GridSpec(ncols=3, nrows=1, figure=fig)

cv_strategies = [KFold(n_splits), GroupKFold(n_splits), StratifiedGroupKFold(n_splits)]
panels = []
for col, cv in enumerate(cv_strategies):
    ax = fig.add_subplot(spec[0, col])
    plot_cv_indices(cv, X, X.state, X.subject, ax, n_splits)
    # Only the left-most panel carries the y-axis label.
    ax.set(xlabel="Sequence", ylabel="CV iteration" if col == 0 else "")
    ax.set_xlim(0, 75)
    # `fontweight` sets the weight; `fontproperties="semibold"` would be read
    # as a font family name and leave the title at normal weight.
    ax.set_title(type(cv).__name__, fontsize=12, fontweight="semibold")
    panels.append(ax)
ax1, ax2, ax3 = panels

# Point out the subject that KFold splits across training and validation.
# NOTE(review): the arrow coordinates are hard-coded for the current demo subjects.
ax1.annotate("Leak", (60 , 6.6), (65, 8) , arrowprops={"arrowstyle": "->"},\
    family="cursive", fontsize=10)
ax1.annotate("Split", (59 , 4.5), (63, 6) , arrowprops={"arrowstyle": "->"},\
    family="cursive", fontsize=10)

ax3.legend(
    [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
    ["Validation set", "Training set"],
    loc=(1.02, 0.8),
)

fig.suptitle("Validation Strategies\n", fontsize=25, fontweight="bold")
plt.show()

In [None]:
# Wide feature frame for the full training data (one row per sequence)
train = parse_data(train_raw)

# Distinct (subject, sequence) pairs, used to attach the subject to every sequence
subject_sequence = (
    train_raw[["sequence"]]
    .reset_index(drop=False)
    .groupby(["subject", "sequence"])
    .count()
    .reset_index()
)
X = train.merge(subject_sequence, how="left", left_index=True, right_on="sequence")
X = X.merge(train_labels, how="left", left_on="sequence", right_on="sequence")

# sort_values returns a new frame; its result was previously discarded, so the
# rows were never actually ordered by subject and sequence.
X = X.sort_values(by=["subject", "sequence"]).set_index(["sequence", "subject"])

def xgboost(X_train, y_train, X_val, y_val, n_estimators=5):
    """Fit an XGBoost classifier and return validation probabilities for class 1.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature frames; only their ``.values`` are passed to the model.
    y_train, y_val : array-like
        Binary targets matching ``X_train`` / ``X_val``.
    n_estimators : int, default 5
        Maximum number of boosting rounds.

    Returns
    -------
    numpy.ndarray
        Predicted probability of the positive class for every row of ``X_val``.

    Notes
    -----
    ``eval_metric`` and ``early_stopping_rounds`` are given to the constructor
    (supported from XGBoost 1.6); passing them to ``fit`` is rejected by
    XGBoost 2.x. The metric is ``auc``, which is what the former ``fit``
    argument effectively selected over the constructor's ``logloss``.
    Early stopping monitors the same fold that is scored afterwards, so the
    reported validation AUC is slightly optimistic.
    """
    model = XGBClassifier(
        n_estimators=n_estimators,
        n_jobs=-1,
        eval_metric='auc',
        early_stopping_rounds=30,
        #max_depth=10,
        colsample_bytree=0.8,
        #gamma=1.4,
        reg_alpha=6,
        reg_lambda=1.5,
        tree_method='hist',
        learning_rate=0.03,
        verbosity=1,
        random_state=3,
    )

    model.fit(X_train.values, y_train, eval_set=[(X_val.values, y_val)], verbose=10)

    return model.predict_proba(X_val.values)[:, 1]

####
# Every column of X except the target (subject and sequence live in the index)
features = [col for col in X.columns if col not in ['state',"subject", "sequence"]]


def run_grouped_cv(cv, pred_col):
    """Cross-validate the XGBoost model on ``X`` with a group-aware splitter.

    Parameters
    ----------
    cv : splitter with ``split(X, y, groups)``
        For example ``GroupKFold`` or ``StratifiedGroupKFold``; the ``subject``
        index level of ``X`` is used as the group.
    pred_col : str
        Name of the column that receives the validation predictions.

    Returns
    -------
    (pandas.DataFrame, list of float)
        The validation rows of all folds (feature columns plus ``pred_col``)
        and the AUC of every fold.
    """
    groups = X.index.get_level_values('subject')
    fold_frames = []
    fold_scores = []
    for fold, (idx_train, idx_val) in enumerate(cv.split(X, y=X.state, groups=groups)):
        X_train = X.iloc[idx_train][features]
        X_val = X.iloc[idx_val][features]
        y_train = X.iloc[idx_train].state
        y_val = X.iloc[idx_val].state

        y_val_pred = xgboost(X_train, y_train, X_val, y_val, 300)

        # assign() returns a new frame, so no column is written onto a slice of X.
        fold_frames.append(X_val.assign(**{pred_col: list(y_val_pred)}))

        score = roc_auc_score(y_val, y_val_pred)
        print(f"Fold {fold}: AUC = {score:.3f}")
        fold_scores.append(score)

    # Mean of the per-fold AUCs
    print(f"OOF AUC:{np.mean(fold_scores):.3f}")

    # One concat at the end replaces the repeated DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0).
    return pd.concat(fold_frames), fold_scores


gkf = GroupKFold(n_splits=2)
gkf_score, score_list = run_grouped_cv(gkf, "gkf_pred")

sgkf = StratifiedGroupKFold(n_splits=2)
sgkf_score, score_list = run_grouped_cv(sgkf, "sgkf_pred")

In [None]:
# Per-subject summary: number of sequences, number of state-1 sequences and
# the mean predicted probability of both validation strategies.
val_data = (
    gkf_score.loc[:, ["gkf_pred"]]
    .merge(sgkf_score.loc[:, ["sgkf_pred"]], how="left", left_index=True, right_index=True)
    .reset_index()
    .merge(train_labels, how="left", left_on="sequence", right_on="sequence")
    .groupby(["subject"])
    .agg({"sequence": "count", "state": "sum", "gkf_pred": "mean", "sgkf_pred": "mean"})
    .sort_values(["subject"])
    .reset_index()
)
val_data["percentage"] = val_data["state"] / val_data["sequence"]

# plot
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)

# Mean predicted probability per subject against its number of sequences
ax1 = fig.add_subplot(spec[0, 0])
sns.scatterplot(x="sequence", y="gkf_pred", ax=ax1, data=val_data, color="y", label="GroupedKFold")
sns.scatterplot(x="sequence", y="sgkf_pred", ax=ax1, data=val_data, label="StratifiedGroupedKFold")
ax1.set(xlabel="Sequence length", ylabel="Probability State=1")
ax1.set_xlim(0, 200)
ax1.set_title('Validation results', fontsize=12)
ax1.legend()

# Cumulative distribution of the mean predictions
ax2 = fig.add_subplot(spec[0, 1])
sns.kdeplot(x="gkf_pred", ax=ax2, data=val_data, color="y", cumulative=True, label="GroupedKFold")
sns.kdeplot(x="sgkf_pred", ax=ax2, data=val_data, color="r", cumulative=True, label="StratifiedGroupedKFold")
ax2.set(xlabel="Probability", ylabel="P(X<x)")
ax2.set_title('Cumulative Distribution Function (CDF)', fontsize=12)
ax2.legend()

# Observed share of state-1 sequences per subject, for comparison with ax1
ax3 = fig.add_subplot(spec[1, 0])
sns.scatterplot(x="sequence", y="percentage", ax=ax3, data=val_data, color="y")
ax3.set(xlabel="Sequence length", ylabel="Probability State=1")
ax3.set_title('Sequence length vs Probability State=1 \n (Train)', fontsize=12)
ax3.set_xlim(0, 200)

# Histogram of the mean predictions
ax4 = fig.add_subplot(spec[1, 1])
sns.histplot(x="gkf_pred", ax=ax4, data=val_data, color="y", alpha=0.15, label="GroupedKFold")
sns.histplot(x="sgkf_pred", ax=ax4, data=val_data, color="r", label="StratifiedGroupedKFold")
ax4.set(xlabel="Probability", ylabel="Count")
ax4.set_title('Histogram prediction probability', fontsize=12)
ax4.legend()

# `fontweight` sets the weight; `fontproperties="bold"` would be read as a font family.
fig.suptitle("GroupedKFold vs StratifiedGroupedKFold\n", fontsize=25, fontweight="bold", alpha=1)
#mplcyberpunk.make_scatter_glow()

plt.show()

💡 **INSIGHT**
- Based on the first plot we can see there is not much difference between `GroupedKFold` and `StratifiedGroupedKFold`. 
- The second plot shows that both strategies give similar cumulative distribution function.
- The fourth plot suggests that `GroupedKFold` has higher kurtosis and `StratifiedGroupedKFold` has thicker tails. I do not know if the results are statistically significant.
- Comparing the first plot with the third plot one can conclude that there is still some work to do to improve the performance of the model.

<a id="4"></a>
<div class="h2">References</div>

### Domain Knowledge References

1. https://towardsdatascience.com/new-features-of-scikit-learn-fbbfe7652bfb
2. https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py

### Kaggle Kernels for Inspiration
1. https://www.kaggle.com/code/ambrosm/tpsapr22-eda-which-makes-sense
2. https://www.kaggle.com/code/cv13j0/tps-apr-2022-xgboost-model
3. https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn
4. https://www.kaggle.com/code/thedatabeast/where-to-invest-to-combat-air-pollution-in-india