In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datatable

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install mplcyberpunk
import mplcyberpunk
plt.style.use("cyberpunk")

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.calibration import CalibrationDisplay
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

In [None]:
# helpers
def _smallest_int_dtype(mn, mx):
    """Return the narrowest numpy integer dtype whose range contains [mn, mx], or None."""
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= mn and mx <= limits.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the non-object columns of ``props`` to smaller dtypes.

    Columns that hold only whole numbers are converted to the narrowest
    (unsigned, if non-negative) integer dtype that can represent them; every
    other numeric column becomes ``float32``. Object (string) columns are
    skipped. Missing values are replaced by the sentinel ``column minimum - 1``
    because plain integer dtypes cannot hold NaN.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame, and the names of the columns that contained
        non-finite values (and therefore had NaNs filled with the sentinel).
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",round(start_mem_usg,2)," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if props[col].dtype == object:  # Exclude strings
            continue

        mn = props[col].min()
        mx = props[col].max()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign the result back instead of a chained inplace fillna, which
            # silently does nothing under pandas Copy-on-Write.
            props[col] = props[col].fillna(mn - 1)
            # The sentinel is the new minimum; without refreshing it a column
            # with min 0 would be cast to an unsigned dtype and -1 would wrap.
            mn = props[col].min()

        # A column is integer-valued only if EVERY element equals its integer
        # cast. (Summing the signed differences lets fractions cancel out or
        # slip under a tolerance, which would silently truncate real data.)
        values = props[col].to_numpy()
        if values.dtype.kind in "iub":
            is_int = True
        elif np.isfinite(values).all():
            with np.errstate(invalid="ignore"):
                is_int = bool((values == values.astype(np.int64)).all())
        else:
            # Infinite (or all-NaN) columns cannot be stored as integers.
            is_int = False

        target = _smallest_int_dtype(mn, mx) if is_int else np.float32
        if target is not None:  # None: no integer dtype fits, leave column as is
            props[col] = props[col].astype(target)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",round(mem_usg,2)," MB")
    print("This is ",round(100*mem_usg/start_mem_usg,2),"% of the initial size")
    return props, NAlist

In [None]:
%%time
# Read each CSV with datatable, convert it to pandas and index the rows by 'id'.
train_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/train.csv").to_pandas().set_index('id')
# reduce_mem_usage returns (frame, list of NA-filled columns); the list is discarded here.
train_df,_ = reduce_mem_usage(train_df)
test_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/test.csv").to_pandas().set_index('id')
test_df,_ = reduce_mem_usage(test_df)

In [None]:
def feature_engineer(df):
    """Return a copy of ``df`` in which the string column ``f_27`` is replaced by derived features.

    Added columns:
      * ``ch0`` .. ``ch9``: the character at each of the first ten positions of
        ``f_27``, encoded as its offset from ``'A'``.
      * ``f_27_len_unique_chars``: number of distinct characters in ``f_27``.
      * ``i_f02_f21``, ``i_f05_f22``, ``i_f00_f01_f26``: +1 when the sum of the
        named columns exceeds an upper threshold, -1 when it falls below a
        lower threshold, 0 otherwise.

    The input frame itself is not modified.
    """
    out = df.copy()
    text = out.f_27
    offset = ord('A')

    for pos in range(10):
        out[f"ch{pos}"] = text.str.get(pos).apply(ord) - offset

    out["f_27_len_unique_chars"] = text.apply(lambda s: len(set(s)))
    out = out.drop(columns=["f_27"])

    def _three_way(total, upper, lower):
        # +1 above `upper`, -1 below `lower`, 0 in between
        return (total > upper).astype(int) - (total < lower).astype(int)

    out["i_f02_f21"] = _three_way(out.f_02 + out.f_21, 5.2, -5.3)
    out["i_f05_f22"] = _three_way(out.f_05 + out.f_22, 5.1, -5.4)
    out["i_f00_f01_f26"] = _three_way(out.f_00 + out.f_01 + out.f_26, 5.0, -5.0)

    return out


# NOTE: this rebinds train_df/test_df to frames without the f_27 column, so the
# cell is not re-runnable: a second run fails inside feature_engineer, which
# reads df.f_27. Reload the data first if this cell has to be executed again.
train_df = feature_engineer(train_df)
test_df = feature_engineer(test_df)

In [None]:
def train_lgbm(features):
    """Fit a LightGBM classifier on the given columns and print its hold-out AUC.

    Reads the module-level ``train_df``: 60% of its rows are used for fitting,
    the remaining 40% (fixed split seed 42) for validation. Nothing is
    returned; the validation AUC is only printed.
    """
    inputs = train_df[features]
    labels = train_df["target"]
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        inputs, labels, test_size=0.4, random_state = 42
    )

    # Fit on plain numpy arrays and score the positive-class probability
    classifier = LGBMClassifier(n_estimators=1000, min_child_samples=80, random_state=1307)
    classifier.fit(X_fit.values, y_fit)
    hold_proba = classifier.predict_proba(X_hold.values)[:,1]

    auc_score = roc_auc_score(y_hold, hold_proba)
    print(f"Validation AUC:{(auc_score):.5f}")

In [None]:
# The raw feature columns f_00 .. f_30, leaving out f_27.
base_features = [f"f_{idx:02d}" for idx in range(31) if idx != 27]

In [None]:
%%time
# Baseline experiment: only the raw columns listed in base_features.
# NOTE: `features` is bound to the very same list object as base_features (no copy).
features = base_features
train_lgbm(features)

In [None]:
%%time
# Experiment: baseline plus the ten per-position character codes ch0 .. ch9.
features = base_features + [f"ch{pos}" for pos in range(10)]
train_lgbm(features)

In [None]:
%%time
# Experiment: baseline plus the count of distinct characters in f_27.
features = base_features + ['f_27_len_unique_chars']
train_lgbm(features)

In [None]:
%%time
# Experiment: baseline plus the three thresholded column-sum indicators (-1 / 0 / +1).
features = base_features + ['i_f02_f21', 'i_f05_f22', 'i_f00_f01_f26']
train_lgbm(features)

In [None]:
# Every raw feature plus all engineered ones, in the order the models will see them.
final_feature_list = [
    *base_features,
    'f_27_len_unique_chars',
    *(f"ch{pos}" for pos in range(10)),
    'i_f02_f21', 'i_f05_f22', 'i_f00_f01_f26',
]

In [None]:
%%time
# Experiment: raw features together with all engineered features.
train_lgbm(final_feature_list)

## Cross-Validation

For cross-validation, we use a simple KFold with five splits. It turned out that the scores of the five splits are very similar, so I usually run only the first split. This one split is good enough to evaluate the model.

Because I want to understand how many iterations are needed, we'll collect some metrics and plot the training history. The `stopping_rounds` argument of the `early_stopping` callback is set to the very high value of 100000. This means that the algorithm won't stop early, but the setting is necessary to collect the metrics.

In [None]:
%%time
def my_booster(n_estimators=10_000, random_state=1):
    """Create the unfitted LGBMClassifier used for cross-validation and for the final fit.

    Only the number of boosting rounds and the seed are configurable; all
    other hyperparameters are fixed here. Three evaluation metrics are
    requested so that AUC, log loss and error rate are all tracked.
    """
    params = dict(
        n_estimators=n_estimators,
        min_child_samples=80,
        num_leaves=127,
        subsample=0.85,
        subsample_freq=1,
        metric='auc,binary_logloss,binary_error',
        max_bins=511,
        random_state=random_state,
    )
    return LGBMClassifier(**params)

# Cross-validation driver. NOTE: `model`, `y_val` and `y_pred` are deliberately
# left in the notebook namespace; the summary, history and evaluation-plot cells
# further down read them.
print(f"{len(final_feature_list)} features")

# One AUC per evaluated fold (only a single entry, because of the `break` below).
auc_scores_list=[]

kf = KFold(n_splits=5)

for fold, (idx_train, idx_val) in enumerate(kf.split(train_df)):
    # Positional row selection for this fold, restricted to the model's feature columns.
    X_train = train_df.iloc[idx_train][final_feature_list]
    X_val = train_df.iloc[idx_val][final_feature_list]
    y_train = train_df.iloc[idx_train].target
    y_val = train_df.iloc[idx_val].target

    model = my_booster()
    # Both sets are evaluated every round; their histories are later read from
    # model.evals_result_ under the keys 'training' and 'valid_1'.
    model.fit(X_train, y_train,
              eval_set = [
                  (X_train, y_train),
                  (X_val,y_val)
              ],
              callbacks=[
                  # stopping_rounds (100_000) exceeds the 10_000 rounds my_booster() requests,
                  # so this patience can never run out and training is not cut short.
                  early_stopping(stopping_rounds=1_00_000), # will stop training if one metric of one validation data doesn't improve in last stopping_rounds rounds
                  log_evaluation(period=1000)
              ],
             )
    # Probability of the positive class for the held-out rows.
    y_pred = model.predict_proba(X_val)[:,1]
    auc_score = roc_auc_score(y_val, y_pred)
    print(f"Fold {fold}: \t  AUC = {auc_score:.5f}")
    auc_scores_list.append(auc_score)
    break # we only need the first fold

# With the break above this is the mean over exactly one fold's AUC.
print(f"OOF AUC: \t  {np.mean(auc_scores_list):.5f}")

### Cross Validation Model Summary

In [None]:
# Attributes of the fitted cross-validation model, collected in a dict; as the
# cell's last expression the dict is displayed as the cell output.
{
    "model.n_features_": model.n_features_,
    "model.feature_name_": model.feature_name_,
    "model.feature_importances_": model.feature_importances_,
    "model.n_classes_": model.n_classes_,
    "model.objective_": model.objective_,
    "model.best_iteration_": model.best_iteration_,
    "model.best_score_": model.best_score_,
    # only the first five training-AUC values, to keep the output short
    "model.evals_result_['training']['auc'][:5]":model.evals_result_["training"]["auc"][:5]
}

We save the metrics as csv files in case anybody wants to evaluate them further:

In [None]:
# One row per boosting round, one column per recorded metric, for each evaluation set.
history_train, history_valid = (
    pd.DataFrame(model.evals_result_[eval_name]) for eval_name in ('training', 'valid_1')
)
# Add accuracy (the complement of the error rate) and persist each history as CSV.
for history, csv_name in ((history_train, 'history_train.csv'),
                          (history_valid, 'history_valid.csv')):
    history['accuracy'] = 1 - history.binary_error
    history.to_csv(csv_name)
history_train.tail()

Now we plot the three metrics loss, accuracy and AUC for the whole training history. In every plot, we mark the point where the metric reaches its optimum.

The estimator overfits terribly: After 2000 iterations, the training predictions are perfect (accuracy and AUC are 1.0). The validation loss reaches its optimum already before 2000 iterations, at a time when validation accuracy and validation auc are still improving. Validation accuracy and validation auc peak several thousand iterations later.

Insight: Don't stop lightgbm early when the validation loss stops improving! Wait until validation auc peaks!

In [None]:
def _plot_metric_history(ax, metric, best, ylabel, note, note_y):
    """Plot one metric's training and validation history on ``ax`` and mark the optima.

    ``metric`` is a column of history_train / history_valid, ``best`` is the
    Series method locating the optimum ("argmin" or "argmax"), ``note`` is the
    annotation text for the validation optimum and ``note_y`` the y position
    (in data coordinates) of that text.

    Returns the position of the validation optimum within the history.
    """
    train_curve = history_train[metric]
    valid_curve = history_valid[metric]
    ax.plot(history_train.index, train_curve, label='training')
    ax.plot(history_valid.index, valid_curve, label='validation')

    # argmin/argmax are positional, so look the values up positionally as well
    best_train = getattr(train_curve, best)()
    best_valid = getattr(valid_curve, best)()
    ax.scatter([best_train], train_curve.iloc[best_train])
    ax.scatter([best_valid], valid_curve.iloc[best_valid])

    ax.annotate(note, xy=(best_valid, valid_curve.iloc[best_valid]), xycoords='data',
                xytext=(best_valid, note_y), textcoords='data',
                arrowprops=dict(facecolor='white', shrink=0.05),
                horizontalalignment='center', verticalalignment='top', fontsize=15
                )
    ax.set_ylabel(ylabel)
    ax.legend(loc='best')
    return best_valid


fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize=(30,20), sharex=True)

_plot_metric_history(
    ax1, 'binary_logloss', 'argmin', 'Loss',
    'Lowest Validation Loss. \nValidation Loss starts increasing from here. \nDont stop the training yet.',
    0.3,
)
mplcyberpunk.add_glow_effects(ax1)

_plot_metric_history(ax2, 'accuracy', 'argmax', 'Accuracy', 'Best Validation Accuracy', 0.94)
mplcyberpunk.add_glow_effects(ax2)

# Position of the best validation AUC; reused by the submission cell.
auc_peak = _plot_metric_history(
    ax3, 'auc', 'argmax', 'AUC', 'Best Validation AUC \n Stop training here!', 0.975
)
# Eleven evenly spaced ticks over the recorded rounds (the x axis is shared by all three plots).
ax3.set_xticks(np.linspace(0, len(history_valid), 11))
ax3.set_xlabel("Iteration")
mplcyberpunk.add_glow_effects(ax3)

plt.suptitle('Lightgbm training history', y=0.94, fontsize=30)
plt.show()
print(f"Validation AUC peaks at iteration {auc_peak} with score {history_valid.auc[auc_peak]:.5f}")

## Three diagrams for model evaluation
We plot the ROC curve just because it looks nice. The area under the red curve is the score of our model.

In [None]:
# Plot the ROC curve for the fold validated above
def plot_roc_curve(y_va, y_va_pred):
    """Draw the ROC curve of the predictions ``y_va_pred`` against the labels ``y_va``.

    The curve is drawn in red on a square 8x8 figure, together with a dashed
    white diagonal as reference; both axes are limited to [0, 1].
    """
    plt.figure(figsize=(8, 8))
    false_pos_rate, true_pos_rate, _ = roc_curve(y_va, y_va_pred)
    axes = plt.gca()
    axes.plot(false_pos_rate, true_pos_rate, color='r', lw=2)
    axes.plot([0, 1], [0, 1], color="w", lw=1, linestyle="--")
    axes.set_aspect('equal')
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.0])
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.set_title("Receiver operating characteristic")
    plt.show()

plot_roc_curve(y_val, y_pred)

Second, we plot a histogram of the validation predictions of the first fold (the only fold we ran). Many predictions are near 0.0 or near 1.0; this means that in many cases the classifier's predictions have high confidence:

In [None]:
# Density histogram of the predicted positive-class probabilities on the validation rows.
hist_ax = plt.figure(figsize=(12, 4)).gca()
hist_ax.hist(y_pred, bins=25, density=True)
hist_ax.set_title('Histogram of the oof predictions')
plt.show()

Finally, we plot the calibration curve.

In [None]:
# Calibration curve of the validation predictions, with the diagonal as reference line.
ax = plt.figure(figsize=(12, 4)).gca()
ax.plot([0, 1], [0, 1], "w:", label="Perfectly calibrated")
CalibrationDisplay.from_predictions(
    y_val, y_pred,
    n_bins=100, strategy='quantile',
    name="LGBMClassifier", ax=ax,
)
ax.set_title('Probability calibration')
plt.show()

## Submission
For the submission, we re-train the model on the complete training data. We set n_estimators to the number of iterations which gave the best auc in cross-validation.

In [None]:
%%time
# Final fit on the complete training data, then prediction for the test set.
print(f"{len(final_feature_list)} features")
X_train = train_df[final_feature_list]
y_train = train_df.target
# auc_peak is the 0-based position of the best validation AUC in the history,
# and position i holds the score after i + 1 boosting rounds. The model that
# reached the peak therefore has auc_peak + 1 trees (this also avoids asking
# for zero estimators when the peak is at position 0).
model = my_booster(n_estimators=auc_peak + 1)
model.fit(X_train.values, y_train)
# Probability of the positive class for every test row.
pred = model.predict_proba(test_df[final_feature_list].values)[:,1]
submission = pd.DataFrame({"id":test_df.index, "target":pred})
submission.to_csv('submission.csv', index=False)
submission

Credits:  
https://www.kaggle.com/code/ambrosm/tpsmay22-gradient-boosting-quickstart/notebook