# Summary 

This notebook is a continuation of the [baseline model and EDA](https://www.kaggle.com/code/slythe/tps-may-super-eda-base-model). 


# 📩 Import Libraries 📩 

In [None]:
# Data and visualization
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from collections import Counter

# hyperparameter tuning 
import optuna 

import gc

#modelling
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [None]:
# Notebook-wide configuration switches
sns.set_theme()

# --- modelling ---
EPOCHS = 3000        # default number of boosting rounds used by build_model()
CALIBRATION = True   # wrap each CV-fold model in an isotonic CalibratedClassifierCV
SCALING = True       # quantile-transform the non-categorical features before fitting

# --- features / run mode ---
FEATS_2 = True       # add the categorical per-position f_27 columns (p_0 .. p_9)
FULL_RUN = False     # True: refit on all training rows; False: cross-validated predictions

# 💾 Load Data 💾

I have run [Feature Engine](https://feature-engine.readthedocs.io/en/1.3.x/) to create relative features of the float columns. \
This was run outside of Kaggle (on AWS instances due to memory constraints) \
A notebook detailing the process can be found [here](https://www.kaggle.com/code/slythe/feature-engine-selecting-creating-features?scriptVersionId=95047781) 

I then ran Powershap on these features to reduce the number, Powershap process can be found [here](https://www.kaggle.com/code/slythe/powershap-feature-selection-recursive)

In [None]:
# Raw competition files; the first CSV column becomes the index
train_original = pd.read_csv(
    "../input/tabular-playground-series-may-2022/train.csv",
    index_col=0,
)
test_original = pd.read_csv(
    "../input/tabular-playground-series-may-2022/test.csv",
    index_col=0,
)

# Pre-computed relative features (already reduced with Powershap), stored as pickles
train_features = pd.read_pickle(
    "../input/tps-may-22-relative-feature-engine-powershap/Relative_feats_powershap_train.pickle"
)
test_features = pd.read_pickle(
    "../input/tps-may-22-relative-feature-engine-powershap/Relative_feats_powershap_test.pickle"
)

# Submission template
sub = pd.read_csv(
    "../input/tabular-playground-series-may-2022/sample_submission.csv",
    index_col=0,
)

In [None]:
# Column names grouped by dtype; the integer group leaves out the "target" label
_dtypes = train_original.dtypes
int_cols = _dtypes[(_dtypes.index != "target") & (_dtypes == "int64")].index
float_cols = _dtypes[_dtypes == "float64"].index

In [None]:
# Columns that exist in the feature pickle but not in the raw training frame
add_cols = [name for name in train_features.columns if name not in train_original.columns]
print(add_cols)

In [None]:
# add newly created features
# `.values` discards the pickle's index, so rows are matched to the CSV frames by position only.
# NOTE(review): this assumes the pickles are in the same row order as the CSVs — confirm.
train_original[add_cols]= train_features[add_cols].values
test_original[add_cols]= test_features[add_cols].values
# last expression of the cell: shows the augmented test frame
test_original

In [None]:
# The feature frames are no longer referenced below; release them
del train_features, test_features

# 🌟 Feature Engineering 🌟

* Unicode (ord) code taken from [cabaxiom](https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model#Feature-Engineering)

cabaxiom already identified f_29 and f_30 as potential categorical columns. 
Let's try to improve on this.

In [None]:
# The 20 letters listed for f_27 (not referenced by feature_engineering itself)
all_letters = ['A', 'B', 'D', 'E', 'P', 'C', 'S', 'G', 'F', 'Q', 'H', 'N', 'K', 'R', 'M', 'T', 'O', 'J', 'I', 'L']

def feature_engineering(df):
    """Add interaction flags and f_27-derived text features to ``df`` in place.

    Columns created, in this order:

    * ``i_02_21``, ``i_05_22``, ``i_00_01_26`` -- ternary flags (+1 / 0 / -1) that
      mark whether a sum of float columns lies above / between / below two thresholds.
    * ``f_27_0`` .. ``f_27_9`` -- the character at each of the first ten positions
      of ``f_27``, encoded as its offset from ``"A"``.
    * ``unique_text_str`` -- the distinct characters of ``f_27`` joined in sorted
      order, stored with ``category`` dtype.
    * ``unique_text_len`` -- the number of distinct characters in ``f_27``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``f_00, f_01, f_02, f_05, f_21, f_22, f_26`` (numeric) and
        ``f_27`` (strings of at least ten characters).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.
    """
    # taken from wti200 https://www.kaggle.com/code/wti200/analysing-interactions-with-shap and CABAXIOM https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model#Feature-Engineering
    sum_02_21 = df.f_21 + df.f_02
    df['i_02_21'] = (sum_02_21 > 5.2).astype(int) - (sum_02_21 < -5.3).astype(int)
    sum_05_22 = df.f_22 + df.f_05
    df['i_05_22'] = (sum_05_22 > 5.1).astype(int) - (sum_05_22 < -5.4).astype(int)
    sum_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = (sum_00_01_26 > 5.0).astype(int) - (sum_00_01_26 < -5.0).astype(int)

    # Unicoding: position-wise character -> integer offset from "A"
    base = ord("A")
    for i in range(10):
        df["f_27_"+str(i)] = df["f_27"].str[i].apply(lambda ch: ord(ch) - base)

    # Get unique letters.
    # sorted() is required: iteration order of a set of strings is not guaranteed
    # (it can vary with insertion order and with hash randomisation between
    # interpreter sessions), so joining the raw set could give different strings
    # for the same letter set.
    df["unique_text_str"] = (
        df["f_27"].apply(lambda s: ''.join(sorted(set(s)))).astype("category")
    )

    df["unique_text_len"] = df.f_27.apply(lambda s: len(set(s)))

    return df

# feature_engineering() writes its columns onto the frame it receives and returns that
# same object, so `train` / `test` are further names for train_original / test_original.
train, test = feature_engineering(train_original), feature_engineering(test_original)

In [None]:
# Names of the ten per-position categorical columns derived from f_27
F_27_CAT_FEATS = ['p_' + str(pos) for pos in range(10)]


def feature_engineering2(df):
    """Attach one categorical column per character position of ``f_27``.

    For each of the first ten positions a column ``p_<pos>`` is written onto
    ``df`` holding the character found there, as a pandas ``Categorical`` whose
    categories are the twenty letters ``"A"`` .. ``"T"``.

    Returns the same ``df`` object that was passed in.
    """
    letters = [chr(code) for code in range(ord('A'), ord('T') + 1)]

    for pos in range(10):
        chars_at_pos = [text[pos] for text in df['f_27']]
        df[f'p_{pos}'] = pd.Categorical(chars_at_pos, categories=letters)

    return df

if FEATS_2:
    print("adding feats")
    # feature_engineering2 writes its columns onto the frame it is given
    for frame in (train, test):
        feature_engineering2(frame)

## Mathematical Features 
* We will do this for certain columns, i.e. the float columns (in certain groupings)

In [None]:
# Per-column summary statistics of the float columns, used to spot groups with a similar spread
train_original[float_cols].describe()

#### Group Float columns 
* We can see from the above that certain columns have similar std/ min/ max, we will group them
* f_00 to f_06 => Group1
* f_19 to f_26 => Group2
* f_28 looks to be separate from both groups

In [None]:
# f_00 .. f_06
group1_float = [f"f_{idx:02d}" for idx in range(0, 7)]
# f_19 .. f_26
group2_float = [f"f_{idx:02d}" for idx in range(19, 27)]

def mathematical_feats(df,cols, suffix):
    """Add row-wise summary statistics of ``df[cols]`` as new columns on ``df``.

    Each new column is named ``<stat>_<suffix>``. Created in this order: sum,
    mean, std, min, max, median, mad (mean absolute deviation around the row
    mean), max-min, q01, q25, q75, q95, q99, kurt, skew.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and write to (modified in place).
    cols : list-like of column labels
        Numeric columns the statistics are taken across (axis=1).
    suffix : str
        Appended to every new column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the statistic columns attached.
    """
    # Slice once instead of re-selecting the columns for every statistic.
    block = df[cols]
    row_mean = block.mean(axis = 1)
    row_min = block.min(axis = 1)
    row_max = block.max(axis = 1)

    df[f"sum_{suffix}"] = block.sum(axis = 1)
    df[f"mean_{suffix}"] = row_mean
    df[f"std_{suffix}"] = block.std(axis = 1)
    df[f"min_{suffix}"] = row_min
    df[f"max_{suffix}"] = row_max
    df[f"median_{suffix}"] = block.median(axis = 1)
    # DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0; this is the
    # same quantity (mean absolute deviation from the row mean) written out.
    df[f"mad_{suffix}"] = block.sub(row_mean, axis = 0).abs().mean(axis = 1)

    df[f"max-min_{suffix}"] = row_max - row_min
    # NB: the column called "q01" holds the 0.1 quantile; the name is kept as-is
    # so downstream column names do not change. q50 is omitted (same as median).
    for label, q in (("q01", 0.1), ("q25", 0.25), ("q75", 0.75), ("q95", 0.95), ("q99", 0.99)):
        df[f"{label}_{suffix}"] = block.quantile(q = q, axis = 1)
    df[f"kurt_{suffix}"] = block.kurt(axis = 1)
    df[f"skew_{suffix}"] = block.skew(axis = 1)

    return df

# NOTE(review): the statistics are taken over ALL float columns (float_cols) even though the
# resulting columns are suffixed "group2_float"; the group1_float / group2_float lists are not
# used anywhere in the visible code — confirm whether per-group statistics were intended.
mathematical_feats(train, float_cols, "group2_float")
mathematical_feats(test, float_cols, "group2_float")
# mathematical_feats(train, float_cols, "group1_float")
# mathematical_feats(test, float_cols, "group1_float")

## Drop unimportant features 
From previous runs 

In [None]:
# Columns removed from both train and test
feats = [
    'f_06_mul_f_04',
    'f_05_mul_f_02',
    'f_20_div_f_25',
    'f_19_add_f_24',
    'f_24_add_f_19',
    'f_06_mul_f_01',
    'f_21_div_f_23',
    'f_00_div_f_26',
    'f_01_div_f_26',
    'f_03_mul_f_03',
    'f_03_mul_f_02',
    'f_02_mul_f_03',
    'f_03_div_f_01',
    'f_03',
    'f_04',
    'f_03_mul_f_00',
]

def drop_feats(df, feats):
    """Remove the columns named in ``feats`` from ``df`` in place.

    Raises ``KeyError`` if any of the names is not a column of ``df``.
    Returns the same ``df`` object.
    """
    df.drop(columns=feats, inplace=True)
    return df

# drop_feats() removes the columns in place, so re-running this cell on the same
# frames raises KeyError (the columns are already gone).
drop_feats(train, feats)
drop_feats(test, feats)

## Downcasting

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    * Signed-integer columns are cast to the narrowest of int8 / int16 / int32 /
      int64 whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float32 when min and max fit the float32 range,
      otherwise to float64. Note that float32 keeps fewer significant digits.
    * Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose max is exactly 127 still fits int8.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            # NaN min/max (e.g. an all-NaN column) fail both comparisons -> float64.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast both frames in place; the second call is the cell's last expression,
# so the returned test frame is also displayed.
reduce_mem_usage(train)
reduce_mem_usage(test)

In [None]:
# Full list of columns after feature engineering
print(train.columns.tolist())

# 🚀 Base Model 🚀

In [None]:
# Columns handed to LightGBM as categorical features
categorical_features = ["unique_text_str"]
# currently disabled candidates: "f29_f30", "min_letter", "max_letter", "f_29", "f_30"

# plus every column whose name includes "contains"
categorical_features += [name for name in train.columns if "contains" in name]

# plus the per-position f_27 categoricals, when they were generated
if FEATS_2:
    categorical_features += F_27_CAT_FEATS

In [None]:
# The raw f_27 text is dropped: features derived from it were created earlier
y = train["target"]
X = train.drop(columns=["target", "f_27"])

# Hold out 33% of the rows to validate the base model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Scaling 

In [None]:
# Everything that is neither categorical nor the raw f_27 text gets scaled
features = [
    name
    for name in test.columns
    if name != "f_27" and name not in categorical_features
]

if SCALING:
    # The transformer is fitted on the training part of the split only
    qt = QuantileTransformer(
        n_quantiles=1000,
        output_distribution='normal',
        random_state=42,
    )
    qt.fit(X_train[features])

    for part in (X_train, X_test):
        part[features] = qt.transform(part[features])

## Base model

In [None]:
def build_model(epochs=EPOCHS):
    """Create an unfitted LightGBM binary classifier with the notebook's fixed hyper-parameters.

    Parameters
    ----------
    epochs : int, default EPOCHS
        Passed to LightGBM as ``num_iterations`` (number of boosting rounds).

    Returns
    -------
    lgb.LGBMClassifier
    """
    params = dict(
        objective='binary',
        metric="auc, binary_logloss, binary_error",
        num_iterations=epochs,
        num_threads=-1,
        learning_rate=0.18319492258552644,
        boosting='gbdt',
        lambda_l1=0.00028648667113792726,
        lambda_l2=0.00026863027834978876,
        num_leaves=229,
        max_depth=0,
        min_child_samples=80,
        device='cpu',
        max_bins=511,
        random_state=42,
    )
    return lgb.LGBMClassifier(**params)


In [None]:
model = build_model()

# The early-stopping patience (10000) is larger than the default EPOCHS (3000),
# so with the default setting the stopping criterion cannot trigger.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",  # binary_logloss
    categorical_feature=categorical_features,
    callbacks=[lgb.early_stopping(10000)],
)

# Class probabilities on the training part and on the held-out part
y_preds = model.predict_proba(X_train)
val_preds = model.predict_proba(X_test)

print("Intrinsic AUC:", roc_auc_score(y_train, y_preds[:,1]))
print("Validation AUC:", model.best_score_["valid_0"]["auc"])

* quantile latest score: 0.9966848392504588  
* with logloss and error:0.9956027157829693 OR 0.995537760865407

In [None]:
# Metric values recorded on the validation set during fitting, as a DataFrame
history = pd.DataFrame(model.evals_result_['valid_0'])
history

In [None]:
# Validation AUC across the recorded history
ax = history["auc"].plot(figsize=(25, 8))
ax.set_ylabel("auc")
ax.set_xlabel("epochs")
plt.show()

In [None]:
#history["binary_logloss"].plot(figsize =(25,8))
#plt.ylabel("logloss")
#plt.xlabel("epochs")
#plt.show()

In [None]:
#history["binary_error"].plot(figsize =(25,8))
#plt.ylabel("error")
#plt.xlabel("epochs")
#plt.show()

In [None]:
# argmax() returns the 0-based position of the highest validation AUC in `history`
max_epoch = history["auc"].argmax()
max_epoch

In [None]:
# Importances indexed by feature name (train columns minus target / f_27), largest first
feature_names = train.columns.drop(["target", "f_27"])
feat_importance = pd.DataFrame(
    data=model.feature_importances_,
    index=feature_names,
).sort_values(by=[0], ascending=False)

# Only features with a non-zero importance are drawn
nonzero_importance = feat_importance[feat_importance[0] != 0]

plt.figure(figsize=(25, 60))
sns.barplot(x=nonzero_importance[0], y=nonzero_importance.index)
plt.xticks(rotation=90)
plt.title("Feature importance")
plt.show()

In [None]:
# Names of the features whose importance is below 50
low_importance = feat_importance.loc[feat_importance[0] < 50]
print(list(low_importance.index))

## Calibration 
Taken from last month's kernel [TPS April](https://www.kaggle.com/code/slythe/calibrated-xgboost-human-activity-recognition)

In [None]:
# Reliability-curve points for the held-out split, using the positive-class probabilities and 10 bins
prob_true, prob_pred = calibration_curve(y_test, val_preds[:,1], n_bins=10)

In [None]:
# Isotonic calibration on top of the already-fitted model (cv='prefit')
calibrator = CalibratedClassifierCV(model, cv='prefit', method="isotonic")
calibrator.fit(X_test, y_test)
cal_preds = calibrator.predict_proba(X_test)

# NOTE(review): the calibrated AUC below is computed on the same rows the
# calibrator was fitted on.
uncalibrated_auc = model.best_score_["valid_0"]["auc"]
calibrated_auc = roc_auc_score(y_test, cal_preds[:, 1])
print("Validation AUC:", uncalibrated_auc)
print("Calibrated AUC:" , calibrated_auc)

In [None]:
fig, ax = plt.subplots(figsize=(20,10))
# The plotted model is the LightGBM classifier (the label used to say "xgb")
ax.plot(prob_pred, prob_true, marker='o', linewidth=1, label='lgbm model probabilities')

# reference line: perfect calibration (y = x).
# Drawn in data coordinates; a line in axes coordinates only coincides with
# y = x when the x- and y-axis limits happen to be identical.
ax.plot([0, 1], [0, 1], color='black')
#plt.axvline(x=0.2, color = "r")
fig.suptitle('Calibration plot')
ax.set_xlabel('Predicted probability (mean)')
ax.set_ylabel('Fraction of positives (%True  in each bin)')
ax.legend()
plt.show()

# ❎ Cross validation ❎

In [None]:
# Run a garbage-collection pass before the cross-validation cell
gc.collect()

In [None]:
if not FULL_RUN:
    cv = KFold(n_splits = 5, shuffle = False)
    #cv = StratifiedKFold(n_splits = 5, shuffle = True,random_state=42)

    val_preds = []   # out-of-fold predictions, appended in fold order
    preds = []       # one test-set prediction vector per fold
    auc_cv = []      # one validation AUC per fold
    for fold, (idx_train, idx_val) in enumerate(cv.split(X,y)):
        print("\n")
        print("#"*10, f"Fold: {fold}","#"*10)
        # cv.split yields positional indices, so both X and y are indexed with
        # .iloc (plain y[idx] is label-based and is only right when the index
        # happens to be 0..n-1). .copy() gives fold-local frames so the scaling
        # below does not write into slices of X.
        X_train , X_test = X.iloc[idx_train].copy() , X.iloc[idx_val].copy()
        y_train , y_test = y.iloc[idx_train] , y.iloc[idx_val]

        model = build_model()

        # Test features for this fold. drop() returns a new frame, so `test`
        # itself is never modified and no extra deep copy is needed.
        test_s = test.drop("f_27",axis =1)

        #scaling
        if SCALING:
            # fitted on the training part of the fold only
            qt = QuantileTransformer(n_quantiles=1000,
                             output_distribution='normal',
                             random_state=42).fit(X_train[features])
            X_train[features] = qt.transform(X_train[features])
            X_test[features] = qt.transform(X_test[features])
            test_s[features] = qt.transform(test_s[features])

        model.fit(X_train,y_train, eval_set=[(X_test,y_test)], callbacks = [lgb.early_stopping(30)],eval_metric="auc")

        if CALIBRATION:
            # NOTE(review): the calibrator is fitted on the validation fold and the
            # fold AUC is then computed on those same rows.
            calibrator = CalibratedClassifierCV(model, method = "isotonic", cv='prefit')
            calibrator.fit(X_test, y_test)
            predictor, auc_label = calibrator, "Calibration"
        else:
            predictor, auc_label = model, "Validation"

        val_pred = predictor.predict_proba(X_test)[:, 1]
        val_preds.extend(val_pred)

        auc = roc_auc_score(y_test, val_pred)
        print(f"\n {auc_label} AUC:" , auc)
        preds.append(predictor.predict_proba(test_s)[:, 1])

        auc_cv.append(auc)

    print("FINAL AUC: ", np.mean(auc_cv))

In [None]:
if not FULL_RUN:
    # Persist the out-of-fold predictions gathered during cross-validation
    lgb_vals = pd.DataFrame({"target": val_preds})
    lgb_vals.to_csv("lgb_vals.csv")
    lgb_vals

* baseline: 0.9958086260208202
* CV - feats 2: 0.9967434208290643

# 🌾 Full Run: Seed 🌾

from  [CABAXIOM](https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model/notebook#Model)

In [None]:
# Honour the SCALING switch here as well, like the base-model and CV cells do.
if FULL_RUN and SCALING:
    # Fitted on all training rows.
    # Caution: X and test are overwritten in place, so running this cell twice
    # in one kernel session transforms already-transformed data.
    qt = QuantileTransformer(n_quantiles=1000, 
                             output_distribution='normal', 
                             random_state=42).fit(X[features])

    X[features] = qt.transform(X[features])
    test[features] = qt.transform(test[features])

In [None]:
def pred_test():
    """Fit five differently-seeded models on all training data and predict the test set.

    Reads the module-level ``X``, ``y``, ``test`` and ``max_epoch``. For each
    seed in ``range(5)`` a model from ``build_model`` is fitted on ``(X, y)``,
    wrapped in an isotonic ``CalibratedClassifierCV`` (``cv='prefit'``) and used
    to predict ``test`` without its ``f_27`` column.

    Returns
    -------
    list of numpy.ndarray
        One vector of positive-class probabilities per seed.
    """
    # max_epoch is the 0-based argmax of the validation history; +1 converts
    # that position into a number of boosting rounds.
    n_rounds = int(max_epoch) + 1
    # Same test matrix for every seed -> build it once.
    test_X = test.drop("f_27",axis =1)

    pred_full = []
    for seed in range(5):
        
        print(f"\n### Running seed {seed} ###")
        
        #take the max epoch from the baseline model
        model = build_model(epochs = n_rounds)
        # Actually apply the loop's seed; without this every iteration trained
        # with build_model's fixed random_state=42.
        model.set_params(random_state = seed)
        
        model.fit(X,y)
        
        #calibration
        # NOTE(review): the calibrator is fitted on the same rows the model was
        # trained on — confirm this is intended.
        print(" Calibrating")
        calibrator = CalibratedClassifierCV(model, method = "isotonic", cv='prefit')
        calibrator.fit(X, y)
        cal_preds = calibrator.predict_proba(test_X)

        pred_full.append(cal_preds[:,1])
    return pred_full

# The multi-seed refit only runs in full-run mode
if FULL_RUN:
    pred_full = pred_test()

# 📡 Submission 📡

In [None]:
# Full Submission vs CV: average the per-seed (full run) or per-fold (CV) test predictions
if FULL_RUN:
    stacked_preds, out_path = pred_full, "sub_full.csv"
else:
    #CV submission
    stacked_preds, out_path = preds, "submission_csv.csv"

sub["target"] = np.array(stacked_preds).mean(axis=0)
sub.to_csv(out_path)

In [None]:
# Distribution of the submitted probabilities
fig, ax = plt.subplots(figsize=(20, 8))
sns.histplot(sub["target"], ax=ax)
#sns.histplot(sub_full["target"],color = "red" , alpha = 0.5,label = "Full prediction")
plt.show()