In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel,RFECV
import joblib
from sklearn.model_selection import StratifiedKFold

In [20]:
# Load the denoised feature tables and the training labels.
# Forward slashes keep the paths portable across operating systems and avoid
# invalid escape sequences such as "\d" inside the string literals.
X = pd.read_feather('datasets/dataset1/Train.feather') # read from denoised features
X_test = pd.read_feather('datasets/dataset1/Test.feather')
y = pd.read_csv('datasets/dataset1/train_y.csv')

In [21]:
# Preview the first rows of the loaded training features.
X.head()

Unnamed: 0,ID,B_37,S_24,S_4,S_14,B_25,D_38,B_30,D_138,P_2,...,S_5,onehot__CL,onehot__CO,onehot__CR,onehot__XL,onehot__XM,onehot__XZ,onehot__O,onehot__R,onehot__U
0,3337446730,0.0,0.0,85.0,55.0,100.0,,2.0,51.0,100.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
1,7888784125,0.0,0.0,75.0,68.0,0.0,,8.0,55.0,68.0,...,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
2,9871378905,0.0,0.0,76.0,28.0,100.0,,0.0,85.0,92.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
3,8891869609,0.0,0.0,36.0,0.0,53.0,,0.0,52.0,70.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,100.0,0.0,0.0
4,2006443827,0.0,0.0,75.0,42.0,100.0,,0.0,82.0,100.0,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0


In [22]:
# Stratified 80/20 hold-out split (seed 42): both parts keep the label
# distribution of y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
# Flatten the label splits into 1-D NumPy arrays.
y_train, y_val = y_train.to_numpy().ravel(), y_val.to_numpy().ravel()

In [23]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

In [25]:
# Show the (rows, columns) of the training and validation splits.
X_train.shape, X_val.shape

((367130, 196), (91783, 196))

## Creating Meta Features

In [26]:
# Meta classifier 1: CatBoost, trained on the GPU and monitored with accuracy.
clf_cat1 = CatBoostClassifier(
    task_type='GPU',
    n_estimators=1000,
    learning_rate=0.05,
    eval_metric='Accuracy',
    random_seed=42,
)

# Meta classifier 2: LightGBM with DART boosting, a multiclass objective and
# multi-class error as its metric.
clf_lgb = lgb.LGBMClassifier(
    objective='multiclass',
    metric='multi_error',
    boosting_type='dart',
    num_leaves=64,
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
)

### Write-up:


- clf_cat1, clf_lgb will act as meta models
- `catb_yhat` and `lgb_yhat` store the out-of-fold (OOF) predictions that these meta models make on the training data.
- The `models_dict` dictionary has the following structure, which makes it easy to add or remove meta models:
    - Key: Meta Model Name
    - Value[0] : Classifier
    - Value[1] : The classifier's OOF predictions from 5-fold cross-validation. If $n$ is the number of rows of train data and $p$ is the number of meta models, each yhat has shape $n \times 3$, and the dictionary holds $p$ such yhats in total.


Each OOF prediction is made on data the model did not see while it was being fitted, so it summarises what the model learned from the rest of the train data. These predictions are therefore useful extra features that can boost overall model performance.

In [27]:
# Separate accumulators for each model's predicted OOF values (probabilities).
catb_yhat = []
lgb_yhat = []

In [28]:
# Registry of meta models: name -> [classifier, list collecting its OOF predictions].
models_dict = dict(
    CatB_1=[clf_cat1, catb_yhat],
    LGB_1=[clf_lgb, lgb_yhat],
)

In [29]:
def train_oof_predictions(X,y,models,verbose=True):
    """Collect out-of-fold (OOF) class probabilities from every model in ``models``.

    The rows are split with a shuffled, stratified 5-fold scheme (seed 42). In
    each fold every model is fitted on the training part and asked for class
    probabilities on the held-out part; the first three probability columns
    are appended to that model's prediction list.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected by position.
    y : array-like
        Labels, one per row of ``X``. Converted to a flat NumPy array so the
        fold indices are always applied by position, never by label.
    models : dict
        Maps a model name to a list whose element 0 is an estimator exposing
        ``fit`` / ``predict_proba`` and whose element 1 is the list receiving
        the OOF predictions. Those lists are emptied and then filled in place.
    verbose : bool, default True
        Print a progress message for each fold and each model.

    Returns
    -------
    data_x : pandas.DataFrame
        The held-out rows of ``X`` concatenated in fold order. Original row
        labels are kept, so the index is shuffled relative to ``X``.
    data_y : numpy.ndarray
        True labels in the same row order as ``data_x``.
    models : dict
        The same ``models`` object; each prediction list follows the row
        order of ``data_x``.
    """
    # Create a 5 fold stratified split
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 42)
    y = np.asarray(y).ravel()

    # Drop predictions left over from an earlier call; otherwise a re-run would
    # double the lists and they would no longer line up with data_x.
    for item in models:
        models[item][1].clear()

    # Held-out pieces are gathered here and concatenated once after the loop
    # (DataFrame.append is gone in pandas 2.0 and re-copies the frame each time).
    held_out_frames, data_y = [], []

    # Train_ix, Test_ix are positional indexes that keep track of shuffled data
    for train_ix, test_ix in kfold.split(X,y):
        if verbose:
            print("\n Starting a new fold\n")
            print("Creating splits")

        train_X, test_X = X.iloc[train_ix], X.iloc[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]

        held_out_frames.append(test_X)
        data_y.extend(test_y) # True Value

        for item in models:
            model = models[item][0]

            if verbose:
                print("Running", item,"on this fold")

            model.fit(train_X,train_y)
            # Predictions on the out-of-fold data.
            # There are 4 labels; only the first three probability columns are
            # kept because the fourth is determined by them (1 - sum of the rest).
            predictions = model.predict_proba(test_X)[:,:3]

            models[item][1].extend(predictions)

    data_x = pd.concat(held_out_frames)

    # Return the shuffled data_x, corresponding labels, and the corresponding OOF Predictions through the model dictionary
    return data_x, np.array(data_y), models

In [30]:
def addmetafeatures_TrainOOF(models,data_x):
    """Attach each model's stored OOF predictions to ``data_x`` as new columns.

    For a model named ``name`` the new columns are called
    ``meta<name[:3]>_<k>``, one per stored probability column.

    Parameters
    ----------
    models : dict
        Maps a model name to a list whose element 1 holds that model's OOF
        predictions, one entry per row of ``data_x`` and in the same row order.
    data_x : pandas.DataFrame
        Frame that receives the columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data_x`` object, now including the meta-feature columns.

    Raises
    ------
    ValueError
        If a model's number of stored predictions differs from the number of
        rows in ``data_x``.
    """
    for item in models:
        meta_features = pd.DataFrame(models[item][1])
        if len(meta_features) != len(data_x):
            raise ValueError(
                "OOF predictions for " + str(item) + " have "
                + str(len(meta_features)) + " rows but data_x has "
                + str(len(data_x))
            )

        prefix = 'meta' + item[:3] + '_'
        meta_features.columns = [prefix + str(k) for k in range(meta_features.shape[1])]

        # Match rows by position: column assignment aligns on index labels, so
        # without this a data_x whose index is not 0..n-1 would receive NaN or
        # misplaced values.
        meta_features.index = data_x.index
        data_x[meta_features.columns] = meta_features

    return data_x

def retrain_metaclassifier(models,data_x,data_y,verbose = True, dir = None):
    """Refit every model in ``models`` on the full data and optionally save it.

    To make predictions on validation or test (submission) data, each
    meta-model is trained on the entire training split rather than inside the
    cross-validation loop.

    Parameters
    ----------
    models : dict
        Maps a model name to a list whose element 0 is an estimator exposing
        ``fit``. The estimators are refitted in place.
    data_x, data_y
        Features and labels passed unchanged to ``fit``.
    verbose : bool, default True
        Print a message before each model is retrained.
    dir : str or None, default None
        When given, each fitted model is dumped with joblib to
        ``<dir>/<model name>.pkl``; the directory is created if it is missing.

    Returns
    -------
    dict
        The same ``models`` object, holding the refitted estimators.
    """
    save_models = dir is not None
    if save_models:
        # Imported only when saving, so plain retraining needs neither module.
        import os
        import joblib
        if dir:
            os.makedirs(dir, exist_ok=True)

    for item in models:
        if verbose:
            print("\n Retraining" + item +"\n")

        model = models[item][0]

        model.fit(data_x,data_y)

        # Save the trained model
        if save_models:
            joblib.dump(model, os.path.join(dir, item + '.pkl'))

    return models


def addmetafeatures_Predict(models,X):
    """Add each fitted model's predicted probabilities to ``X`` as new columns.

    Intended for use after the models have been retrained: every model
    predicts on the columns ``X`` had on entry, and the first three probability
    columns are stored as ``meta<name[:3]>_<k>``.

    Parameters
    ----------
    models : dict
        Maps a model name to a list whose element 0 is a fitted estimator
        exposing ``predict_proba``.
    X : pandas.DataFrame
        Feature table to predict on. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, now including the meta-feature columns.
    """
    # Snapshot of the input columns, so later models never see the meta
    # columns added by earlier ones.
    coln = X.columns
    for item in models:
        model = models[item][0]

        meta_features = pd.DataFrame(model.predict_proba(X[coln])[:,:3])
        prefix = 'meta' + item[:3] + '_'
        meta_features.columns = [prefix + str(k) for k in range(meta_features.shape[1])]

        # Match rows by position: column assignment aligns on index labels, so
        # without this an X whose index is not 0..n-1 would receive NaN or
        # misplaced values.
        meta_features.index = X.index
        X[meta_features.columns] = meta_features

    return X

In [21]:
# Train for OOF Predictions using 5 fold cross validation using stratification (Level 1)
# Level 1: run the stratified 5-fold OOF loop on the training split.
# Returns the held-out rows in fold order, their labels in the same order, and
# the models dictionary itself (trained_models is the same object as models_dict).
X_train_concat, y_train_concat, trained_models = train_oof_predictions(X_train, y_train, models_dict)


 Starting a new fold

Creating splits


  data_x = data_x.append(test_X)


Running CatB_1 on this fold
0:	learn: 0.7962949	total: 14.7ms	remaining: 14.7s
1:	learn: 0.7979939	total: 29.4ms	remaining: 14.7s
2:	learn: 0.8005203	total: 42.9ms	remaining: 14.2s
3:	learn: 0.8003023	total: 57.9ms	remaining: 14.4s
4:	learn: 0.8010854	total: 74.5ms	remaining: 14.8s
5:	learn: 0.8007926	total: 89.4ms	remaining: 14.8s
6:	learn: 0.8008063	total: 104ms	remaining: 14.8s
7:	learn: 0.8010276	total: 119ms	remaining: 14.7s
8:	learn: 0.8011025	total: 134ms	remaining: 14.8s
9:	learn: 0.8013204	total: 148ms	remaining: 14.7s
10:	learn: 0.8016745	total: 163ms	remaining: 14.6s
11:	learn: 0.8020354	total: 178ms	remaining: 14.6s
12:	learn: 0.8026074	total: 193ms	remaining: 14.7s
13:	learn: 0.8027470	total: 208ms	remaining: 14.7s
14:	learn: 0.8030602	total: 223ms	remaining: 14.7s
15:	learn: 0.8035914	total: 239ms	remaining: 14.7s
16:	learn: 0.8037378	total: 253ms	remaining: 14.6s
17:	learn: 0.8039012	total: 265ms	remaining: 14.5s
18:	learn: 0.8041157	total: 277ms	remaining: 14.3s
19:	lea

  data_x = data_x.append(test_X)


Running CatB_1 on this fold
0:	learn: 0.7952224	total: 18.2ms	remaining: 18.2s
1:	learn: 0.7973606	total: 34.6ms	remaining: 17.3s
2:	learn: 0.7994580	total: 49.4ms	remaining: 16.4s
3:	learn: 0.7998257	total: 66.2ms	remaining: 16.5s
4:	learn: 0.8007688	total: 88.9ms	remaining: 17.7s
5:	learn: 0.8004896	total: 105ms	remaining: 17.4s
6:	learn: 0.8005032	total: 121ms	remaining: 17.2s
7:	learn: 0.8009152	total: 136ms	remaining: 16.9s
8:	learn: 0.8010480	total: 152ms	remaining: 16.8s
9:	learn: 0.8014770	total: 167ms	remaining: 16.6s
10:	learn: 0.8018617	total: 182ms	remaining: 16.4s
11:	learn: 0.8020558	total: 195ms	remaining: 16.1s
12:	learn: 0.8023929	total: 209ms	remaining: 15.8s
13:	learn: 0.8025121	total: 222ms	remaining: 15.7s
14:	learn: 0.8027300	total: 236ms	remaining: 15.5s
15:	learn: 0.8029070	total: 249ms	remaining: 15.3s
16:	learn: 0.8030398	total: 262ms	remaining: 15.2s
17:	learn: 0.8032986	total: 275ms	remaining: 15s
18:	learn: 0.8033667	total: 287ms	remaining: 14.8s
19:	learn:

  data_x = data_x.append(test_X)


Running CatB_1 on this fold
0:	learn: 0.7963256	total: 14.5ms	remaining: 14.5s
1:	learn: 0.7982901	total: 27.6ms	remaining: 13.8s
2:	learn: 0.7991209	total: 40.2ms	remaining: 13.4s
3:	learn: 0.7987634	total: 54.1ms	remaining: 13.5s
4:	learn: 0.7999482	total: 69.5ms	remaining: 13.8s
5:	learn: 0.7999346	total: 82.7ms	remaining: 13.7s
6:	learn: 0.7999517	total: 95.8ms	remaining: 13.6s
7:	learn: 0.8003841	total: 109ms	remaining: 13.5s
8:	learn: 0.8020865	total: 121ms	remaining: 13.4s
9:	learn: 0.8019162	total: 137ms	remaining: 13.5s
10:	learn: 0.8022635	total: 149ms	remaining: 13.4s
11:	learn: 0.8024950	total: 162ms	remaining: 13.3s
12:	learn: 0.8031522	total: 177ms	remaining: 13.4s
13:	learn: 0.8034348	total: 190ms	remaining: 13.3s
14:	learn: 0.8036118	total: 202ms	remaining: 13.3s
15:	learn: 0.8037412	total: 214ms	remaining: 13.2s
16:	learn: 0.8037718	total: 226ms	remaining: 13.1s
17:	learn: 0.8039353	total: 239ms	remaining: 13s
18:	learn: 0.8040136	total: 252ms	remaining: 13s
19:	learn:

  data_x = data_x.append(test_X)


Running CatB_1 on this fold
0:	learn: 0.7960736	total: 15.1ms	remaining: 15.1s
1:	learn: 0.7980177	total: 29.7ms	remaining: 14.8s
2:	learn: 0.8003126	total: 43.2ms	remaining: 14.4s
3:	learn: 0.8004998	total: 57.2ms	remaining: 14.2s
4:	learn: 0.8014259	total: 71.4ms	remaining: 14.2s
5:	learn: 0.8010854	total: 85.5ms	remaining: 14.2s
6:	learn: 0.8010786	total: 99.8ms	remaining: 14.2s
7:	learn: 0.8011638	total: 114ms	remaining: 14.1s
8:	learn: 0.8013578	total: 128ms	remaining: 14.1s
9:	learn: 0.8015928	total: 142ms	remaining: 14.1s
10:	learn: 0.8019230	total: 156ms	remaining: 14s
11:	learn: 0.8022363	total: 169ms	remaining: 13.9s
12:	learn: 0.8027300	total: 184ms	remaining: 14s
13:	learn: 0.8030023	total: 199ms	remaining: 14s
14:	learn: 0.8032407	total: 214ms	remaining: 14s
15:	learn: 0.8035880	total: 229ms	remaining: 14.1s
16:	learn: 0.8036561	total: 243ms	remaining: 14s
17:	learn: 0.8040102	total: 256ms	remaining: 14s
18:	learn: 0.8041395	total: 271ms	remaining: 14s
19:	learn: 0.8043506

  data_x = data_x.append(test_X)


Running CatB_1 on this fold
0:	learn: 0.7964107	total: 13.8ms	remaining: 13.8s
1:	learn: 0.7979394	total: 26.8ms	remaining: 13.4s
2:	learn: 0.8006122	total: 39.1ms	remaining: 13s
3:	learn: 0.7997542	total: 52ms	remaining: 12.9s
4:	learn: 0.8012523	total: 64.9ms	remaining: 12.9s
5:	learn: 0.8009493	total: 77.9ms	remaining: 12.9s
6:	learn: 0.8009424	total: 90.5ms	remaining: 12.8s
7:	learn: 0.8014600	total: 103ms	remaining: 12.8s
8:	learn: 0.8014940	total: 116ms	remaining: 12.8s
9:	learn: 0.8015791	total: 129ms	remaining: 12.8s
10:	learn: 0.8022941	total: 143ms	remaining: 12.8s
11:	learn: 0.8026755	total: 157ms	remaining: 12.9s
12:	learn: 0.8028900	total: 170ms	remaining: 12.9s
13:	learn: 0.8031828	total: 183ms	remaining: 12.9s
14:	learn: 0.8035982	total: 196ms	remaining: 12.9s
15:	learn: 0.8040068	total: 209ms	remaining: 12.8s
16:	learn: 0.8041429	total: 221ms	remaining: 12.8s
17:	learn: 0.8042179	total: 233ms	remaining: 12.7s
18:	learn: 0.8044630	total: 244ms	remaining: 12.6s
19:	learn:

In [22]:
# Give the fold-ordered features a fresh 0..n-1 index and wrap both label
# arrays in single-column 'Default_Flag' frames.
X_train_concat = X_train_concat.reset_index(drop=True)
y_train_concat = pd.DataFrame(y_train_concat, columns=['Default_Flag']).reset_index(drop=True)
y_val_concat = pd.DataFrame(y_val, columns=['Default_Flag']).reset_index(drop=True)

In [23]:
# Persist the training and validation labels as feather files.
y_train_concat.to_feather('datasets/dataset_noiseoof/y_train.feather')
y_val_concat.to_feather('datasets/dataset_noiseoof/y_val.feather')

In [24]:
# Append the OOF probability columns to the fold-ordered training features
# (the helper writes into the frame it is given and returns it), then persist.
X_train_concat = addmetafeatures_TrainOOF(models_dict,X_train_concat)
X_train_concat.to_feather('datasets/dataset_noiseoof/X_train.feather')

In [25]:
# Retrain the meta-models on the train data. We will use this to make predictions on test and validation data
models_dict = retrain_metaclassifier(models_dict,X_train,y_train, dir='datasets/dataset_noiseoof')


 RetrainingCatB_1

0:	learn: 0.7958734	total: 18.6ms	remaining: 18.6s
1:	learn: 0.7980416	total: 35.5ms	remaining: 17.7s
2:	learn: 0.8004140	total: 51.5ms	remaining: 17.1s
3:	learn: 0.8004985	total: 68.3ms	remaining: 17s
4:	learn: 0.8014763	total: 84.7ms	remaining: 16.9s
5:	learn: 0.8013565	total: 102ms	remaining: 16.8s
6:	learn: 0.8012693	total: 118ms	remaining: 16.7s
7:	learn: 0.8015362	total: 134ms	remaining: 16.6s
8:	learn: 0.8015771	total: 151ms	remaining: 16.6s
9:	learn: 0.8017078	total: 167ms	remaining: 16.5s
10:	learn: 0.8020156	total: 180ms	remaining: 16.2s
11:	learn: 0.8022771	total: 193ms	remaining: 15.9s
12:	learn: 0.8025141	total: 206ms	remaining: 15.6s
13:	learn: 0.8027947	total: 220ms	remaining: 15.5s
14:	learn: 0.8030398	total: 233ms	remaining: 15.3s
15:	learn: 0.8032795	total: 247ms	remaining: 15.2s
16:	learn: 0.8033421	total: 261ms	remaining: 15.1s
17:	learn: 0.8036009	total: 274ms	remaining: 15s
18:	learn: 0.8036636	total: 289ms	remaining: 14.9s
19:	learn: 0.8040313

In [26]:
# Add the retrained models' probability columns to the validation features and
# persist them. addmetafeatures_Predict writes the columns into X_val itself,
# so X_val_concat and X_val are the same object afterwards.
# Forward slashes keep the output path portable and free of invalid escapes.
X_val_concat = addmetafeatures_Predict(models_dict,X_val)
X_val_concat.to_feather('datasets/dataset_noiseoof/X_val.feather')

In [30]:
# Preview the training features with the OOF meta-feature columns appended.
X_train_concat.head()

Unnamed: 0,ID,B_37,S_24,S_4,S_14,B_25,D_38,B_30,D_138,P_2,...,onehot__XZ,onehot__O,onehot__R,onehot__U,metaCat_0,metaCat_1,metaCat_2,metaLGB_0,metaLGB_1,metaLGB_2
0,9154471113,0.0,0.0,0.0,0.0,19.0,,61.0,9.0,49.0,...,0.0,0.0,0.0,100.0,0.502915,0.05762,0.191852,0.402654,0.094175,0.228125
1,4484721791,0.0,0.0,70.0,50.0,100.0,,0.0,81.0,72.0,...,0.0,100.0,0.0,0.0,0.99729,0.001045,0.00083,0.998064,0.000873,0.000551
2,5814976439,0.0,0.0,0.0,0.0,100.0,,0.0,71.0,98.0,...,0.0,100.0,0.0,0.0,0.997664,0.000193,0.000815,0.998541,0.000134,0.000434
3,1752935483,0.0,0.0,12.0,0.0,44.0,,2.0,65.0,90.0,...,0.0,100.0,0.0,0.0,0.988878,0.001574,0.003922,0.988263,0.002513,0.003659
4,7488720305,0.0,0.0,31.0,42.0,11.0,,45.0,29.0,72.0,...,0.0,100.0,0.0,0.0,0.899017,0.008915,0.034891,0.888371,0.011555,0.037257


In [28]:
# Preview the validation features with the predicted meta-feature columns appended.
X_val_concat.head()

Unnamed: 0,ID,B_37,S_24,S_4,S_14,B_25,D_38,B_30,D_138,P_2,...,onehot__XZ,onehot__O,onehot__R,onehot__U,metaCat_0,metaCat_1,metaCat_2,metaLGB_0,metaLGB_1,metaLGB_2
0,8827482872,0.0,0.0,66.0,42.0,100.0,,3.0,26.0,57.0,...,0.0,100.0,0.0,0.0,0.98704,0.001395,0.0054,0.987156,0.001343,0.004936
1,9998285343,0.0,0.0,0.0,0.0,58.0,,0.0,45.0,83.0,...,0.0,100.0,0.0,0.0,0.989636,0.000394,0.003184,0.987763,0.00032,0.002869
2,9273704936,0.0,0.0,40.0,28.0,100.0,,1.0,78.0,100.0,...,0.0,100.0,0.0,0.0,0.997248,0.000485,0.000998,0.998606,0.000388,0.00046
3,7932017154,0.0,0.0,0.0,0.0,100.0,,0.0,50.0,90.0,...,0.0,0.0,100.0,0.0,0.995283,0.000347,0.001589,0.997431,0.000289,0.000952
4,8076049797,0.0,0.0,0.0,0.0,100.0,,0.0,37.0,,...,0.0,0.0,0.0,100.0,0.973615,0.003674,0.012168,0.986292,0.001518,0.006537


In [27]:
# Add the retrained models' probability columns to the test features and
# persist them. addmetafeatures_Predict writes the columns into X_test itself,
# so X_test_concat and X_test are the same object afterwards.
X_test_concat = addmetafeatures_Predict(models_dict,X_test)
X_test_concat.to_feather('datasets/dataset_noiseoof/X_test.feather')