<h1 align='center'> ❎ XGBoost Classifier Tab Hack 2.0 🚨</h1>

This notebook trains an XGBoost classifier for Tab Hack 2.0, a machine learning hackathon organized by IITG.ai, the Artificial Intelligence Community of IIT Guwahati.


For more such hackathons and everything AI, do follow IITG.ai on these socials:
    
* [Linkedin](https://www.linkedin.com/company/iitg-ai/)
* [Website](https://www.iitg.ac.in/sa/ai/#/)
* [Instagram](https://www.instagram.com/iitg.ai/)

## 📦 Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time
 
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

import optuna
from optuna import Trial, visualization

## 🔃 Data Loading

In [2]:
# Read the train and test splits from the relative ../input directory.
train = pd.read_csv('../input/tab-hack-20/Week8_train.csv')
test = pd.read_csv('../input/tab-hack-20/Week8_test.csv')
# Keep the test ids aside: the 'id' column is dropped from both frames during
# preprocessing, but it is needed again when the submission frame is built.
id_col = test['id']

In [3]:
# Class imbalance
# NOTE: the figure printed below is the class-1 count divided by the class-0
# count (expressed in percent), i.e. a minority-to-majority ratio rather than
# the share of class 1 among all rows.
target_counts = train['TARGET'].value_counts()
perc = (target_counts[1] / target_counts[0]) * 100
print(f'Percentage of Minority Class {perc}')

Percentage of Minority Class 24.093904908970874


In [4]:
# Large dataset: show the (rows, columns) shape of the training frame
train.shape

(110899, 483)

## ⚒ Data Preprocessing

In [5]:
''' Utils '''

def get_constant_features(df, threshold=0.99, dropna=False):
    '''
    For a given dataframe, identify the constant and quasi constant features.
    To get all the constant & quasi constant features in a list - constant_features_df['Var'].to_list()

    A feature is reported as 'Constant' when its most frequent value covers
    every counted row, and as 'Quasi Constant' when that share is strictly
    greater than `threshold`. Columns with no counted values at all (e.g. an
    all-NaN column with dropna=True, or a frame with zero rows) are skipped
    instead of raising.

    Parameters:
    -----------
        df: 'dataframe'
        threshold: 'float'. default = 0.99
            Minimum (exclusive) share of the most frequent value for a
            feature to be flagged as quasi constant.
        dropna: 'bool'. default = false
            Forwarded to `Series.value_counts`; when False, NaN is counted
            as a value of its own.

    Returns:
    --------
        constant_features_df: 'dataframe'
            Columns 'Desc', 'Var', 'Value', 'Perc' (percentage, 0-100),
            sorted by 'Perc' in descending order with a fresh index.
    '''
    columns = ['Desc', 'Var', 'Value', 'Perc']
    records = []

    for var in df.columns:
        freqs = df[var].value_counts(normalize=True, dropna=dropna)
        if freqs.empty:
            # Nothing was counted for this column, so there is no dominant value.
            continue

        # value_counts sorts by frequency, so the first entry is the mode.
        value = freqs.index[0]
        perc = freqs.iloc[0]

        if perc == 1:
            desc = 'Constant'
        elif perc > threshold:
            desc = 'Quasi Constant'
        else:
            continue

        records.append({'Desc': desc, 'Var': var, 'Value': value, 'Perc': 100 * perc})

    # Build the frame once instead of growing it row by row.
    constant_features_df = pd.DataFrame(records, columns=columns)

    # A stable sort keeps the original column order among equal percentages.
    constant_features_df = constant_features_df.sort_values(
        by='Perc', ascending=False, ignore_index=True, kind='mergesort'
    )

    return constant_features_df

### 🔻 Reducing Memory Usage

In [6]:
# Datatype conversions: halve the width of 64-bit numeric columns in both
# frames to reduce memory usage.
# NOTE(review): int64 -> int32 assumes every integer value fits in 32 bits;
# confirm against the data.
downcast_map = {'float64': 'float32', 'int64': 'int32'}

for frame in (train, test):
    for col in frame.columns:
        narrow_dtype = downcast_map.get(str(frame[col].dtypes))
        if narrow_dtype is not None:
            frame[col] = frame[col].astype(narrow_dtype)

### 💧 Dropping Constant Features

In [7]:
# With threshold=1 only the 'Constant' branch of get_constant_features can
# fire, so this collects the columns holding a single value in train.
constant_features = get_constant_features(train, threshold=1, dropna=False)

# Drop those columns plus the 'id' column from both frames.
lst = constant_features['Var'].to_list() + ['id']
for frame in (train, test):
    frame.drop(columns=lst, inplace=True)

### 🏹 Missing Value Imputation

In [8]:
# Fill missing values with a fixed placeholder per column, using the same
# placeholder for train and test.
fill_values = {'V_434': 'Private', 'V_301': 'Private', 'V_304': 'U'}

for frame in (train, test):
    for col, placeholder in fill_values.items():
        frame[col] = frame[col].fillna(placeholder)

### 😢 Dropping Features with Zero Mutual Information (MI) Scores

In [9]:
# from sklearn.feature_selection import mutual_info_classif
# X_train = train.drop(columns='TARGET').copy()
# y_train = train['TARGET']
# mi = mutual_info_classif(X_train, y_train, random_state=2021)
# mi = pd.Series(mi)
# mi.index = X_train.columns
# mi = mi.sort_values(ascending=False)
# df = pd.DataFrame(mi, columns = ['mi_value'])
# lst = df[df['mi_value']==0].index.tolist()

In [10]:
# The list of zero-MI features is hard-coded here because computing the MI scores (see the commented-out cell above) takes a while
lst = ['V_357','V_362','V_354','V_365','V_359','V_360','V_353','V_348','V_1','V_412','V_368','V_369','V_436','V_437','V_438', 'V_439','V_441','V_442','V_444','V_448','V_451','V_452','V_453','V_454','V_456','V_457','V_458','V_461','V_466','V_467','V_469','V_470','V_474','V_477','V_479','V_480','V_481','V_429','V_426','V_424','V_390','V_370','V_371','V_374','V_375','V_376','V_377','V_378','V_384','V_385','V_387','V_388','V_392','V_423','V_394','V_396','V_397','V_398', 'V_399','V_402','V_408','V_342','V_418', 'V_420', 'V_421', 'V_343','V_240','V_341','V_108','V_118','V_116','V_114','V_113', 'V_111','V_110','V_106','V_170','V_104','V_102','V_100', 'V_99', 'V_93', 'V_92','V_119','V_124','V_126','V_127','V_131','V_140','V_141','V_143','V_144','V_146','V_148', 'V_152','V_153','V_156', 'V_165', 'V_167', 'V_168', 'V_91', 'V_90', 'V_87', 'V_43', 'V_3', 'V_6', 'V_7', 'V_10', 'V_12', 'V_14', 'V_19', 'V_25', 'V_28', 'V_32', 'V_33', 'V_35', 'V_37', 'V_38', 'V_44', 'V_86', 'V_49', 'V_53', 'V_56', 'V_62', 'V_63', 'V_64', 'V_69', 'V_71', 'V_73', 'V_74', 'V_75', 'V_78', 'V_80', 'V_81', 'V_169', 'V_171', 'V_339', 'V_270', 'V_288', 'V_286', 'V_284', 'V_283', 'V_281', 'V_272','V_269','V_172','V_263','V_262','V_260','V_258','V_253','V_250','V_292','V_295','V_296','V_299','V_300','V_306','V_309','V_310','V_315', 'V_319','V_321','V_323','V_324','V_328','V_329','V_330','V_331','V_249','V_244','V_242','V_204','V_175','V_177','V_178','V_180','V_182','V_186','V_187','V_188','V_192','V_196','V_197','V_198','V_200','V_202','V_205','V_2','V_208','V_209','V_213','V_214','V_217','V_219', 'V_221','V_223','V_224','V_226','V_230','V_231','V_235','V_239','V_482']
# Remove the listed columns from both frames so train and test stay aligned.
for frame in (train, test):
    frame.drop(columns=lst, inplace=True)

### 🏷 Label Encoding the Categorical Features

In [11]:
# Encode every object-typed column as integer codes.
object_cols = list(train.dtypes[train.dtypes == 'object'].keys())

for col in object_cols:
    # Use a fresh encoder per column and fit it on the union of train and test
    # values, so categories that appear only in test still get a code.
    # The encoder must NOT be refit on train alone afterwards: that would
    # discard the combined fit and make transform() fail on unseen test labels.
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

### ❎ Creating folds for Cross Validation

In [12]:
# 'kfold' is excluded alongside 'TARGET' so that re-running this cell (when the
# column already exists) cannot leak the fold id into the feature set.
feature_cols = [col for col in train.columns.tolist() if col not in ['TARGET', 'kfold']]
target_cols = ['TARGET']

# Stratified KFold due to class imbalance
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# skf.split yields positional indices, so collect the fold ids in a positional
# array instead of using them as index labels with .loc (which is only correct
# while `train` has a default RangeIndex).
fold_ids = np.full(len(train), -1, dtype=int)
for i, (trn, val) in enumerate(skf.split(train[feature_cols], train[target_cols])):
    fold_ids[val] = i
train['kfold'] = fold_ids

## 🚂 Model Training

In [13]:
# Parameters with random state 7 for randomundersampler (gpu)
# params = {
#             'n_estimators':1000, 
#             'subsample':0.6,
#             'colsample_bytree':0.6, 
#             'eta':0.028088855508628937,
#             'max_depth':5,
#             'min_child_weight':4,
#              'random_state':42,
#             'tree_method':'gpu_hist'
#         }

In [14]:
# 42 (gpu) and 7 (gpu)
def run_training():
    '''
    Train one XGBoost classifier per cross-validation fold and collect
    out-of-fold and test probabilities.

    Reads the module-level `train`, `test`, `feature_cols` and `target_cols`.
    For each fold id found in train['kfold'], the training part is randomly
    undersampled, a model is fitted, accuracy / macro-F1 are printed for the
    (undersampled) training part and the untouched validation part, and the
    predicted probabilities are stored.

    Returns:
    --------
        oof: array of shape (len(train), 2)
            Validation-fold class probabilities for every training row.
        pred: array of shape (len(test), 2)
            Test class probabilities averaged over the folds.
    '''
    # Hyperparameters are identical for every fold, so define them once.
    params = {
        "tree_method": "gpu_hist",
        "random_state": 42,
        'n_estimators': 100,
        'subsample': 0.95,
        'colsample_bytree': 0.9,
        'eta': 0.09,
#             'alpha': 0.1,
        'max_depth': 4,
        'gamma':0.15,
#             'min_child_weight': 10
    }

    # Derive the folds from the data instead of hard-coding their number.
    folds = sorted(train['kfold'].unique())

    oof = np.zeros((train.shape[0], 2))
    pred = np.zeros((test.shape[0], 2))
    xtest = test[feature_cols].values  # does not change between folds

    for fold in folds:

        print(f"\nFOLD: {fold}")
        start = time.time()

        # Plain boolean array: usable for both the DataFrame and the numpy oof.
        val_mask = (train['kfold'] == fold).values
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values.ravel()
        xval, yval = val[feature_cols].values, val[target_cols].values.ravel()

        # Only the training part is rebalanced; validation keeps the original
        # class distribution, so train and validation scores are not directly
        # comparable.
        undersample = RandomUnderSampler(sampling_strategy = 0.65,random_state = 420)
        xtr, ytr = undersample.fit_resample(xtr, ytr)
        ytr = ytr.reshape(-1,)

        model = xgb.XGBClassifier(**params)
        model.fit(xtr, ytr)

        # Predict once per split and reuse the labels for both metrics.
        trn_labels = model.predict(xtr)
        val_labels = model.predict(xval)

        print("Training Accuracy Score - ", accuracy_score(ytr, trn_labels))
        print("Training F1 Score - ", f1_score(ytr, trn_labels, average='macro'))

        print("Validation Accuracy Score - ", accuracy_score(yval, val_labels))
        print("Validation F1 Score - ", f1_score(yval, val_labels, average='macro'))

        oof[val_mask, :] += model.predict_proba(xval)

        # Test preds: average over the folds
        pred += model.predict_proba(xtest) / len(folds)

        print(f"FOLD {fold} completed in {time.time()-start} seconds")

    return oof, pred

oof, pred = run_training()


FOLD: 0
Training Accuracy Score -  0.708738308139136
Training F1 Score -  0.645128272679947
Validation Accuracy Score -  0.8100991884580704
Validation F1 Score -  0.6462937324289627
FOLD 0 completed in 4.9008948802948 seconds

FOLD: 1
Training Accuracy Score -  0.7067944290712832
Training F1 Score -  0.6426333354612297
Validation Accuracy Score -  0.812082957619477
Validation F1 Score -  0.6499086370177222
FOLD 1 completed in 2.3199987411499023 seconds

FOLD: 2
Training Accuracy Score -  0.7048370497427101
Training F1 Score -  0.6378878557920964
Validation Accuracy Score -  0.8177186654643823
Validation F1 Score -  0.6591988338557042
FOLD 2 completed in 2.463948965072632 seconds

FOLD: 3
Training Accuracy Score -  0.7098456260720412
Training F1 Score -  0.6455729148738982
Validation Accuracy Score -  0.8111361587015329
Validation F1 Score -  0.6512076401667606
FOLD 3 completed in 2.1019086837768555 seconds

FOLD: 4
Training Accuracy Score -  0.707206074050358
Training F1 Score -  0.64

In [15]:
def find_oof_score(oof):
    '''
    Macro-averaged F1 of out-of-fold predictions against the training target.

    `oof` is indexed along axis 1 to pick the highest-scoring column per row;
    the resulting labels are compared with the module-level
    train[target_cols].
    '''
    predicted_labels = oof.argmax(axis=1)
    true_labels = train[target_cols]
    return f1_score(true_labels, predicted_labels, average='macro')

find_oof_score(oof)

0.6522287312834548

## 🎶 Hyperparameter Tuning with Optuna (Template Code)

In [16]:
def fit_xgb(trial, xtr, ytr, xval, yval):
    '''
    Fit an XGBoost classifier with hyperparameters suggested by an Optuna trial.

    Parameters:
    -----------
        trial: Optuna trial used to sample the hyperparameters
        xtr, ytr: training features and labels
        xval, yval: validation features and labels

    Returns:
    --------
        model: the fitted xgb.XGBClassifier
        y_val_pred: predicted labels for xval
        log: dict with macro-F1 on the training and validation data
             (keys 'train f1' and 'valid f1')
    '''
    # suggest_float(step=...) / suggest_float(log=True) replace the deprecated
    # suggest_discrete_uniform / suggest_loguniform with the same search space.
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100,150]), #50,100,150,500,1000,1500
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1, step=0.1),
        "eta": trial.suggest_float("eta", 1e-2, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        "gamma": trial.suggest_float("gamma", 0.05, 1, log=True),
        "max_depth": trial.suggest_categorical("max_depth",[3,5,7]),
        "min_child_weight": trial.suggest_int("min_child_weight",0,120),
        "tree_method": "gpu_hist",
        "random_state": 42
    }

    model = xgb.XGBClassifier(**params)
    model.fit(xtr, ytr.reshape(-1,),eval_set = [(xval,yval)],verbose = False)

    y_val_pred = model.predict(xval)

    log = {
        "train f1": f1_score(ytr, model.predict(xtr), average="macro"),
        "valid f1": f1_score(yval, y_val_pred, average="macro")
    }

    return model, y_val_pred, log

def objective(trial):
    '''
    Optuna objective: cross-validated macro-F1 of the trial's hyperparameters.

    For each fold id in the module-level train['kfold'], the training part is
    randomly undersampled, a model is fitted through fit_xgb, and its
    validation probabilities are written to the out-of-fold array. The value
    returned is find_oof_score of that array.
    '''
    oof = np.zeros((train.shape[0], 2))

    # Derive the folds from the data instead of hard-coding their number.
    for fold in sorted(train['kfold'].unique()):
        # Plain boolean array: usable for both the DataFrame and the numpy oof.
        val_mask = (train['kfold'] == fold).values
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values
        xval, yval = val[feature_cols].values, val[target_cols].values

        # Only the training part is rebalanced; validation stays untouched.
        undersample = RandomUnderSampler(sampling_strategy = 0.65,random_state = 42)
        xtr, ytr = undersample.fit_resample(xtr, ytr)

        # Only the fitted model is needed here; the other return values are unused.
        model, _, _ = fit_xgb(trial, xtr, ytr, xval, yval)
        oof[val_mask, :] += model.predict_proba(xval)

    return find_oof_score(oof)

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials = 50)

# history = study.trials_dataframe()
# history.sort_values(by="value", ascending=False)

## 📝 Submission

In [17]:
# Pick, for every test row, the class whose fold-averaged probability is highest.
final_preds = pred.argmax(axis=1)

# Build the submission frame: predicted label first, then the saved test ids.
pred_csv = pd.DataFrame({'TARGET': final_preds.reshape(-1)})
pred_csv['id'] = id_col
pred_csv

Unnamed: 0,TARGET,id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
110894,0,110894
110895,0,110895
110896,1,110896
110897,0,110897


In [18]:
# Write the submission file without the DataFrame index.
pred_csv.to_csv('submission.csv', index=False)

# Persist the out-of-fold and test probability arrays alongside it.
for out_path, array in (('oof_xgb.npy', oof), ('pred_xgb.npy', pred)):
    np.save(out_path, array)