<h1 align='center'> 🐈 CatBoost Classifier Tab Hack 2.0 🐱‍🏍</h1>

This is the CatBoost classifier notebook for Tab Hack 2.0, a machine learning hackathon organized by IITG.ai, the Artificial Intelligence Community of IIT Guwahati.


For more such hackathons and everything AI, do follow IITG.ai on these socials:
    
* [LinkedIn](https://www.linkedin.com/company/iitg-ai/)
* [Website](https://www.iitg.ac.in/sa/ai/#/)
* [Instagram](https://www.instagram.com/iitg.ai/)

## 📦 Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from time import time

 
import catboost
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score

# Optuna Imports
import optuna
from optuna import Trial, visualization

## 🔃 Data Loading

In [2]:
# Read the train and test CSV files from the Kaggle input directory.
train = pd.read_csv('../input/tab-hack-20/Week8_train.csv')
test = pd.read_csv('../input/tab-hack-20/Week8_test.csv')
# Keep a copy of the test ids: the 'id' column is dropped from both frames during
# preprocessing and is attached again when the submission frame is built.
id_col = test['id']

In [3]:
# Class Imbalance
# NOTE: `perc` is the number of rows with TARGET == 1 divided by the number of rows
# with TARGET == 0 (times 100), i.e. a class-to-class ratio, not the share of
# TARGET == 1 rows in the whole training set.
perc = (train['TARGET'].value_counts()[1]/train['TARGET'].value_counts()[0])*100
print(f'Percentage of Minority Class {perc}')

Percentage of Minority Class 24.093904908970874


In [4]:
# Large Dataset
train.shape

(110899, 483)

### 🔮 Attempted Pseudo Labelling

In [5]:
# catboosti = pd.read_csv('../input/catboost/submission (6).csv')
# pseudo_label = pd.DataFrame()
# pseudo_label['TARGET'] = catboosti['TARGET']
# pseudo_label = pd.concat([decision_tree, lgbm, catboost], axis=1)
# pseudo_label['final'] = np.where(pseudo_label.sum(axis=1) > 1, 1, 0)
# test['TARGET'] = [x for x in pseudo_label['TARGET']]
# train = pd.concat([train, test], axis=0)
# train = train.reset_index(drop = True)
# test.drop(['TARGET'],axis = 'columns',inplace = True)

## ⚒ Data Preprocessing

In [6]:
''' Utils '''

def get_constant_features(df, threshold=0.99, dropna=False):
    '''
    For a given dataframe, identify the constant and quasi constant features.
    To get all the constant & quasi constant features in a list - constant_features_df['Var'].to_list()

    A feature is 'Constant' when its most frequent value covers every counted row,
    and 'Quasi Constant' when that value covers a fraction strictly greater than
    `threshold`. Columns with nothing to count (zero-row frame, or an all-NaN
    column with dropna=True) are skipped instead of raising IndexError.

    Parameters:
    -----------
        df: 'dataframe'
        threshold: 'float'. default = 0.99
            Fraction (0-1) above which the dominant value makes a feature quasi constant.
        dropna: 'bool'. default = false
            Forwarded to value_counts; when False, NaN is counted as a value of its own.

    Returns:
    --------
        constant_features_df: 'dataframe'
            Columns 'Desc', 'Var', 'Value', 'Perc' (Perc on a 0-100 scale),
            sorted by 'Perc' in descending order with a fresh 0..n-1 index.
    '''
    records = []
    for var in df.columns:
        shares = df[var].value_counts(normalize=True, dropna=dropna)
        if shares.empty:
            # Nothing was counted for this column, so there is no dominant value to report.
            continue

        # value_counts sorts by frequency, so position 0 holds the dominant value.
        value = shares.index[0]
        perc = shares.iloc[0]

        if perc == 1:
            desc = 'Constant'
        elif perc > threshold:
            desc = 'Quasi Constant'
        else:
            continue

        records.append({'Desc': desc, 'Var': var, 'Value': value, 'Perc': 100 * perc})

    # Build the frame once from the collected records instead of growing it row by row.
    constant_features_df = pd.DataFrame(records, columns=['Desc', 'Var', 'Value', 'Perc'])
    constant_features_df = constant_features_df.sort_values(by='Perc', ascending=False, ignore_index=True)

    return constant_features_df

### 🔻 Reducing Memory Usage

In [7]:
# Datatype Conversions: store 64-bit float/int columns of both frames as 32-bit
downcast_map = {'float64': 'float32', 'int64': 'int32'}
for frame in (train, test):
    for col in frame.columns:
        for wide, narrow in downcast_map.items():
            if frame[col].dtypes == wide:
                frame[col] = frame[col].astype(narrow)
                break

### 💧 Dropping Constant Features

In [8]:
# threshold=1 means only strictly constant columns are reported (nothing can exceed 100%).
constant_features = get_constant_features(train, threshold=1, dropna=False)
# The row identifier is dropped together with the constant columns.
lst = [*constant_features['Var'], 'id']
for frame in (train, test):
    frame.drop(columns=lst, inplace=True)

### 🏹 Missing Value Imputation

In [9]:
# Placeholder used for missing entries of each imputed column, applied to train first, then test.
fill_values = {'V_434': 'Private', 'V_301': 'Private', 'V_304': 'U'}
for frame in (train, test):
    for col, filler in fill_values.items():
        frame[col] = frame[col].fillna(filler)

### 😢 Dropping features with zero mutual information (MI) scores

In [10]:
# from sklearn.feature_selection import mutual_info_classif
# X_train = train.drop(columns='TARGET').copy()
# y_train = train['TARGET']
# mi = mutual_info_classif(X_train, y_train, random_state=2021)
# mi = pd.Series(mi)
# mi.index = X_train.columns
# mi = mi.sort_values(ascending=False)
# df = pd.DataFrame(mi, columns = ['mi_value'])
# df[df['mi_value']==0].index.tolist()

In [11]:
lst = ['V_357','V_362','V_354','V_365','V_359','V_360','V_353','V_348','V_1','V_412','V_368','V_369','V_436','V_437','V_438', 'V_439','V_441','V_442','V_444','V_448','V_451','V_452','V_453','V_454','V_456','V_457','V_458','V_461','V_466','V_467','V_469','V_470','V_474','V_477','V_479','V_480','V_481','V_429','V_426','V_424','V_390','V_370','V_371','V_374','V_375','V_376','V_377','V_378','V_384','V_385','V_387','V_388','V_392','V_423','V_394','V_396','V_397','V_398', 'V_399','V_402','V_408','V_342','V_418', 'V_420', 'V_421', 'V_343','V_240','V_341','V_108','V_118','V_116','V_114','V_113', 'V_111','V_110','V_106','V_170','V_104','V_102','V_100', 'V_99', 'V_93', 'V_92','V_119','V_124','V_126','V_127','V_131','V_140','V_141','V_143','V_144','V_146','V_148', 'V_152','V_153','V_156', 'V_165', 'V_167', 'V_168', 'V_91', 'V_90', 'V_87', 'V_43', 'V_3', 'V_6', 'V_7', 'V_10', 'V_12', 'V_14', 'V_19', 'V_25', 'V_28', 'V_32', 'V_33', 'V_35', 'V_37', 'V_38', 'V_44', 'V_86', 'V_49', 'V_53', 'V_56', 'V_62', 'V_63', 'V_64', 'V_69', 'V_71', 'V_73', 'V_74', 'V_75', 'V_78', 'V_80', 'V_81', 'V_169', 'V_171', 'V_339', 'V_270', 'V_288', 'V_286', 'V_284', 'V_283', 'V_281', 'V_272','V_269','V_172','V_263','V_262','V_260','V_258','V_253','V_250','V_292','V_295','V_296','V_299','V_300','V_306','V_309','V_310','V_315', 'V_319','V_321','V_323','V_324','V_328','V_329','V_330','V_331','V_249','V_244','V_242','V_204','V_175','V_177','V_178','V_180','V_182','V_186','V_187','V_188','V_192','V_196','V_197','V_198','V_200','V_202','V_205','V_2','V_208','V_209','V_213','V_214','V_217','V_219', 'V_221','V_223','V_224','V_226','V_230','V_231','V_235','V_239','V_482']
# Remove the listed columns from both frames so train and test keep the same features.
for frame in (train, test):
    frame.drop(columns=lst, inplace=True)

### 👨‍🔬 Feature Engineering Attempt

In [12]:
# train['cool'] = np.where(train['V_27']*train['V_42']!=0,1,0)
# test['cool'] = np.where(test['V_27']*test['V_42']!=0,1,0)

# train['coolv2'] = np.where(train['V_27']*train['V_98']!=0,1,0)
# test['coolv2'] = np.where(test['V_27']*test['V_98']!=0,1,0)

# for col in list(train.dtypes[train.dtypes == 'float32'].keys()):
#     temp = train.groupby('V_434')[col].agg(['mean']).rename({'mean':f'col_V_434_mean'},axis=1)
#     train = pd.merge(train,temp,on='V_434',how='left')
#     temp = test.groupby('V_434')[col].agg(['mean']).rename({'mean':f'col_V_434_mean'},axis=1)
#     test = pd.merge(test,temp,on='V_434',how='left')

# train['V_434_301'] = train['V_434'] + '_' + train['V_301']
# test['V_434_301'] = test['V_434'] + '_' + test['V_301']

# train['V_245_400'] = train['V_245'] + '_' + train['V_400']
# test['V_245_400'] = test['V_245'] + '_' + test['V_400']


# lst = ['V_434','V_301']
# for col in lst:
#     cool = pd.concat([train[col],test[col]])
#     temp = cool.value_counts().to_dict()
#     train[f'{col}_counts'] = train[col].map(temp)
# #     temp = test[col].value_counts().to_dict()
#     test[f'{col}_counts'] = test[col].map(temp)

# object_cols = list(train.dtypes[train.dtypes == 'object'].keys())
# le = preprocessing.LabelEncoder()
# for col in object_cols:
#     train[col] = le.fit_transform(train[col])
#     test[col] = le.transform(test[col])
    
# lst = ['V_434','V_301']
# for col in lst:
#     cool = train[col]
#     temp = cool.value_counts().to_dict()
#     train[f'{col}_counts'] = train[col].map(temp)
#     test[f'{col}_counts'] = test[col].map(temp)

### 🏷 Label Encoding the Categorical Features

In [13]:
# Integer-encode every object-dtype column of train; the same columns are encoded in test.
object_cols = list(train.dtypes[train.dtypes == 'object'].keys())
le = preprocessing.LabelEncoder()
for col in object_cols:
    # Fit on the values of this column from both frames so that categories appearing
    # only in test can still be transformed. Only the single column is concatenated;
    # stacking the complete frames on every iteration is unnecessary work.
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])


### ❎ Creating folds for Cross Validation

In [14]:
# 'kfold' is bookkeeping added below, not a predictor. Excluding it here keeps the
# feature list correct when this cell is executed again after the column exists.
feature_cols = [col for col in train.columns.tolist() if col not in ['TARGET', 'kfold']]
target_cols = ['TARGET']

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# split() yields positional row indices, so the fold ids are written into a positional
# array rather than through label-based .loc (which only matches on a 0..n-1 index).
fold_ids = np.full(len(train), -1, dtype=int)
for i, (trn, val) in enumerate(skf.split(train[feature_cols], train[target_cols])):
    fold_ids[val] = i
train['kfold'] = fold_ids

### 🎁 Function to plot feature importances

In [15]:
def plot_feature_importance(importance,names,model_type, figsize=(10, 8)):
    """
    Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as `importance`.
    model_type : str
        Prefix used in the chart title.
    figsize : tuple, default (10, 8)
        Size handed to `plt.figure`.
    """
    importance_values = np.array(importance)
    name_values = np.array(names)

    # Pair names with values and order the rows by decreasing importance
    fi_df = pd.DataFrame(
        {'feature_names': name_values, 'feature_importance': importance_values}
    ).sort_values(by=['feature_importance'], ascending=False)

    # Seaborn bar chart on a new figure of the requested size
    plt.figure(figsize=figsize)
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    # Title and axis labels
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

## 🚂 Model Training

In [16]:
# 42-gpu   69-gpu
def find_oof_score(oof):
    """Return the macro-averaged F1 between train[target_cols] and the row-wise argmax of `oof`."""
    predicted_labels = oof.argmax(axis=1)
    y_true = train[target_cols]
    return f1_score(y_true, predicted_labels, average='macro')

def run_training(random_state):
    """
    Fit one CatBoost classifier per cross-validation fold.

    For each fold id found in train['kfold'], the rows of the other folds are
    randomly undersampled (sampling_strategy=0.65), a classifier is fitted on the
    resampled rows, and its class probabilities are stored for the held-out fold
    and for the test set.

    Parameters
    ----------
    random_state : int
        Seed handed to RandomUnderSampler. It is not passed to the CatBoost model.

    Returns
    -------
    oof : np.ndarray of shape (len(train), 2)
        Class probabilities for every training row, produced by the model whose
        training rows did not include that row's fold.
    pred : np.ndarray of shape (len(test), 2)
        Test-set class probabilities averaged over the fold models.

    Notes
    -----
    The printed "Training" metrics are computed on the undersampled training rows,
    whereas the "Validation" metrics use the untouched held-out fold, so the two
    accuracies are measured on differently balanced data.
    """
    # Loop-invariant pieces are built once instead of once per fold.
    params = {
        'task_type':'GPU',
        'verbose':0,
        # Overrides tried earlier, currently disabled:
        # 'max_depth': 5,
        # 'learning_rate': 0.1,
        # 'n_estimators': 500,
        # 'max_bin': 334,
        # 'min_data_in_leaf': 80,
        # 'l2_leaf_reg': 0.062365205524235925,
        # 'subsample': 0.6550580361245057,
    }
    xtest = test[feature_cols].values

    # Take the folds from the data so the loop and the averaging below always agree.
    fold_ids = sorted(train['kfold'].unique())
    n_folds = len(fold_ids)

    oof = np.zeros((train.shape[0], 2))
    pred = np.zeros((test.shape[0], 2))

    for fold in fold_ids:

        print(f"\nStarting FOLD: {fold}")
        start = time()

        # Boolean numpy mask: usable for both the DataFrame selection and the oof array.
        val_mask = (train['kfold'] == fold).values
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values
        xval, yval = val[feature_cols].values, val[target_cols].values

        # Only the training part is resampled; the validation fold keeps its class balance.
        undersample = RandomUnderSampler(sampling_strategy = 0.65,random_state = random_state)
        xtr,ytr = undersample.fit_resample(xtr,ytr)

        model = catboost.CatBoostClassifier(**params)
        model.fit(xtr, ytr.reshape(-1,))

        # Predict each set once and reuse the labels for both metrics.
        train_labels = model.predict(xtr)
        val_labels = model.predict(xval)

        print("Training Accuracy Score - ", accuracy_score(ytr, train_labels))
        print("Training F1 Score - ", f1_score(ytr, train_labels, average='macro'))

        print("Validation Accuracy Score - ", accuracy_score(yval, val_labels))
        print("Validation F1 Score - ", f1_score(yval, val_labels, average='macro'))

        oof[val_mask, :] += model.predict_proba(xval)

        # Test preds: each fold model contributes an equal share of the average
        pred += model.predict_proba(xtest) / n_folds

        print(f"FOLD {fold} completed in {time()-start} seconds")
#     plot_feature_importance(model.feature_importances_, train.columns, 'CatBoost', figsize=(8, 16))
#     plt.show()
    return oof, pred

# Train the fold models with undersampling seed 69; `oof` holds the out-of-fold
# class probabilities for train and `pred` the fold-averaged probabilities for test.
oof, pred = run_training(69)


Starting FOLD: 0
Training Accuracy Score -  0.7129004962608915
Training F1 Score -  0.6532125802254822
Validation Accuracy Score -  0.8065825067628494
Validation F1 Score -  0.6461797044863049
FOLD 0 completed in 141.72901940345764 seconds

Starting FOLD: 1
Training Accuracy Score -  0.7126031971093375
Training F1 Score -  0.650716459202777
Validation Accuracy Score -  0.8125338142470694
Validation F1 Score -  0.6521785503083553
FOLD 1 completed in 59.523966789245605 seconds

Starting FOLD: 2
Training Accuracy Score -  0.7121097770154374
Training F1 Score -  0.6501999110771939
Validation Accuracy Score -  0.8131650135256988
Validation F1 Score -  0.659955719695241
FOLD 2 completed in 60.315746545791626 seconds

Starting FOLD: 3
Training Accuracy Score -  0.7108747855917668
Training F1 Score -  0.6490762944390378
Validation Accuracy Score -  0.8097385031559964
Validation F1 Score -  0.6507882906007804
FOLD 3 completed in 60.14697051048279 seconds

Starting FOLD: 4
Training Accuracy Sco

In [17]:
# 0.6525343017800713  (NOTE(review): presumably the score of an earlier run, confirm)
# Macro F1 of the argmax of the out-of-fold probabilities against the training target.
print(find_oof_score(oof))

0.6523905036267263


## 🎶 Hyperparameter Tuning Optuna Template Code

In [18]:
def fit_cb(trial, xtr, ytr, xval, yval):
    """
    Fit one CatBoost classifier with hyperparameters sampled from an Optuna trial.

    Parameters
    ----------
    trial : optuna Trial
        Source of the sampled hyperparameter values.
    xtr, ytr : arrays
        Training features and labels (ytr is flattened before fitting).
    xval, yval : arrays
        Validation features and labels.

    Returns
    -------
    model : fitted CatBoostClassifier
    y_val_pred : predicted labels for `xval`
    log : dict with the macro F1 on the training data ("train f1") and on the
        validation data ("valid f1")
    """
    # Sampled values first (in a fixed order), followed by the fixed settings.
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 3, 16),
        learning_rate=trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        # The range 100..100 pins the number of trees to 100 for every trial.
        n_estimators=trial.suggest_int('n_estimators', 100,100),
        max_bin=trial.suggest_int('max_bin', 200, 400),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 1, 300),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        subsample=trial.suggest_float('subsample', 0.1, 0.8),
        random_seed=42,
        task_type='GPU',
        bootstrap_type='Poisson',
        verbose=0,
    )

    model = catboost.CatBoostClassifier(**search_space)
    model.fit(xtr, ytr.reshape(-1,))

    y_val_pred = model.predict(xval)
    y_trn_pred = model.predict(xtr)

    log = {
        "train f1": f1_score(ytr, y_trn_pred, average="macro"),
        "valid f1": f1_score(yval, y_val_pred, average="macro"),
    }

    return model, y_val_pred, log

def objective(trial):
    """
    Optuna objective: macro F1 of the out-of-fold predictions over the five folds.

    For each fold the remaining rows are undersampled (sampling_strategy=0.65,
    seed 42), a model is fitted through `fit_cb` with the trial's hyperparameters,
    and its probabilities for the held-out fold are written into the OOF matrix
    that is finally scored with `find_oof_score`.
    """
    oof = np.zeros((train.shape[0], 2))

    for fold in range(5):
        held_out = train['kfold'] == fold
        fit_rows = train.loc[~held_out, :]
        val_rows = train.loc[held_out, :]

        xval = val_rows[feature_cols].values
        yval = val_rows[target_cols].values

        # Balance only the rows used for fitting
        sampler = RandomUnderSampler(sampling_strategy = 0.65,random_state = 42)
        xtr, ytr = sampler.fit_resample(
            fit_rows[feature_cols].values,
            fit_rows[target_cols].values,
        )

        # Only the fitted model is needed here; labels and log are ignored
        model, _, _ = fit_cb(trial, xtr, ytr, xval, yval)
        oof[held_out, :] += model.predict_proba(xval)

    return find_oof_score(oof)

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

## 📝 Submission

In [19]:
# Hard labels: the class with the larger averaged probability for each test row
final_preds = pred.argmax(axis=1)
pred_csv = pd.DataFrame({'TARGET': final_preds.reshape(-1)})
# Attach the test ids saved right after loading
pred_csv['id'] = id_col
pred_csv

Unnamed: 0,TARGET,id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
110894,0,110894
110895,0,110895
110896,1,110896
110897,0,110897


In [20]:
# Submission file without the DataFrame index
pred_csv.to_csv('submission.csv', index=False)
# Also store the raw out-of-fold and test probability arrays
for filename, array in (('oof_cat.npy', oof), ('pred_cat.npy', pred)):
    np.save(filename, array)