In [1]:
# Importing Required Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)# To see all the columns of a dataframe
#pd.set_option('display.max_rows', None)

In [2]:
# Function to reduce the memory usage of various Dataframes
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Per column:

    * ``object`` columns are converted to ``category``.
    * NumPy signed / unsigned integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds are inclusive).
    * NumPy float columns are cast to float16 or float32 when the column's
      min and max fit that type's finite range, otherwise to float64.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes) is left untouched, because it has no meaningful
      numeric range to downcast against.

    The memory usage before and after, and the relative saving, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types ordered from narrowest to widest, keyed by dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    # NOTE(review): float16 keeps only about 3 significant decimal digits, so
    # values are rounded even when they are within range. Kept for
    # compatibility with the existing pipeline.
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail every comparison and fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [3]:
def import_data(file):
    """Read a CSV file into a DataFrame and downcast it with reduce_mem_usage.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied.
    """
    # keep_date_col was dropped: it only matters when parse_dates combines
    # several columns into one (not the case with parse_dates=True), and the
    # keyword is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
# Loading reduced feature training set.
# Features go through import_data (CSV read + reduce_mem_usage downcasting);
# the labels are read as-is with pd.read_csv, without any downcasting.
X_train = import_data('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

Memory usage of dataframe is 181.24 MB
Memory usage after optimization is: 38.27 MB
Decreased by 78.9%


In [5]:
# Loading reduced feature test set.
# Features go through import_data (CSV read + reduce_mem_usage downcasting);
# the labels are read as-is with pd.read_csv, without any downcasting.
X_test = import_data('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

Memory usage of dataframe is 60.41 MB
Memory usage after optimization is: 12.76 MB
Decreased by 78.9%


## Model_5: XGboost Classifier with Tuned Hyperparameters using Optuna.

In [6]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [7]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [8]:
# Instantiating the Stratified K fold object (5 folds).
# random_state is intentionally not passed: with the default shuffle=False the
# folds are already deterministic, and scikit-learn >= 0.24 raises a ValueError
# when random_state is set without shuffle=True.
cv_strat = StratifiedKFold(n_splits=5)

In [9]:
# Importing the hyperparameter tuning optimizer Optuna
import optuna

In [10]:
# Importing Xgboost Classifier
import xgboost as xgb

In [11]:
# Computing the ratio of -ve (0) to +ve (1) classes, used as a scalar
# scale_pos_weight inside xgboost.
# The labels are flattened to a 1-D array first so both counts are plain
# scalars; calling float() on a one-element Series is deprecated in recent pandas.
# NOTE(review): assumes y_train holds a single label column - confirm.
y_train_flat = np.asarray(y_train).ravel()
class_weight = float((y_train_flat == 0).sum() / (y_train_flat == 1).sum())

In [12]:
# Defining the appropriate objective function for the XGboost classifier

def objective_wrappper_xgb(X_tr, y_tr, cls=None, cv_strat=None, cl_w=None):
    '''
    Builds an Optuna objective that tunes the classifier cls (Xgboost here)
    on the training set X_tr, y_tr.

    X_tr, y_tr : training features and labels passed to cross_val_score.
    cls        : estimator exposing set_params; required (ValueError if None).
    cv_strat   : cross-validation splitter / spec forwarded as cv=.
    cl_w       : optional class weight; when given it is applied as
                 scale_pos_weight on every trial, otherwise the value already
                 configured on cls is left untouched.

    Returns the objective(trial) callable. Each call samples a set of
    hyperparameters, applies it to cls in place and returns the mean
    cross-validated roc_auc score (5 parallel jobs).
    '''
    if cls is None:
        # Fail fast instead of erroring inside the first trial.
        raise ValueError('cls must be a classifier instance, got None')

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; log=True samples on a logarithmic scale.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 400),
            'gamma': trial.suggest_int('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'max_delta_step': trial.suggest_int('max_delta_step', 1, 10),
            'max_depth': trial.suggest_int('max_depth', 2, 75),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        }
        if cl_w is not None:
            params['scale_pos_weight'] = cl_w
        cls.set_params(**params)  # Configure the model with the sampled parameters

        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective


In [13]:
# Defining the evaluation function for study's best parameters
def study_best_score_params(X_tr, y_tr, cls, obj_func, cv_strat, cl_w=None, n_trials=100, seed=None):
    ''' Runs an Optuna study that maximizes the objective built by obj_func
    and returns the study's best score and best classifier parameters.

    X_tr, y_tr : training features and labels forwarded to obj_func.
    cls        : classifier whose hyperparameters are tuned.
    obj_func   : factory called as obj_func(X_tr, y_tr, cls, cv_strat, cl_w)
                 that returns the objective(trial) callable.
    cv_strat   : cross-validation object forwarded to obj_func.
    cl_w       : optional class weight forwarded to obj_func.
    n_trials   : number of trials to run.
    seed       : optional seed for the TPE sampler, making the sequence of
                 suggested parameters reproducible; None (default) keeps the
                 unseeded behaviour.

    Returns a tuple (best_score, best_params).
    '''
    # TPESampler is Optuna's default sampler for single-objective studies;
    # it is created explicitly only so that a seed can be supplied.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat, cl_w), n_trials=n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score, best_params)

In [14]:
# Base xgboost classifier: binary logistic objective, with the positive class
# weighted by the negative/positive ratio computed above.
xgb_s = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=class_weight,
    n_jobs=5,
    random_state=42,
)

#### Computing the best hyperparameters for the  XGboost Classifier using Reduced feature Training Set.

In [15]:
# Run a 50-trial study on the training set and unpack its best score and parameters
best_study_score, best_study_params = study_best_score_params(
    X_train.values,
    y_train.values,
    xgb_s,
    objective_wrappper_xgb,
    cv_strat=cv_strat,
    n_trials=50,
)

[32m[I 2020-12-29 19:52:27,775][0m A new study created in memory with name: no-name-89669194-2e29-4139-9f9f-fba2759a1009[0m
[32m[I 2020-12-29 19:55:04,573][0m Trial 0 finished with value: 0.7399930764722756 and parameters: {'n_estimators': 60, 'gamma': 4, 'reg_alpha': 6.461297110356901, 'reg_lambda': 3.0651989087218023, 'max_delta_step': 10, 'max_depth': 21, 'colsample_bytree': 0.8139076755003545, 'learning_rate': 0.009908592475110082}. Best is trial 0 with value: 0.7399930764722756.[0m
[32m[I 2020-12-29 20:00:51,137][0m Trial 1 finished with value: 0.7427592646621521 and parameters: {'n_estimators': 78, 'gamma': 0, 'reg_alpha': 2.936107049035633, 'reg_lambda': 7.615633231837846, 'max_delta_step': 4, 'max_depth': 48, 'colsample_bytree': 0.9706296338541937, 'learning_rate': 0.03717515418317997}. Best is trial 1 with value: 0.7427592646621521.[0m
[32m[I 2020-12-29 20:25:00,319][0m Trial 2 finished with value: 0.7132375519974194 and parameters: {'n_estimators': 318, 'gamma': 0,

[32m[I 2020-12-30 00:15:23,014][0m Trial 22 finished with value: 0.7613284330679144 and parameters: {'n_estimators': 238, 'gamma': 3, 'reg_alpha': 9.899992443768694, 'reg_lambda': 8.952592161647805, 'max_delta_step': 3, 'max_depth': 3, 'colsample_bytree': 0.7048275001505966, 'learning_rate': 0.05718306911861545}. Best is trial 20 with value: 0.7653360616080642.[0m
[32m[I 2020-12-30 00:19:19,561][0m Trial 23 finished with value: 0.7547546814816826 and parameters: {'n_estimators': 286, 'gamma': 2, 'reg_alpha': 8.897534428056064, 'reg_lambda': 9.096711364349284, 'max_delta_step': 3, 'max_depth': 11, 'colsample_bytree': 0.6478299154867042, 'learning_rate': 0.05432015025761056}. Best is trial 20 with value: 0.7653360616080642.[0m
[32m[I 2020-12-30 00:22:47,998][0m Trial 24 finished with value: 0.7522154472481712 and parameters: {'n_estimators': 179, 'gamma': 2, 'reg_alpha': 8.058538512617575, 'reg_lambda': 7.958900607838448, 'max_delta_step': 4, 'max_depth': 13, 'colsample_bytree': 

[32m[I 2020-12-30 01:50:33,853][0m Trial 45 finished with value: 0.7459880495760225 and parameters: {'n_estimators': 124, 'gamma': 4, 'reg_alpha': 9.421698227088745, 'reg_lambda': 5.511374533273388, 'max_delta_step': 4, 'max_depth': 14, 'colsample_bytree': 0.6877849347781635, 'learning_rate': 0.0026112989981588895}. Best is trial 20 with value: 0.7653360616080642.[0m
[32m[I 2020-12-30 01:51:16,297][0m Trial 46 finished with value: 0.7538414106811981 and parameters: {'n_estimators': 94, 'gamma': 4, 'reg_alpha': 7.840314418561915, 'reg_lambda': 6.612710993880608, 'max_delta_step': 5, 'max_depth': 6, 'colsample_bytree': 0.7919852239984206, 'learning_rate': 0.03486241525285229}. Best is trial 20 with value: 0.7653360616080642.[0m
[32m[I 2020-12-30 01:51:42,731][0m Trial 47 finished with value: 0.7530426832305496 and parameters: {'n_estimators': 54, 'gamma': 5, 'reg_alpha': 8.556811625152534, 'reg_lambda': 4.324513727303082, 'max_delta_step': 4, 'max_depth': 7, 'colsample_bytree': 0

In [16]:
# Best value reached by the study, i.e. the highest mean cross-validated roc_auc.
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7653360616080642


In [17]:
# Hyperparameters of the trial that produced the best study value.
print('The best study parameters for the classifier are: ',best_study_params)

The best study parameters for the classifier are:  {'n_estimators': 236, 'gamma': 2, 'reg_alpha': 9.665486607047548, 'reg_lambda': 9.147237022712792, 'max_delta_step': 3, 'max_depth': 6, 'colsample_bytree': 0.6499073257076501, 'learning_rate': 0.05820583531821543}


In [18]:
# Obtaining the best Xgboost model by applying the study's best parameters
# to the existing classifier instance.
xgb_s = xgb_s.set_params(**best_study_params)

In [19]:
# Fitting the best Xgboost model on the whole Reduced feature training set
# (features and labels are passed as NumPy arrays via .values).
xgb_s.fit(X_train.values, y_train.values)

XGBClassifier(colsample_bytree=0.6499073257076501, gamma=2,
              learning_rate=0.05820583531821543, max_delta_step=3, max_depth=6,
              n_estimators=236, n_jobs=5, random_state=42,
              reg_alpha=9.665486607047548, reg_lambda=9.147237022712792,
              scale_pos_weight=11.386970299156776)

In [20]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of an already fitted classifier on (X, y).

        X, y: features and true labels to score against
        cls: fitted classifier exposing predict_proba
        f_set : String: feature set label, e.g. 'full feature', 'Reduced feature'
        t_set: String: data split label, e.g. 'training', 'test'
        model_name: String: name of the model used in the printed message

        Nothing is returned; the score is only printed. '''

    # Column index 1 of predict_proba is used as the score for roc_auc_score.
    positive_scores = cls.predict_proba(X)[:, 1]
    message = 'The roc_auc_score for the {} {} set using the best {} classifier is '.format(f_set, t_set, model_name)
    auc_value = roc_auc_score(y, positive_scores)
    print(message, auc_value)

In [21]:
# Calculating the Reduced feature training set roc_auc score with the fitted
# best-parameter classifier (scored on the same data it was trained on).
cal_roc_auc(X_train.values, y_train.values, xgb_s, 'Reduced feature', 'training', 'Xgboost')

The roc_auc_score for the Reduced feature training set using the best Xgboost classifier is  0.8401392081413007


In [22]:
# Calculating the Reduced feature test set roc_auc score with the fitted
# best-parameter classifier.
cal_roc_auc(X_test.values, y_test.values, xgb_s, 'Reduced feature', 'test', 'Xgboost')

The roc_auc_score for the Reduced feature test set using the best Xgboost classifier is  0.7691327155992314


### Calculating R_R ratio for best Xgboost Classifier.

In [23]:
# Computing the per-fold roc_auc CV scores on the training set using sklearn's
# cross_val_score, with the same splitter (cv_strat) that was used for tuning.
score_xgb = cross_val_score(xgb_s, X_train.values, y_train.values, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [24]:
# "Reward" is defined here as the mean of the per-fold roc_auc scores.
print('The reward associated with the best Xgboost Classifier using roc_auc metric is: ',np.mean(score_xgb))

The reward associated with the best Xgboost Classifier using roc_auc metric is:  0.7653360616080642


In [25]:
# "Risk" is defined here as the standard deviation of the per-fold roc_auc scores.
print('The risk associated with the best Xgboost Classifier using roc_auc metric is: ',np.std(score_xgb))

The risk associated with the best Xgboost Classifier using roc_auc metric is:  0.004942475986184466


In [26]:
# Reward/risk ratio: mean of the CV scores divided by their standard deviation.
R_R_Ratio_Xgboost = np.mean(score_xgb)/np.std(score_xgb)

In [27]:
# Report the reward/risk ratio computed from the CV scores.
print('The reward risk ratio for the best Xgboost Classifier using roc_auc metric is: ',R_R_Ratio_Xgboost)

The reward risk ratio for the best Xgboost Classifier using roc_auc metric is:  154.84871626030798


In [28]:
# Saving the Reduced feature set best Xgboost Classifier to 'XGboost.joblib'
# in the current working directory.
import joblib
joblib.dump(xgb_s, 'XGboost.joblib')

['XGboost.joblib']

#### R_R Ratio for the Xgboost classifier using roc_auc metric is:  154.84871626030798

## Observations:
### 1) The XGBoost classifier's test set roc_auc score & R_R ratio are slightly lower than those of Light Gbm classifier_2. A more extensive hyperparameter search might result in a better score for the XGBoost classifier.
### 2) The xgboost classifier is clearly overfitting the dataset, which is evident looking at the difference between training set & test set roc_auc scores.
### 3) Tuning XGBoost requires more computational resources, which may be better done in the cloud than on a PC. Since we are already getting good performance using the Light Gbm classifier, we won't tune the XGBoost classifier any further for this dataset.

### R_R Ratio for the best Tree based boosting classifier using roc_auc score is:  167.37634851965672,  corresponding to Tuned more regularized Light Gbm classifier_2.

### Taking everything into consideration, such as  Overfitting, test set roc_auc score, R_R ratios & Computational costs, the best tree based boosting classifier is Light Gbm classifier_2.