In [1]:
# Importing Relevant Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)# To see all the columns of a dataframe
#pd.set_option('display.max_rows', None)

In [2]:
# Function to reduce the memory usage of various Dataframes
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    For every column:

    1. Object / string columns are converted to ``category``.
    2. NumPy integer columns (signed or unsigned) are cast to the smallest
       signed integer type among int8/int16/int32/int64 whose range contains
       the column's min and max (bounds inclusive).
    3. NumPy floating columns are cast to float16 or float32 when their min
       and max fit (bounds inclusive).
    4. A cast is only applied when it makes the column narrower; columns are
       never widened.
    5. Every other dtype (bool, datetime, timedelta, category, nullable
       extension types, ...) is left untouched.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    float16 keeps only about 3 significant decimal digits, so float values
    are rounded when they are downcast; the range check does not guard
    against that loss of precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not plain NumPy dtypes and are deliberately left alone.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN for min/max: nothing to decide on.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Python ints make the range comparison exact for every width.
            c_min, c_max = int(c_min), int(c_max)
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    # First (smallest) type that fits; only cast if it saves space.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            # All-NaN or infinite ranges fail both comparisons and stay as-is.
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        # bool, datetime64, timedelta64, complex, ...: no safe downcast here.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [3]:
def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame after it has been passed through ``reduce_mem_usage``.
    """
    # `keep_date_col` only matters when `parse_dates` combines several columns
    # into one, which is not requested here, and the keyword is deprecated in
    # recent pandas releases -- so it is dropped.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
# Loading reduced feature training set
X_train = import_data('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

Memory usage of dataframe is 181.24 MB
Memory usage after optimization is: 38.27 MB
Decreased by 78.9%


In [5]:
# Loading reduced feature test set
X_test = import_data('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

Memory usage of dataframe is 60.41 MB
Memory usage after optimization is: 12.76 MB
Decreased by 78.9%


## Model_4: LightGBM classifiers with hyperparameters tuned using Optuna.

In [6]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [7]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [8]:
# Instantiating the Stratified K fold object (5 folds, no shuffling).
# `random_state` only has an effect together with shuffle=True, and newer
# scikit-learn versions raise a ValueError when it is set while shuffle is
# left at its default of False. The folds were never shuffled, so dropping
# the argument keeps exactly the same deterministic splits.
cv_strat = StratifiedKFold(n_splits=5)

In [9]:
# Importing the hyperparameter tuning optimizer Optuna
import optuna

In [10]:
# Importing lightgbm Classifier
from lightgbm import LGBMClassifier

In [11]:
# Candidate class weights offered to the tuner: no weighting, 'balanced',
# and explicit {0: 1.0, 1: w} dictionaries with w running from 12 to 20.
cl_weight = [None, 'balanced'] + [{0: 1.0, 1: w} for w in range(12, 21)]

In [19]:
# Defining the appropriate objective function for the Light GBM classifier

def objective_wrappper_lgbm(X_tr, y_tr, cls=None, cv_strat=None, cl_w=None):
    '''
    Build an Optuna objective that tunes the classifier ``cls`` (LightGBM here)
    on the training set ``X_tr``, ``y_tr``.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded to ``cross_val_score``.
    cls : estimator exposing ``set_params``; it is reconfigured on every trial.
    cv_strat : cross-validation splitter passed as ``cv`` to ``cross_val_score``.
    cl_w : sequence of candidate values for the ``class_weight`` parameter.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean cross-validated ROC AUC of the
        sampled configuration (to be maximised by the study).
    '''

    def objective(trial):
        # `suggest_float` replaces the deprecated `suggest_uniform` /
        # `suggest_loguniform`; `log=True` samples on a logarithmic scale.
        params = {
        'class_weight': trial.suggest_categorical('class_weight', cl_w),
        'n_estimators': trial.suggest_int('n_estimators', 10, 2000),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 250),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'num_leaves': trial.suggest_int('num_leaves', 2, 250),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        }
        # Apply the sampled hyperparameters to the shared estimator before scoring.
        cls.set_params(**params)

        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [20]:
# Defining the evaluation function for study's best parameters
def study_best_score_params(X_tr, y_tr, cls, obj_func, cv_strat, cl_w, n_trials=100):
    '''Run an Optuna study that maximises the objective built by ``obj_func``.

    ``obj_func`` is called with (X_tr, y_tr, cls, cv_strat, cl_w) and must
    return the objective callable; the study is optimised for ``n_trials``
    trials. Returns the tuple (best objective value, best parameters).'''
    tuning_study = optuna.create_study(direction='maximize')
    trial_objective = obj_func(X_tr, y_tr, cls, cv_strat, cl_w)
    tuning_study.optimize(trial_objective, n_trials)
    return (tuning_study.best_value, tuning_study.best_params)

In [21]:
# Initializing the LightGBM Classifier
lgb_c = LGBMClassifier(random_state=42,objective='binary',n_jobs=5)

#### Computing the best hyperparameters for the LightGBM classifier_1 using Reduced Feature Training Set.

In [22]:
# Extracting the best model parameters and best study score.
# NOTE: this is the expensive cell of the section -- the recorded log below shows the
# 50 trials running for more than an hour and a half.
best_study_score,best_study_params = study_best_score_params(X_train.values, y_train.values, lgb_c, objective_wrappper_lgbm, cv_strat,
                                                            cl_weight, n_trials=50)
# X_train.values & y_train.values (plain NumPy arrays) are used because some feature names contain
# JSON special characters, which were causing trouble during code execution.

[32m[I 2020-12-28 00:14:41,858][0m A new study created in memory with name: no-name-bd44e6a1-d6d3-47c9-8d60-d8bcd0ba442f[0m
[32m[I 2020-12-28 00:16:33,594][0m Trial 0 finished with value: 0.7576727155403674 and parameters: {'class_weight': {0: 1.0, 1: 18}, 'n_estimators': 695, 'min_child_samples': 132, 'reg_alpha': 2.3430293701805294, 'reg_lambda': 1.8647198679602717, 'num_leaves': 158, 'max_depth': 13, 'colsample_bytree': 0.7628483925715319, 'learning_rate': 0.004005083001523948}. Best is trial 0 with value: 0.7576727155403674.[0m
[32m[I 2020-12-28 00:16:38,336][0m Trial 1 finished with value: 0.7326449646003608 and parameters: {'class_weight': {0: 1.0, 1: 19}, 'n_estimators': 25, 'min_child_samples': 123, 'reg_alpha': 9.943504572731808, 'reg_lambda': 8.743105942981085, 'num_leaves': 54, 'max_depth': 45, 'colsample_bytree': 0.6497680849804933, 'learning_rate': 0.007890445950599945}. Best is trial 0 with value: 0.7576727155403674.[0m
[32m[I 2020-12-28 00:16:56,709][0m Trial 

[32m[I 2020-12-28 01:00:58,179][0m Trial 20 finished with value: 0.7649470063402004 and parameters: {'class_weight': None, 'n_estimators': 1194, 'min_child_samples': 182, 'reg_alpha': 3.204521658106843, 'reg_lambda': 5.784251801656469, 'num_leaves': 207, 'max_depth': 59, 'colsample_bytree': 0.990479769723445, 'learning_rate': 0.005999824619983893}. Best is trial 14 with value: 0.7671802717146841.[0m
[32m[I 2020-12-28 01:04:29,644][0m Trial 21 finished with value: 0.7675424057666671 and parameters: {'class_weight': {0: 1.0, 1: 15}, 'n_estimators': 1333, 'min_child_samples': 209, 'reg_alpha': 5.371928310382926, 'reg_lambda': 9.855672244353572, 'num_leaves': 228, 'max_depth': 85, 'colsample_bytree': 0.6448919770743746, 'learning_rate': 0.007926009764444197}. Best is trial 21 with value: 0.7675424057666671.[0m
[32m[I 2020-12-28 01:07:52,113][0m Trial 22 finished with value: 0.7569794800038225 and parameters: {'class_weight': 'balanced', 'n_estimators': 1431, 'min_child_samples': 15

[32m[I 2020-12-28 01:50:52,858][0m Trial 40 finished with value: 0.7466885415930347 and parameters: {'class_weight': {0: 1.0, 1: 19}, 'n_estimators': 484, 'min_child_samples': 212, 'reg_alpha': 4.182188477397222, 'reg_lambda': 5.136740398225163, 'num_leaves': 59, 'max_depth': 48, 'colsample_bytree': 0.7750980881201373, 'learning_rate': 0.003258359145538698}. Best is trial 21 with value: 0.7675424057666671.[0m
[32m[I 2020-12-28 01:53:24,026][0m Trial 41 finished with value: 0.7673566047915893 and parameters: {'class_weight': {0: 1.0, 1: 16}, 'n_estimators': 1171, 'min_child_samples': 124, 'reg_alpha': 4.881840143403759, 'reg_lambda': 5.112668178931702, 'num_leaves': 155, 'max_depth': 54, 'colsample_bytree': 0.6493329332052672, 'learning_rate': 0.008022585467796278}. Best is trial 21 with value: 0.7675424057666671.[0m
[32m[I 2020-12-28 01:55:30,135][0m Trial 42 finished with value: 0.7657546068639969 and parameters: {'class_weight': {0: 1.0, 1: 16}, 'n_estimators': 1163, 'min_chi

In [23]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7675424057666671


In [24]:
print('The best study parameters for the classifier are: ',best_study_params)

('The best study parameters for the classifier are: ', {'class_weight': {0: 1.0, 1: 15}, 'n_estimators': 1333, 'min_child_samples': 209, 'reg_alpha': 5.371928310382926, 'reg_lambda': 9.855672244353572, 'num_leaves': 228, 'max_depth': 85, 'colsample_bytree': 0.6448919770743746, 'learning_rate': 0.007926009764444197})


In [25]:
# Obtaining the best tuned LightGbm model by setting best study parameters.
lgb_c = lgb_c.set_params(**best_study_params)

In [26]:
# fitting the best tuned lightgbm model on the Reduced feature training set.
lgb_c.fit(X_train.values, y_train.values)

LGBMClassifier(class_weight={0: 1.0, 1: 15},
               colsample_bytree=0.6448919770743746,
               learning_rate=0.007926009764444197, max_depth=85,
               min_child_samples=209, n_estimators=1333, n_jobs=5,
               num_leaves=228, objective='binary', random_state=42,
               reg_alpha=5.371928310382926, reg_lambda=9.855672244353572)

In [27]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of the fitted classifier ``cls`` on (X, y).

        The score is computed from the second column of ``cls.predict_proba(X)``.
        f_set : String: feature set label, e.g. 'full feature', 'Reduced feature'
        t_set: String: data split label, e.g. 'training', 'test'
        model_name: String: name of the model used in the printed message '''

    second_col_proba = cls.predict_proba(X)[:, 1]
    message = 'The roc_auc_score for the {} {} set using the best {} is '.format(f_set, t_set, model_name)
    print(message, roc_auc_score(y, second_col_proba))

In [30]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train.values, y_train, lgb_c, 'Reduced feature', 'training', 'Light Gbm')

The roc_auc_score for the Reduced feature training set using the best Light Gbm is  0.9516343112915224


In [32]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test.values, y_test, lgb_c, 'Reduced feature', 'test','Light Gbm')

The roc_auc_score for the Reduced feature test set using the best Light Gbm is  0.7725786275246422


### Calculating R_R ratio for the Light GBM Classifier_1.

In [33]:
# Computing the CV scores using sklearn's cross_val_score
score_Light_Gbm_1 = cross_val_score(lgb_c, X_train.values, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [34]:
print('The reward associated with the Light GMB Classifier_1 using roc_auc metric is: ',np.mean(score_Light_Gbm_1))

The reward associated with the Light GMB Classifier_1 using roc_auc metric is:  0.7675424057666671


In [35]:
print('The risk associated with the Light GMB Classifier_1 using roc_auc metric is: ',np.std(score_Light_Gbm_1))

The risk associated with the Light GMB Classifier_1 using roc_auc metric is:  0.004609725931915861


In [36]:
R_R_Ratio_Light_Gbm_1 = np.mean(score_Light_Gbm_1)/np.std(score_Light_Gbm_1)

In [37]:
print('The reward risk ratio for the best Light Gbm Classifier using roc_auc metric is: ',R_R_Ratio_Light_Gbm_1)

The reward risk ratio for the best Light Gbm Classifier using roc_auc metric is:  166.50499771635376


In [40]:
print('5 fold CV roc_auc scores for the light_gbm classifier are: ',score_Light_Gbm_1)

5 fold CV roc_auc scores for the light_gbm classifier are:  [0.76296609 0.76266868 0.77517289 0.7693666  0.76753777]


In [39]:
# Saving the Reduced feature set best Light Gbm Classifier_1
import joblib
joblib.dump(lgb_c,'Light_Gbm_1.joblib')

['Light_Gbm_1.joblib']

#### R_R Ratio for the Light Gbm classifier_1 using roc_auc metric is:  166.50499771635376

## Observation(s):
### 1) Of all the models fitted till now, lgb classifier_1 has the best test set roc_auc score as well as the R_R ratio. 
### 2) However, lgb classifier_1 has clearly overfitted the training set, as there is a large difference between the training set and test set roc_auc scores. Such overfitting is typical of most boosting classifiers.

### Light Gbm Classifier_2 : A more regularized version of the previous light gbm model. 

In [15]:
# Defining the appropriate objective function for the more regularized Light GBM classifier

def objective_wrappper_lgbm(X_tr, y_tr, cls=None, cv_strat=None, cl_w=None):
    '''
    Build an Optuna objective that tunes the classifier ``cls`` (LightGBM here)
    on the training set ``X_tr``, ``y_tr`` over a narrower, more heavily
    regularized search space.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, forwarded to ``cross_val_score``.
    cls : estimator exposing ``set_params``; it is reconfigured on every trial.
    cv_strat : cross-validation splitter passed as ``cv`` to ``cross_val_score``.
    cl_w : sequence of candidate values for the ``class_weight`` parameter.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean cross-validated ROC AUC of the
        sampled configuration (to be maximised by the study).
    '''

    def objective(trial):
        # `suggest_float` replaces the deprecated `suggest_uniform` /
        # `suggest_loguniform`; `log=True` samples on a logarithmic scale.
        params = {
        'class_weight': trial.suggest_categorical('class_weight', cl_w),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1200),# More Regularization
        'min_child_samples': trial.suggest_int('min_child_samples', 150, 250),# More Regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 10, 15), # More Regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 10, 15), # More Regularization
        'num_leaves': trial.suggest_int('num_leaves', 2, 175),# More Regularization
        'max_depth': trial.suggest_int('max_depth', 2, 75), # More Regularization
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        }
        # Apply the sampled hyperparameters to the shared estimator before scoring.
        cls.set_params(**params)

        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [16]:
# Defining the evaluation function for study's best parameters
def study_best_score_params(X_tr, y_tr, cls, obj_func, cv_strat, cl_w, n_trials=100):
    '''Run an Optuna study that maximises the objective built by ``obj_func``.

    ``obj_func`` is called with (X_tr, y_tr, cls, cv_strat, cl_w) and must
    return the objective callable; the study is optimised for ``n_trials``
    trials. Returns the tuple (best objective value, best parameters).'''
    tuning_study = optuna.create_study(direction='maximize')
    trial_objective = obj_func(X_tr, y_tr, cls, cv_strat, cl_w)
    tuning_study.optimize(trial_objective, n_trials)
    return (tuning_study.best_value, tuning_study.best_params)

In [17]:
# Initializing the LightGBM Classifier
lgb_c = LGBMClassifier(random_state=42, objective='binary', n_jobs=5)

#### Computing the best hyperparameters for the Light GBM classifier_2 using Reduced Feature Training Set.

In [18]:
# Extracting the best model parameters and best study score.
# NOTE: this is the expensive cell of the section -- the recorded log below shows the
# 50 trials running for more than 40 minutes.
best_study_score,best_study_params = study_best_score_params(X_train.values, y_train.values, lgb_c, objective_wrappper_lgbm, cv_strat,
                                                            cl_weight, n_trials=50)
# X_train.values & y_train.values (plain NumPy arrays) are used because some feature names contain
# JSON special characters, which were causing trouble during execution.

[32m[I 2020-12-29 00:55:17,563][0m A new study created in memory with name: no-name-c3fbece1-4af2-4fc5-90f5-f1df582e037e[0m
[32m[I 2020-12-29 00:56:45,218][0m Trial 0 finished with value: 0.7657969584471875 and parameters: {'class_weight': {0: 1.0, 1: 15}, 'n_estimators': 742, 'min_child_samples': 180, 'reg_alpha': 13.215252563900878, 'reg_lambda': 11.805865838506026, 'num_leaves': 92, 'max_depth': 50, 'colsample_bytree': 0.9574888587995646, 'learning_rate': 0.022330012555304646}. Best is trial 0 with value: 0.7657969584471875.[0m
[32m[I 2020-12-29 00:56:53,747][0m Trial 1 finished with value: 0.7428068313833717 and parameters: {'class_weight': 'balanced', 'n_estimators': 45, 'min_child_samples': 209, 'reg_alpha': 13.137942508776664, 'reg_lambda': 13.082211190821157, 'num_leaves': 129, 'max_depth': 11, 'colsample_bytree': 0.7757218805719028, 'learning_rate': 0.016394553189848563}. Best is trial 0 with value: 0.7657969584471875.[0m
[32m[I 2020-12-29 00:59:01,591][0m Trial 2 f

[32m[I 2020-12-29 01:16:45,502][0m Trial 20 finished with value: 0.7658235488797565 and parameters: {'class_weight': {0: 1.0, 1: 14}, 'n_estimators': 1159, 'min_child_samples': 157, 'reg_alpha': 11.413233642148334, 'reg_lambda': 12.57576219574895, 'num_leaves': 33, 'max_depth': 22, 'colsample_bytree': 0.7415456532250405, 'learning_rate': 0.030628097188775343}. Best is trial 3 with value: 0.7678642284061652.[0m
[32m[I 2020-12-29 01:17:19,584][0m Trial 21 finished with value: 0.7666741192614777 and parameters: {'class_weight': {0: 1.0, 1: 12}, 'n_estimators': 887, 'min_child_samples': 177, 'reg_alpha': 12.605330250526153, 'reg_lambda': 11.553929031524818, 'num_leaves': 14, 'max_depth': 5, 'colsample_bytree': 0.60138059303007, 'learning_rate': 0.06410606944257212}. Best is trial 3 with value: 0.7678642284061652.[0m
[32m[I 2020-12-29 01:17:48,776][0m Trial 22 finished with value: 0.766583857283782 and parameters: {'class_weight': {0: 1.0, 1: 12}, 'n_estimators': 1052, 'min_child_sa

[32m[I 2020-12-29 01:36:36,962][0m Trial 40 finished with value: 0.7669913489621383 and parameters: {'class_weight': 'balanced', 'n_estimators': 331, 'min_child_samples': 158, 'reg_alpha': 10.775108042042056, 'reg_lambda': 14.477806057132073, 'num_leaves': 74, 'max_depth': 25, 'colsample_bytree': 0.6232371636433602, 'learning_rate': 0.03906752143790038}. Best is trial 34 with value: 0.7683459294690801.[0m
[32m[I 2020-12-29 01:37:27,917][0m Trial 41 finished with value: 0.7674698356792335 and parameters: {'class_weight': {0: 1.0, 1: 16}, 'n_estimators': 649, 'min_child_samples': 154, 'reg_alpha': 13.01209097184585, 'reg_lambda': 13.691399894129871, 'num_leaves': 57, 'max_depth': 34, 'colsample_bytree': 0.6429349602276662, 'learning_rate': 0.018594156418477153}. Best is trial 34 with value: 0.7683459294690801.[0m
[32m[I 2020-12-29 01:38:16,903][0m Trial 42 finished with value: 0.7672295363591557 and parameters: {'class_weight': {0: 1.0, 1: 17}, 'n_estimators': 664, 'min_child_sam

In [19]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7683459294690801


In [20]:
print('The best study parameters for the classifier are: ',best_study_params)

The best study parameters for the classifier are:  {'class_weight': 'balanced', 'n_estimators': 781, 'min_child_samples': 167, 'reg_alpha': 11.766191043484469, 'reg_lambda': 14.206551461816186, 'num_leaves': 54, 'max_depth': 16, 'colsample_bytree': 0.6296980381635193, 'learning_rate': 0.016921731098527678}


In [21]:
# Obtaining the best tuned & more regularized LightGbm model by setting best study parameters.
lgb_c = lgb_c.set_params(**best_study_params)

In [22]:
# fitting the best tuned  & more regularized lightgbm model on the Reduced feature training set
lgb_c.fit(X_train.values, y_train.values)

LGBMClassifier(class_weight='balanced', colsample_bytree=0.6296980381635193,
               learning_rate=0.016921731098527678, max_depth=16,
               min_child_samples=167, n_estimators=781, n_jobs=5, num_leaves=54,
               objective='binary', random_state=42,
               reg_alpha=11.766191043484469, reg_lambda=14.206551461816186)

In [23]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of the fitted classifier ``cls`` on (X, y).

        The score is computed from the second column of ``cls.predict_proba(X)``.
        f_set : String: feature set label, e.g. 'full feature', 'Reduced feature'
        t_set: String: data split label, e.g. 'training', 'test'
        model_name: String: name of the model used in the printed message '''

    second_col_proba = cls.predict_proba(X)[:, 1]
    message = 'The roc_auc_score for the {} {} set using the best {} is '.format(f_set, t_set, model_name)
    print(message, roc_auc_score(y, second_col_proba))

In [26]:
# Calculating the Reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train.values, y_train, lgb_c, 'Reduced feature', 'training', 'updated Light Gbm')

The roc_auc_score for the Reduced feature training set using the best updated Light Gbm is  0.8443935385323283


In [27]:
# Calculating the Reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test.values, y_test, lgb_c, 'Reduced feature', 'test','updated Light Gbm')

The roc_auc_score for the Reduced feature test set using the best updated Light Gbm is  0.7721968636105587


### Calculating R_R ratio for the Light GBM Classifier_2.

In [28]:
# Computing the CV scores using sklearn's cross_val_score for more regularized light gbm classifier_2
score_Light_Gbm_2 = cross_val_score(lgb_c, X_train.values, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [30]:
print('The reward associated with the Light GMB Classifier_2 using roc_auc metric is: ',np.mean(score_Light_Gbm_2))

The reward associated with the Light GMB Classifier_2 using roc_auc metric is:  0.7683459294690801


In [31]:
print('The risk associated with the Light GMB Classifier_2 using roc_auc metric is: ',np.std(score_Light_Gbm_2))

The risk associated with the Light GMB Classifier_2 using roc_auc metric is:  0.00459052868738408


In [32]:
R_R_Ratio_Light_Gbm_2 = np.mean(score_Light_Gbm_2)/np.std(score_Light_Gbm_2)

In [35]:
print('The reward risk ratio for the best Light Gbm Classifier_2 using roc_auc metric is: ',R_R_Ratio_Light_Gbm_2)

The reward risk ratio for the best Light Gbm Classifier_2 using roc_auc metric is:  167.37634851965672


In [36]:
print('5 fold CV roc_auc scores for the light_gbm classifier_2 are: ',score_Light_Gbm_2)

5 fold CV roc_auc scores for the light_gbm classifier_2 are:  [0.76351166 0.76321873 0.7754102  0.77069052 0.76889855]


In [37]:
# Saving the Reduced feature set best updated regularized  Light Gbm Classifier 
import joblib
joblib.dump(lgb_c,'Light_Gbm_2.joblib')

['Light_Gbm_2.joblib']

#### R_R Ratio for the more regularized Light Gbm classifier_2 using roc_auc metric is:  167.37634851965672

## Observations:
### 1) By making use of more regularization, Light Gbm classifier_2 has been able to substantially reduce overfitting as compared to that of Light Gbm classifier_1, with the test set roc_auc scores being almost equal for both of them.
### 2) Further R_R ratio for the Light Gbm classifier_2 is marginally higher than that of Light Gbm classifier_1.
### 3) Thus Light Gbm classifier_2 is preferred over Light Gbm classifier_1: its test set roc_auc is essentially the same (0.7722 vs 0.7726, i.e. marginally lower), it overfits far less, and it has the best R_R ratio of all the models tested till now for this dataset.

### R_R Ratio for the best  Light Gbm classifier using roc_auc metric is:  167.37634851965672 , corresponding to more regularized Light Gbm classifier_2.