In [1]:
# Importing necessary packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)# To see all the columns of a dataframe
#pd.set_option('display.max_rows', None)

In [2]:
# Function to reduce the memory usage of various Dataframes
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return the same frame.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before/after and the percentage saved are printed.

    Per-column rules:
        * object / pandas string columns      -> ``category``
        * signed integer columns (NumPy)      -> smallest of int8/16/32/64 whose
          range contains [min, max] (bounds are inclusive)
        * unsigned integer columns (NumPy)    -> smallest of uint8/16/32/64
        * float columns (NumPy)               -> float16, float32 or float64,
          chosen from the value range only
        * everything else (bool, datetime, timedelta, categorical, nullable
          extension dtypes, ...) is left untouched, because min/max range
          checks are either meaningless or raise for those dtypes.

    NOTE: the float rule looks only at the range, not at precision. float16
    keeps roughly three significant decimal digits, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes ordered from narrowest to widest.
    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are range-checked; bool,
        # datetime-like and extension dtypes are skipped instead of being
        # coerced to float or raising inside the comparisons below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the type's limit still fits.
                # For an empty column min/max are NaN, every comparison is False
                # and the column keeps its dtype.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [3]:
def import_data(file):
    """Read a CSV file into a DataFrame and downcast its columns to save memory.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied.
    """
    # ``keep_date_col`` was dropped from the call: it only has an effect when
    # ``parse_dates`` combines several columns (not the case with
    # ``parse_dates=True``) and it is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
# Loading the reduced-feature training set.
# X goes through import_data (read + dtype downcasting); y is read as-is with pd.read_csv.
X_train = import_data('X_train_final.csv')
y_train = pd.read_csv('y_train.final.csv')

Memory usage of dataframe is 181.24 MB
Memory usage after optimization is: 38.27 MB
Decreased by 78.9%


In [5]:
# Loading the reduced-feature test set.
# X goes through import_data (read + dtype downcasting); y is read as-is with pd.read_csv.
X_test = import_data('X_test_final.csv')
y_test = pd.read_csv('y_test.final.csv')

Memory usage of dataframe is 60.41 MB
Memory usage after optimization is: 12.76 MB
Decreased by 78.9%


In [15]:
# Re-computing the ratio of class 0 to class 1 entries in the response variable y_train.
# y_train is a DataFrame; the label is taken from its first column
# (assumes the label is the only column - TODO confirm against the CSV).
# Counting once on a Series yields scalar counts, which avoids calling float()
# on a one-element array (deprecated in recent NumPy releases).
class_counts_train = y_train.iloc[:, 0].value_counts()
r_train = float(class_counts_train.loc[0] / class_counts_train.loc[1])
print('The ratio of class 0 to class 1 entries in y_train is: ',r_train)

The ratio of class 0 to class 1 entries in y_train is:  11.386970299156776


In [14]:
# Re-computing the ratio of class 0 to class 1 entries in the response variable y_test.
# y_test is a DataFrame; the label is taken from its first column
# (assumes the label is the only column - TODO confirm against the CSV).
# Counting once on a Series yields scalar counts, which avoids calling float()
# on a one-element array (deprecated in recent NumPy releases).
class_counts_test = y_test.iloc[:, 0].value_counts()
r_test = float(class_counts_test.loc[0] / class_counts_test.loc[1])
print('The ratio of class 0 to class 1 entries in y_test is: ',r_test)

The ratio of class 0 to class 1 entries in y_test is:  11.387689332903642


#### As expected, both the training and test sets have the same ratio of class 0 to class 1 entries.

## Baseline Model: Dummy Classifier with parameter (Strategy=“most_frequent”)

In [16]:
# Importing Dummy classifier from Scikit Learn
from sklearn.dummy import DummyClassifier

In [17]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [18]:
# Instantiating the Dummy classifier object
bm = DummyClassifier(random_state=42,strategy='most_frequent')

#### Fitting dummy classifier (baseline model) to reduced feature training set and computing resulting  training & test set roc_auc_scores.

In [19]:
# Fitting the simple dummy classifier to reduced feature Training set.
bm.fit(X_train,y_train)

DummyClassifier(random_state=42, strategy='most_frequent')

In [20]:
# Accuracy for the Reduced feature Training set with baseline model
bm.score(X_train,y_train)

0.9192700090620163

In [21]:
# Accuracy for the Reduced feature Test set with baseline model
bm.score(X_test,y_test)

0.9192746949712531

In [22]:
# Predicting the probabilities of y=1 for the reduced feature training set
y_pred_train = bm.predict_proba(X_train)

In [24]:
print('The ROC AUC for the Reduced_feature Training set is:',roc_auc_score(y_train,y_pred_train[:,1]))

The ROC AUC for the Reduced_feature Training set is: 0.5


In [25]:
# Predicting the probabilities of y=1 for the Reduced feature test set
y_pred = bm.predict_proba(X_test)

In [26]:
print('The ROC AUC for the Reduced_feature test set is:',roc_auc_score(y_test,y_pred[:,1]))

The ROC AUC for the Reduced_feature test set is: 0.5


### Observations:
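### 1) The roc_auc scores of the training and test sets are equal (0.5), but are substantially lower than the corresponding accuracy scores (about 0.919), which is typically the case with imbalanced classes: always predicting the majority class gives high accuracy but no ability to rank positives above negatives. Thus the baseline model is clearly underfitting the dataset.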
### 2) The roc_auc score of the baseline model is 0.5, the same as that of random guessing. Thus we clearly need more powerful, more complex models.

## Model_1: Logistic Regression with Hyperparameters Tuned using Optuna.

In [27]:
# Importing  hyperparamater tuning optimizer optuna
import optuna

In [28]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [31]:
# Instantiating the Stratified K fold object.
# random_state is only meaningful together with shuffle=True; passing it while
# shuffle is left at its default (False) is rejected by recent scikit-learn
# releases. Without shuffling the folds are deterministic, so dropping the
# argument leaves the splits unchanged.
cv_strat = StratifiedKFold(n_splits=5)

In [32]:
# Defining the class weights
cl_weight = [None,'balanced',{0:1.0,1:12},{0:1.0,1:13},{0:1.0,1:14},{0:1.0,1:15},{0:1.0,1:16}]

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Instantiating the logistic Regression classifier
lr_s =  LogisticRegression(random_state=42,solver='saga',n_jobs=5)

In [35]:
# Defining the appropriate objective function for the Logistic regression classifier
def objective_wrappper_ls(X_tr, y_tr, cls=None, cv_strat=None, cl_w=None):
    '''
    Builds an Optuna objective that tunes Logistic Regression hyperparameters.

    X_tr, y_tr : training features and labels used for cross-validation.
    cls        : classifier whose parameters are set on every trial
                 (it is modified in place through ``set_params``).
    cv_strat   : cross-validation splitter passed to ``cross_val_score``.
    cl_w       : list of candidate ``class_weight`` values to choose from.

    Returns a function ``objective(trial)`` that samples 'C', 'l1_ratio',
    'class_weight' and 'penalty', applies them to ``cls`` and returns the
    mean cross-validated roc_auc score.
    '''

    def objective(trial):
        params = {
            # suggest_float(..., log=True) / suggest_float(...) replace the
            # deprecated suggest_loguniform / suggest_uniform calls; the
            # parameter names and ranges are unchanged.
            'C': trial.suggest_float('C', 1e-5, 1e2, log=True),
            'l1_ratio': trial.suggest_float('l1_ratio', 0, 1),
            'class_weight': trial.suggest_categorical('class_weight', cl_w),
            'penalty': trial.suggest_categorical('penalty', ['none', 'elasticnet']),
        }

        cls.set_params(**params)  # Initializing the model with the sampled parameters

        return np.mean(cross_val_score(cls, X_tr, y_tr, cv=cv_strat, n_jobs=5, scoring='roc_auc'))
    return objective

In [42]:
# Defining the evaluation function for study's best parameters
def study_best_score_params(X_tr, y_tr, cls, obj_func, cv_strat, cl_w, n_trials=100, seed=None):
    ''' Runs an Optuna study and returns its best score and parameters.

    X_tr, y_tr : training features and labels forwarded to ``obj_func``.
    cls        : classifier forwarded to ``obj_func``.
    obj_func   : factory called as ``obj_func(X_tr, y_tr, cls, cv_strat, cl_w)``
                 that returns the objective to maximize.
    cv_strat   : cross-validation splitter forwarded to ``obj_func``.
    cl_w       : candidate class weights forwarded to ``obj_func``.
    n_trials   : number of trials to run.
    seed       : optional integer seed for the TPE sampler so that the search
                 can be reproduced; ``None`` (default) keeps Optuna's default
                 unseeded sampler.

    Returns a tuple ``(best_score, best_params)``.
    '''
    # Only build an explicit sampler when a seed is requested, so the default
    # call behaves exactly as before.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(obj_func(X_tr, y_tr, cls, cv_strat, cl_w), n_trials=n_trials)
    best_score = study.best_value
    best_params = study.best_params
    return (best_score,best_params)

#### Computing the best hyperparameters for the  logistic Regression using Optuna on reduced feature Training Set.

In [43]:
# Extracting the best model parameters and best study score
best_study_score, best_study_params = study_best_score_params(X_train, y_train, lr_s, objective_wrappper_ls, cv_strat,
                                                            cl_weight)

[32m[I 2020-12-26 19:09:59,173][0m A new study created in memory with name: no-name-324081e1-a513-4d8a-adc2-d12b54d33366[0m
[32m[I 2020-12-26 19:10:25,917][0m Trial 0 finished with value: 0.7525991494103144 and parameters: {'C': 4.283090544812846, 'l1_ratio': 0.8160827573169284, 'class_weight': {0: 1.0, 1: 15}, 'penalty': 'none'}. Best is trial 0 with value: 0.7525991494103144.[0m
[32m[I 2020-12-26 19:10:51,424][0m Trial 1 finished with value: 0.7508320049755364 and parameters: {'C': 0.03394573278920371, 'l1_ratio': 0.15355461801208292, 'class_weight': None, 'penalty': 'none'}. Best is trial 0 with value: 0.7525991494103144.[0m
[32m[I 2020-12-26 19:11:18,547][0m Trial 2 finished with value: 0.7525806358983473 and parameters: {'C': 0.1863596644337647, 'l1_ratio': 0.8909663821160461, 'class_weight': {0: 1.0, 1: 16}, 'penalty': 'none'}. Best is trial 0 with value: 0.7525991494103144.[0m
[32m[I 2020-12-26 19:11:50,532][0m Trial 3 finished with value: 0.7442200483050623 and pa

[32m[I 2020-12-26 19:24:16,735][0m Trial 31 finished with value: 0.7526549371060198 and parameters: {'C': 0.027808922827249304, 'l1_ratio': 0.976418519647739, 'class_weight': 'balanced', 'penalty': 'none'}. Best is trial 12 with value: 0.7526549371060198.[0m
[32m[I 2020-12-26 19:24:39,965][0m Trial 32 finished with value: 0.7526549371060198 and parameters: {'C': 0.0042257946366346675, 'l1_ratio': 0.7926757303083694, 'class_weight': 'balanced', 'penalty': 'none'}. Best is trial 12 with value: 0.7526549371060198.[0m
[32m[I 2020-12-26 19:25:03,281][0m Trial 33 finished with value: 0.7526549371060198 and parameters: {'C': 0.5613232585733001, 'l1_ratio': 0.7649703822873588, 'class_weight': 'balanced', 'penalty': 'none'}. Best is trial 12 with value: 0.7526549371060198.[0m
[32m[I 2020-12-26 19:25:26,378][0m Trial 34 finished with value: 0.7525806358983473 and parameters: {'C': 0.06692076444702408, 'l1_ratio': 0.9766994861760351, 'class_weight': {0: 1.0, 1: 16}, 'penalty': 'none'}.

[32m[I 2020-12-26 19:39:11,290][0m Trial 62 finished with value: 0.7527038980844903 and parameters: {'C': 0.0025169420258605592, 'l1_ratio': 0.2004616739060698, 'class_weight': {0: 1.0, 1: 15}, 'penalty': 'elasticnet'}. Best is trial 59 with value: 0.7527962080533527.[0m
[32m[I 2020-12-26 19:39:43,956][0m Trial 63 finished with value: 0.7526229721275212 and parameters: {'C': 0.007315621217135961, 'l1_ratio': 0.24125471817117253, 'class_weight': {0: 1.0, 1: 15}, 'penalty': 'elasticnet'}. Best is trial 59 with value: 0.7527962080533527.[0m
[32m[I 2020-12-26 19:40:15,376][0m Trial 64 finished with value: 0.7520618594637141 and parameters: {'C': 0.00025106418929606754, 'l1_ratio': 0.05376118501351772, 'class_weight': {0: 1.0, 1: 15}, 'penalty': 'elasticnet'}. Best is trial 59 with value: 0.7527962080533527.[0m
[32m[I 2020-12-26 19:40:46,990][0m Trial 65 finished with value: 0.7525814147215015 and parameters: {'C': 0.021075857246310246, 'l1_ratio': 0.17066466786126, 'class_weight

[32m[I 2020-12-26 19:54:26,017][0m Trial 92 finished with value: 0.7528272960640725 and parameters: {'C': 0.003035732079539218, 'l1_ratio': 0.4124378106960251, 'class_weight': {0: 1.0, 1: 14}, 'penalty': 'elasticnet'}. Best is trial 89 with value: 0.7528306081047205.[0m
[32m[I 2020-12-26 19:54:57,369][0m Trial 93 finished with value: 0.7526711958125198 and parameters: {'C': 0.007922605487155314, 'l1_ratio': 0.41486182351153805, 'class_weight': {0: 1.0, 1: 14}, 'penalty': 'elasticnet'}. Best is trial 89 with value: 0.7528306081047205.[0m
[32m[I 2020-12-26 19:55:28,205][0m Trial 94 finished with value: 0.752609749822237 and parameters: {'C': 0.023393673805993283, 'l1_ratio': 0.4567218082566226, 'class_weight': {0: 1.0, 1: 14}, 'penalty': 'elasticnet'}. Best is trial 89 with value: 0.7528306081047205.[0m
[32m[I 2020-12-26 19:56:01,883][0m Trial 95 finished with value: 0.7525852278637662 and parameters: {'C': 0.06508653480055386, 'l1_ratio': 0.3803415921539912, 'class_weight': {

In [44]:
print('The best roc_auc_score for the study is: ',best_study_score)

The best roc_auc_score for the study is:  0.7528306081047205


In [45]:
# Pass the label and the dict as two print arguments; the doubled parentheses
# previously printed the repr of a tuple instead of the message.
print('The best study parameters for the classifier are: ',best_study_params)

('The best study parameters for the classifier are: ', {'C': 0.0031160262723627184, 'l1_ratio': 0.4046164083668398, 'class_weight': {0: 1.0, 1: 14}, 'penalty': 'elasticnet'})


#### Computing the  Reduced feature roc_auc score for the test data using the best study Parameters

In [46]:
# Obtaining the best reduced feature LR model by setting best study parameters.
lr_R = lr_s.set_params(**best_study_params)

In [47]:
# fitting the best Logistics regression model on the reduced feature training set
lr_R.fit(X_train, y_train)

LogisticRegression(C=0.0031160262723627184, class_weight={0: 1.0, 1: 14},
                   l1_ratio=0.4046164083668398, n_jobs=5, penalty='elasticnet',
                   random_state=42, solver='saga')

In [49]:
# Defining the function to calculate the roc_auc score for the feature sets
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc_auc score of a fitted classifier on the given data.

        X, y : features and true labels to score against
        cls : fitted classifier exposing predict_proba
        f_set : String: specifies 'full feature', 'Reduced feature'
        t_set : String: specifies 'training', 'test'
        model_name : String: specifies Name of the model '''

    # Second column of predict_proba is used as the score for class 1.
    positive_scores = cls.predict_proba(X)[:, 1]
    auc_value = roc_auc_score(y, positive_scores)
    message = 'The roc_auc_score for the {} {} set using the best {} is '.format(f_set, t_set, model_name)
    print(message, auc_value)

In [51]:
# Calculating the reduced feature training set roc_auc score using the best study parameters
cal_roc_auc(X_train, y_train, lr_R,'reduced feature', 'training', 'Logistic Regression')

The roc_auc_score for the reduced feature training set using the best Logistic Regression is  0.7542487254350991


In [52]:
# Calculating the reduced feature test set roc_auc score using the best study parameters
cal_roc_auc(X_test, y_test, lr_R, 'reduced feature','test','Logistic Regression')

The roc_auc_score for the reduced feature test set using the best Logistic Regression is  0.7536336223586382


In [53]:
# Saving the  best Logistic Regression model 
import joblib
joblib.dump(lr_R,'Log_Reg.joblib')

['Log_Reg.joblib']

## Observations:
### 1) The test set roc_auc score for the tuned Logistic Regression model is much higher than the corresponding score from the baseline model, which was expected, as Logistic Regression is a much more complex model than the simple Dummy Classifier.
### 2) The training and test set roc_auc scores are almost equal to each other, indicating that Logistic Regression has fitted the dataset well, with no indication of overfitting.

## Defining  Reward Risk Ratio for a Family of Machine Learning Models:

## R_R Ratio = Mean of  K Fold CV score on training data / Std. Dev of K Fold CV score on training data

### R_R ratio may be helpful in choosing among models having same computational complexity

### Calculating R_R ratio for best Logistic Regression Model 

In [54]:
# Instantiating the Stratified K fold object.
# random_state is only meaningful together with shuffle=True; passing it while
# shuffle is left at its default (False) is rejected by recent scikit-learn
# releases. Without shuffling the folds are deterministic, so dropping the
# argument leaves the splits unchanged.
cv_strat = StratifiedKFold(n_splits=5)

In [55]:
# Computing the Reward, Risk of the Logistic Regression Model
score_Log_Reg = cross_val_score(lr_R, X_train, y_train, cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [56]:
print('The reward associated with the best Logistics Regression Model using roc_auc metric is: ',np.mean(score_Log_Reg))

The reward associated with the best Logistics Regression Model using roc_auc metric is:  0.7528306081047205


In [57]:
print('The risk associated with the best Logistics Regression Model using roc_auc metric is: ',np.std(score_Log_Reg))

The risk associated with the best Logistics Regression Model using roc_auc metric is:  0.005199052773158794


In [58]:
R_R_Ratio_Log_Reg = np.mean(score_Log_Reg)/np.std(score_Log_Reg)

In [59]:
print('The reward risk ratio for the best Logistics Regression Model using roc_auc metric is: ',R_R_Ratio_Log_Reg)

The reward risk ratio for the best Logistics Regression Model using roc_auc metric is:  144.80149383967927


### R_R Ratio for the best Logistic Regression model, utilizing roc_auc metric is: 144.80149383967927