In [2]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, classification_report, roc_auc_score, precision_score, recall_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, KBinsDiscretizer, PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, classification_report, roc_auc_score, precision_score, recall_score
from sklearn.impute import KNNImputer
import pickle
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from pprint import pprint

# Set random seed
"""
We fix random seed while creating training and testing data, so that we get the same datasets.
"""
np.random.seed(128)

In [44]:
# Loading Dataset

In [3]:
DF = pd.read_csv("cleaned_df.csv")
Test_DF = DF
DF.shape

(12684, 23)

In [4]:
# Looking at the correlation matrix for a dataframe
DF.corr()

Unnamed: 0,temperature,has_children,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
temperature,1.0,-0.019716,-0.155332,-0.216254,0.097085,0.06124
has_children,-0.019716,1.0,0.078211,-0.013722,-0.03162,-0.045557
toCoupon_GEQ15min,-0.155332,0.078211,1.0,0.324984,-0.303533,-0.081602
toCoupon_GEQ25min,-0.216254,-0.013722,0.324984,1.0,-0.192319,-0.103633
direction_same,0.097085,-0.03162,-0.303533,-0.192319,1.0,0.01457
Y,0.06124,-0.045557,-0.081602,-0.103633,0.01457,1.0


In [110]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Y                     12684 non-null  int64   
 1   destination           12684 non-null  category
 2   passanger             12684 non-null  category
 3   weather               12684 non-null  category
 4   temperature           12684 non-null  int64   
 5   time                  12684 non-null  category
 6   coupon                12684 non-null  category
 7   expiration            12684 non-null  category
 8   gender                12684 non-null  category
 9   age                   12684 non-null  category
 10  maritalStatus         12684 non-null  category
 11  has_children          12684 non-null  int64   
 12  education             12684 non-null  category
 13  occupation            12684 non-null  category
 14  income                12684 non-null  category
 15  Ba

## Accepting User Inputs

In [126]:
"""
Inputs from user
"""

# Metric to measure the classification model
Metric = 'accuracy' # 'f1' can be used while dealing with imbalanced cases as it can penalize extremities of precision and recall

# Target Variable
Target = 'Y'

In [114]:
# Obtaining imbalance ratio in the target variable

Target_Dist = DF[Target].value_counts().tolist()

Imbalance_Ratio = (min(Target_Dist)/sum(Target_Dist))

print(f"Imbalance Ratio of the Target Variable is: {Imbalance_Ratio:0.2f}")

Imbalance Ratio of the Target Variable is: 0.43


## Train and Test Datasets

In [115]:
# Separating dependent and independent variables
X = DF.drop(labels= Target, axis=1)
y = DF.loc[:, Target]

X.shape, y.shape

((12684, 22), (12684,))

In [116]:
# Generating train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [117]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9513, 22), (3171, 22), (9513,), (3171,))

## Preprocessing data

In [118]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = list(X.select_dtypes(include = numerics).columns)
num_cols

['temperature',
 'has_children',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same']

In [119]:
cat_cols = list(X.select_dtypes(include = ['category', 'object']).columns)
cat_cols

['destination',
 'passanger',
 'weather',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'education',
 'occupation',
 'income',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50']

In [122]:
# Building pipeline to automate treatments specific to numeric columns and categorical columns

# For numeric, missing values are treated using iterative imputer and data is scaled using robust scaler
num_transformer = make_pipeline(IterativeImputer(max_iter=1000, random_state=0),
                                RobustScaler())

# For cat variables, missing values are imputed with mode and one hot encoder is used to make variable machine readable
cat_transformer = make_pipeline(SimpleImputer(strategy= 'most_frequent'),
                                OneHotEncoder(handle_unknown='ignore'))

# All column transformation operations are put into the column transformer
preprocessor = make_column_transformer((num_transformer, num_cols),
                                       (cat_transformer, cat_cols))

## Training the model

In [146]:
# By using make_pipeline, we 1st preprocess the data and then apply ML algorithm
pipelgbm = make_pipeline(preprocessor, LGBMClassifier(), verbose = False)


# Grid for hyperparameter tuning
lgbm_param_grid = {'lgbmclassifier__num_leaves': [10, 20, 31, 60], # Ideally, number of leaves for LGBM should be less than 2*max_Depth to avoid overfitting
#               'lgbmclassifier__min_child_samples': [20, 5, 10],
              #'lgbmclassifier__max_depth': [5, 10],
#               'lgbmclassifier__learning_rate': [0.05,0.1,0.2], # should not be too high which could cause missing out on patterns in data
              'lgbmclassifier__reg_alpha': [0, 0.01]}

# If we do not want to do parameter tuning, we can give single options for paramters --- we can keep using this
# template for ease of use
grid_lgbm = GridSearchCV(pipelgbm,
                         param_grid=param_grid,
                         cv=3,
                         n_jobs=-1, # if -1 uses all cores
                         scoring=['accuracy', 'precision', 'recall', 'f1'], # all the different metrics we want to store
                         refit = Metric) # metric is given by the user

# Fit on train data
grid_lgbm.fit(X_train, y_train)

# Score on Test data
accuracy = grid_lgbm.score(X_test, y_test)
print('Accuracy score of the {} is {:.2f}'.format(grid_lgbm.__class__.__name__, accuracy))

Accuracy score of the GridSearchCV is 0.80


In [177]:
pipexgb = make_pipeline(preprocessor, XGBClassifier(), verbose = False)

xgb_param_grid = {
       # 'xgbclassifier__min_child_weight': [1, 5, 10],
        'xgbclassifier__gamma': [0.5, 1, 1.5, 2, 5],
       # 'xgbclassifier__subsample': [0.6, 0.8, 1.0],
       # 'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],
        'xgbclassifier__max_depth': [3, 4, 5]
        }

grid_xgb= GridSearchCV(pipexgb,
                         param_grid=xgb_param_grid,
                         cv=3,
                         n_jobs=-1, # if -1 uses all cores
                         scoring=['accuracy', 'precision', 'recall', 'f1'],
                         refit = Metric) # metric is given by the user

grid_xgb.fit(X_train, y_train)
accuracy = grid_xgb.score(X_test, y_test)
print('Accuracy score of the {} is {:.2f}'.format(grid_xgb.__class__.__name__, accuracy))

Accuracy score of the GridSearchCV is 0.79


In [187]:
pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(), verbose = False)

rf_param_grid = {'randomforestclassifier__n_estimators': [120, 140],'randomforestclassifier__max_depth': [30, 40],
                 'randomforestclassifier__min_samples_split': [2, 3],'randomforestclassifier__min_samples_leaf': [3, 5]}

grid_rf= GridSearchCV(pipe_rf,
                         param_grid=rf_param_grid,
                         cv=3,
                         n_jobs=-1, # if -1 uses all cores
                         scoring=['accuracy', 'precision', 'recall', 'f1'],
                         refit = Metric) # metric is given by the user

grid_rf.fit(X_train, y_train)
accuracy = grid_rf.score(X_test, y_test)
print('Accuracy score of the {} is {:.2f}'.format(grid_rf.__class__.__name__, accuracy))

Accuracy score of the GridSearchCV is 0.80


In [175]:
def clean_results(res_dic): # input is a dictionary for all trained models and their score
    cv_results =  pd.DataFrame(res_dic)
    # first select oly the needed columns
    res = cv_results[['params', 'mean_test_f1', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy', 'mean_fit_time']]
    
    # use model_name_extractor to extract and clean the names of the models
    def model_name_extractor(col):
        # first get the name of the classifier
        name = list(col.keys())[0]
#         print(name)
        
        model_name = name.split('__')[0] + '_'
#         print(model_name)
        # add hyper parameters values in front of the model name andf return it
        return model_name + '_'.join(["%s" % (  str(v)) for k, v in col.items()])
    
    
    res['params'] = res['params'].apply(model_name_extractor)
    
    # Rename columns and report the final result
    res.rename(columns = {'params' : 'Models',
                          'mean_test_f1': 'f1',
                          'mean_test_precision': 'Precision',
                          'mean_test_recall': 'Recall',
                          'mean_test_accuracy': 'Accuracy',
                          'mean_fit_time': 'fit time'}, inplace=True)
    
    # res is a dataframe thats shows all the metrics
    res = res.set_index('Models', drop= True)
    df_res_sorted = res.sort_values(by = ['f1'], ascending=False) # f1 c\
    return df_res_sorted 

In [188]:
clean_results(grid_rf.cv_results_)

Unnamed: 0_level_0,f1,Precision,Recall,Accuracy,fit time
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
randomforestclassifier_30_3_2_140,0.782806,0.739034,0.832102,0.737517,19.339382
randomforestclassifier_30_3_2_120,0.782588,0.739252,0.831362,0.737412,15.869275
randomforestclassifier_30_3_3_140,0.781735,0.736563,0.832841,0.735625,18.617449
randomforestclassifier_40_3_3_140,0.781626,0.736794,0.832287,0.735625,16.818105
randomforestclassifier_40_3_2_120,0.781206,0.736535,0.831733,0.735204,14.848599
randomforestclassifier_30_3_3_120,0.780469,0.735028,0.831916,0.733943,18.842298
randomforestclassifier_40_3_2_140,0.78037,0.735439,0.831177,0.734048,17.637681
randomforestclassifier_40_3_3_120,0.779494,0.734086,0.830993,0.732787,14.599129
randomforestclassifier_30_5_3_140,0.77932,0.73215,0.833026,0.731841,17.742067
randomforestclassifier_30_5_2_120,0.778446,0.730885,0.832656,0.730579,11.637576


In [191]:
pipe_log = make_pipeline(preprocessor, LogisticRegression(), verbose = False)

log_param_grid = {"logisticregression__C" : np.logspace(-3,3,7), "logisticregression__penalty" : ["l1","l2"]} # l1 lasso l2 ridge

grid_log = GridSearchCV( pipe_log,
                         param_grid=log_param_grid,
                         cv=3,
                         n_jobs=-1, # if -1 uses all cores
                         scoring=['accuracy', 'precision', 'recall', 'f1'],
                         refit = Metric) # metric is given by the user

grid_log.fit(X_train, y_train)
accuracy = grid_log.score(X_test, y_test)
print('Accuracy score of the {} is {:.2f}'.format(grid_log.__class__.__name__, accuracy))

Accuracy score of the GridSearchCV is 0.75


In [192]:
clean_results(grid_log.cv_results_)

Unnamed: 0_level_0,f1,Precision,Recall,Accuracy,fit time
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
logisticregression_0.001_l2,0.744066,0.630201,0.908285,0.644802,0.600423
logisticregression_0.01_l2,0.737292,0.6895,0.792348,0.678966,0.353977
logisticregression_0.1_l2,0.734624,0.701984,0.770527,0.683486,0.762945
logisticregression_1.0_l2,0.733622,0.702607,0.767568,0.683065,0.729792
logisticregression_10.0_l2,0.733193,0.702597,0.766643,0.68275,0.726068
logisticregression_100.0_l2,0.732921,0.702568,0.766089,0.68254,0.743577
logisticregression_1000.0_l2,0.732921,0.702568,0.766089,0.68254,0.51867
logisticregression_0.001_l1,,,,,0.282101
logisticregression_0.01_l1,,,,,0.42076
logisticregression_0.1_l1,,,,,0.299299


In [193]:
def clean_results_top_2(res_dic):
    """
    This function is used to obtain the top 2 performances of each model
    """
    cv_results =  pd.DataFrame(res_dic)
    res = cv_results[['params', 'mean_test_f1', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy', 'mean_fit_time']]

    def model_name_extractor(col):
        name = list(col.keys())[0]
        model_name = name.split('__')[0] + '_'
        return model_name + '_'.join(["%s" % (  str(v)) for k, v in col.items()])
    
    res['params'] = res['params'].apply(model_name_extractor)
    
    res.rename(columns = {'params' : 'Models',
                          'mean_test_f1': 'f1',
                          'mean_test_precision': 'Precision',
                          'mean_test_recall': 'Recall',
                          'mean_test_accuracy': 'Accuracy',
                          'mean_fit_time': 'fit time'}, inplace=True)

    res = res.set_index('Models', drop= True)
    df_res_sorted = res.sort_values(by = ['f1'], ascending=False)
    return df_res_sorted.head(2)

In [194]:
Model_List = [grid_lgbm, grid_log, grid_xgb, grid_rf]

Final_Results = []
for i in Model_List:
    Temp = clean_results_top_2(i.cv_results_)
    Final_Results.append(Temp)
    
Final_Results = pd.concat(Final_Results)

In [197]:
# Below we can see the summary of all models and the results
Final_Results.sort_values(by = 'f1', ascending = False)

Unnamed: 0_level_0,f1,Precision,Recall,Accuracy,fit time
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lgbmclassifier_60_0,0.788342,0.761731,0.816939,0.750657,0.636929
lgbmclassifier_60_0.01,0.787829,0.762721,0.81472,0.750552,0.573177
xgbclassifier_0.5_5,0.785438,0.759746,0.813056,0.747503,7.315755
xgbclassifier_2_5,0.783709,0.757255,0.812316,0.745191,5.46958
randomforestclassifier_30_3_2_140,0.782806,0.739034,0.832102,0.737517,19.339382
randomforestclassifier_30_3_2_120,0.782588,0.739252,0.831362,0.737412,15.869275
logisticregression_0.001_l2,0.744066,0.630201,0.908285,0.644802,0.600423
logisticregression_0.01_l2,0.737292,0.6895,0.792348,0.678966,0.353977


In [None]:
# Storing results by pickling the model so that we can reuse the model

import pickle
file_name = 'LGBM_model_pkl'

# create an iterator object with write permission - model.pkl
with open(file_name, 'wb') as files:
    pickle.dump(grid_lgbm, files)
    
# load the model from disk
loaded_model = pickle.load(open(file_name, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)