# Data Description

In [3]:
RANDOM_STATE=3

# Imports

In [4]:
# Supress Notebook Warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [5]:
df_raw = pd.read_csv('data/online_shoppers_intention.csv', delimiter=',')
df_raw.tail(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


### Check for null values

In [6]:
# Show counter for null values
df_raw.isnull().sum()

Administrative             14
Administrative_Duration    14
Informational              14
Informational_Duration     14
ProductRelated             14
ProductRelated_Duration    14
BounceRates                14
ExitRates                  14
PageValues                  0
SpecialDay                  0
Month                       0
OperatingSystems            0
Browser                     0
Region                      0
TrafficType                 0
VisitorType                 0
Weekend                     0
Revenue                     0
dtype: int64

In [7]:
df_raw.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.317798,80.906176,0.503979,34.506387,31.763884,1196.037057,0.022152,0.043003,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.322754,176.860432,1.270701,140.825479,44.490339,1914.372511,0.048427,0.048527,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,185.0,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,599.76619,0.003119,0.025124,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.5,0.0,0.0,38.0,1466.479902,0.016684,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [8]:
# Rows with negative duration
print('Problematic Rows Count:',df_raw[df_raw['Informational_Duration'] < 0].count()[1])

Problematic Rows Count: 33


## Cleaning Data

Try Oversampling / Undersampling

https://www.kaggle.com/saurav9786/ensemble-techniques

In [9]:
# remove rows with null & negative value
df = df_raw[((df_raw['Administrative_Duration'] >= 0) & (df_raw['Informational_Duration'] >= 0) & (df_raw['ProductRelated_Duration'] >= 0))].copy()

## Data Transformation

In [10]:
df['Month']= df['Month'].astype('category').cat.codes
df['Month'].value_counts()

6    3357
7    2995
5    1884
1    1727
8     549
9     448
0     433
3     431
4     288
2     171
Name: Month, dtype: int64

### Binary Encoding

In [11]:
# Binary Encoding
from sklearn.preprocessing import label_binarize

# From Label to Boolean
df['Revenue'] = label_binarize(df['Revenue'], classes=[False,True])
df['Weekend'] = label_binarize(df['Weekend'], classes=[False,True])

In [12]:
df_raw.tail()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


## Create Boolean Cols from Cardinal

In [13]:
# Boolean - Visited the type of page
df['Administrative_Visited'] = np.where(df['Administrative'] > 1, 1,0)
df['Informational_Visited'] = np.where(df['Informational'] > 1, 1,0)
df['ProductRelated_Visited'] = np.where(df['ProductRelated'] > 1, 1,0)

In [14]:
df['IsSpecialDate'] = np.where(df['SpecialDay'] > 1, 1,0)

### Intervals

In [15]:
# Administrative
column_name = 'Administrative'
interval_step = 3
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0    8189
 1    2255
 2    1057
 3     483
 4     186
 5      78
 6      20
 7       9
-1       6
Name: Administrative_Bins, dtype: int64

In [16]:
# ProductRelated
column_name = 'ProductRelated'
interval_step = 30
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0     8235
 1     2329
 2      826
 3      380
 4      201
 5      120
 10      70
 6       53
 7       46
 8       22
-1        1
Name: ProductRelated_Bins, dtype: int64

### Create dummy columns
Drop is manual, because we often want to keep the first column and drop the last one, which is often group of nonlabeled elements ('Others').

In [17]:
df.VisitorType.unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [18]:
df = pd.get_dummies(df, columns=['VisitorType'], drop_first=False)
df = df.drop(['VisitorType_Other'],axis=1)

In [19]:
df = pd.get_dummies(df, columns=['TrafficType'], drop_first=False)
df = df.drop(['TrafficType_20'],axis=1)

In [20]:
df = pd.get_dummies(df, columns=['OperatingSystems'], drop_first=False)
df = df.drop(['OperatingSystems_8'],axis=1)

In [21]:
df = pd.get_dummies(df, columns=['Browser'], drop_first=False)
df = df.drop(['Browser_13'],axis=1)

In [22]:
df = pd.get_dummies(df, columns=['Region'], drop_first=False)
df = df.drop(['Region_9'],axis=1)

In [23]:
df.dtypes.unique()

array([dtype('float64'), dtype('int8'), dtype('int64'), dtype('uint8')],
      dtype=object)

In [24]:
df.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'Weekend', 'Revenue', 'Administrative_Visited', 'Informational_Visited',
       'ProductRelated_Visited', 'IsSpecialDate', 'Administrative_Bins',
       'ProductRelated_Bins', 'VisitorType_New_Visitor',
       'VisitorType_Returning_Visitor', 'TrafficType_1', 'TrafficType_2',
       'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6',
       'TrafficType_7', 'TrafficType_8', 'TrafficType_9', 'TrafficType_10',
       'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
       'TrafficType_15', 'TrafficType_16', 'TrafficType_17', 'TrafficType_18',
       'TrafficType_19', 'OperatingSystems_1', 'OperatingSystems_2',
       'OperatingSystems_3', 'OperatingSystems_4', 'OperatingSystems_5',
       'OperatingSystems_6', 'OperatingSystems

# Test Train Split

In [25]:
TARGET_VARIABLE = 'Revenue'
TEST_SIZE = 0.3
df[TARGET_VARIABLE].value_counts()

0    10375
1     1908
Name: Revenue, dtype: int64

In [26]:
X = df.drop(labels=[TARGET_VARIABLE], axis=1)
y = df[TARGET_VARIABLE]

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [28]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
strat_kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=RANDOM_STATE)

def print_acc_score(y_test, predictions):
    print("Accuracy: {:.2%}".format(accuracy_score(y_test, predictions))) 

def print_cv_score(cv_score):
    print("Accuracy: %0.2f (+/- %0.2f with 95 % confidence)" % (cv_score.mean(), cv_score.std() * 2))
    
    #print("Precision: ",round(precision_score(y_test,y_pred),2),"Recall: ",round(recall_score(y_test,y_pred),2))

In [29]:
#print_acc_score(y_test, predictions)

## Choosing Metrics - Evaluation Matrix
### Metrics
- Recall
- Precision
- F1 Score
To gather the evaluation results from different models into one df.

In [30]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score

In [31]:
evaluation_matrix_columns = ['Model',
                             'EvaluationInfo',
                             'Accuracy',
                             'False_Precision',
                             'True_Precision',
                             'True_Recall',
                             'True_F1_Score']

evaluation_matrix = pd.DataFrame(columns = evaluation_matrix_columns)
evaluation_matrix

Unnamed: 0,Model,EvaluationInfo,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score


In [32]:
# Evaluate Model and Add results to the matrix
def evaluate_model(model, y_test, y_pred, evaluation_info=''):
    global evaluation_matrix
    float_precision = 3
    total_accuracy  = round(100 * accuracy_score(y_test, y_pred), float_precision)
    false_precision = round(100 * precision_score(y_test,y_pred, pos_label=0, average='binary'), float_precision)
    true_precision  = round(100 * precision_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
    true_recall     = round(100 * recall_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
    true_f1_beta    = round(100 * fbeta_score(y_test, y_pred, beta=1.5), float_precision)
    
    model_evaluation_dict = {'Model':model.__class__.__name__,
                             'EvaluationInfo':evaluation_info,
                             'Accuracy':total_accuracy,
                             'False_Precision':false_precision,
                             'True_Precision':true_precision,
                             'True_Recall':true_recall,
                             'True_F1_Score':true_f1_beta}
    evaluation_matrix = evaluation_matrix.append(model_evaluation_dict, ignore_index=True)

In [33]:
evaluation_matrix

Unnamed: 0,Model,EvaluationInfo,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score


## TODO - Cross Validation

In [34]:
## TODO Implement Cross validation
# cross_val_score(gaussiannb, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')

## TODO - Pick right Metrics

# Baseline Model

## Dummy Classifier
- Most basic models that are used as a baseline models for the clasification.

### Most Frequent
- Is taking most frequent class for prediction

In [35]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix

dummy_clf_mf = DummyClassifier(strategy="most_frequent")
dummy_clf_mf.fit(X, y)
y_pred = dummy_clf_mf.predict(X_test)

In [36]:
evaluate_model(dummy_clf_mf,y_test,y_pred,'most_frequent')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      3105
           1       0.00      0.00      0.00       580

    accuracy                           0.84      3685
   macro avg       0.42      0.50      0.46      3685
weighted avg       0.71      0.84      0.77      3685



### Stratified
- Keep the ratio of classes 

In [37]:
dummy_clf_st = DummyClassifier(strategy="stratified")
dummy_clf_st.fit(X, y)
y_pred = dummy_clf_st.predict(X_test)

evaluate_model(dummy_clf_st,y_test,y_pred,'stratified')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      3105
           1       0.15      0.15      0.15       580

    accuracy                           0.73      3685
   macro avg       0.49      0.49      0.49      3685
weighted avg       0.73      0.73      0.73      3685



In [38]:
accuracy_score(y_test, y_pred)

0.7305291723202171

# Clasification Models

## Decision Tree Classifier
- (https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

Hence the Y variable has a high class imbalance. Hence accuracy will not be a reliable model performance measure.

FN is very critical for this business case because a false negative is a customer who will potentially subscribe for a loan but who has been classified as 'will not subscribe'. Hence the most relevant model performance measure is recall

In [39]:
# Train
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=RANDOM_STATE)
dtree.fit(X_train, y_train)

# Test
y_pred = dtree.predict(X_test)
evaluate_model(dtree,y_test,y_pred,'gini')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      3105
           1       0.58      0.54      0.56       580

    accuracy                           0.86      3685
   macro avg       0.75      0.73      0.74      3685
weighted avg       0.86      0.86      0.86      3685



In [40]:
# Train
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=RANDOM_STATE)
dtree.fit(X_train, y_train)

# Test
y_pred = dtree.predict(X_test)
evaluate_model(dtree,y_test,y_pred,'entropy')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      3105
           1       0.57      0.54      0.56       580

    accuracy                           0.86      3685
   macro avg       0.74      0.73      0.74      3685
weighted avg       0.86      0.86      0.86      3685



In [41]:
evaluation_matrix.tail(2)

Unnamed: 0,Model,EvaluationInfo,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score
2,DecisionTreeClassifier,gini,86.486,91.473,57.565,53.793,54.9
3,DecisionTreeClassifier,entropy,86.459,91.523,57.404,54.138,55.103


## Random Forest

In [42]:
# Train
from sklearn.ensemble import RandomForestClassifier
ran_forest = RandomForestClassifier(n_estimators=500, min_samples_split=3, random_state=RANDOM_STATE,
                                   bootstrap=True,)
ran_forest.fit(X_train, y_train)

# Test
y_pred = ran_forest.predict(X_test)
evaluate_model(ran_forest,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      3105
           1       0.81      0.51      0.63       580

    accuracy                           0.91      3685
   macro avg       0.86      0.75      0.79      3685
weighted avg       0.90      0.91      0.90      3685



## GaussianNB

In [43]:
# Train
from sklearn.naive_bayes import GaussianNB
gaussiannb= GaussianNB()
gaussiannb.fit(X_train, y_train)

# Test
y_pred = gaussiannb.predict(X_test)
evaluate_model(gaussiannb,y_test,y_pred,'')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.76      0.84      3105
           1       0.35      0.67      0.46       580

    accuracy                           0.75      3685
   macro avg       0.64      0.72      0.65      3685
weighted avg       0.83      0.75      0.78      3685



## Gradient Boosting Classifier

In [44]:
# Train
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier(learning_rate=0.01, n_estimators=500,random_state=RANDOM_STATE)
gbk.fit(X_train, y_train)

# Test
y_pred = gbk.predict(X_test)
evaluate_model(gbk,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.95      3105
           1       0.78      0.57      0.66       580

    accuracy                           0.91      3685
   macro avg       0.85      0.77      0.80      3685
weighted avg       0.90      0.91      0.90      3685



## KNeighbors Classifier

In [47]:
# Train
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),n_estimators=500,
                            max_samples=0.9, max_features=0.9,bootstrap=True,
                            bootstrap_features=False)
bagging.fit(X_train, y_train)

# Test
y_pred = bagging.predict(X_test)
evaluate_model(bagging,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      3105
           1       0.72      0.25      0.37       580

    accuracy                           0.87      3685
   macro avg       0.80      0.62      0.65      3685
weighted avg       0.85      0.87      0.84      3685



## Bagging Aggregation

In [None]:
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(GradientBoostingClassifier(),n_estimators=20,
                            max_samples=0.9, max_features=0.9,bootstrap=True,
                            bootstrap_features=False)
bagging.fit(X_train, y_train)

In [None]:
y_pred = bagging.predict(X_test)
evaluate_model(bagging,y_test,y_pred,'n_estimators=50')
print(classification_report(y_test, y_pred))

## Logistic Regression

In [50]:
# Train
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear', C=19.1, penalty='l1',random_state=RANDOM_STATE)
logmodel.fit(X, y)

# Test
y_pred = logmodel.predict(X_test)
evaluate_model(logmodel,y_test,y_pred,'C=19.1, penalty=l1')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[3044   61]
 [ 352  228]]
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      3105
           1       0.79      0.39      0.52       580

    accuracy                           0.89      3685
   macro avg       0.84      0.69      0.73      3685
weighted avg       0.88      0.89      0.87      3685



### Predict_proba
Returns
Tarray-like of shape (n_samples, n_classes)
Returns the probability of the sample for each class in the model, where classes are ordered as they are in self.classes_

In [90]:
# Get probabilities for Revenue = 1
y_prob_pred = logmodel.predict_proba(X_test)[:,1]
y_prob_pred

array([0.07423112, 0.0139902 , 0.33429053, ..., 0.03999546, 0.09052969,
       0.16198218])

### LG: Probability Threshold Optimization

In [91]:
thresholds = np.arange(0.2, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

LG with threshold: 0.2
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3105
           1       0.62      0.72      0.67       580

    accuracy                           0.89      3685
   macro avg       0.79      0.82      0.80      3685
weighted avg       0.90      0.89      0.89      3685

LG with threshold: 0.25
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3105
           1       0.67      0.61      0.64       580

    accuracy                           0.89      3685
   macro avg       0.80      0.78      0.79      3685
weighted avg       0.89      0.89      0.89      3685

LG with threshold: 0.3
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3105
           1       0.70      0.54      0.61       580

    accuracy                           0.89      3685
   macro avg       0.81      0.75      0.77      3685
weig

In [98]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(logmodel, X, y, cv=10, scoring='roc_auc')
score.mean()

0.8710933375004106

In [95]:
np.linspace(0.2, 20, 10)

array([ 0.2,  2.4,  4.6,  6.8,  9. , 11.2, 13.4, 15.6, 17.8, 20. ])

### LG: Hyper Parameter Tuning

In [100]:
# Type of penalty - Lasso(l1) or Ridge(l2)
penalties = ['l1','l2']
C_values = np.linspace(0.2, 20, 10)

In [101]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Cross-Validation
cross_valid = StratifiedKFold(n_splits=10)

# Hyperparameter Tuning
params = {'penalty': penalties, 'C': C_values}

logmodel2 = LogisticRegression(solver='liblinear')
grid = GridSearchCV(estimator=logmodel2, param_grid=params, scoring='recall', n_jobs=-1, cv=cross_valid)
grid.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([ 0.2,  2.4,  4.6,  6.8,  9. , 11.2, 13.4, 15.6, 17.8, 20. ]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [60]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'C': 1.19, 'penalty': 'l1'}
0.8830043757934606
LogisticRegression(C=1.19, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


## Neural Network

### Z - score Scaler - {-1;1}

In [103]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scal = sc.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

### Compile NN model

In [110]:
import keras as K
from keras import Sequential
from keras.layers import Dense

In [111]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [112]:
#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [108]:
#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7fa4c85d3b10>

In [114]:
eval_model = classifier.evaluate(X_train, y_train)
eval_model



[0.6930214621128384, 0.5132588744163513]

In [119]:
y_prod_pred = classifier.predict(X_test)

In [120]:
thresholds = np.arange(0.15, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

LG with threshold: 0.15
              precision    recall  f1-score   support

           0       0.96      0.85      0.90      3105
           1       0.49      0.80      0.61       580

    accuracy                           0.84      3685
   macro avg       0.73      0.82      0.76      3685
weighted avg       0.89      0.84      0.85      3685

LG with threshold: 0.2
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3105
           1       0.62      0.72      0.67       580

    accuracy                           0.89      3685
   macro avg       0.79      0.82      0.80      3685
weighted avg       0.90      0.89      0.89      3685

LG with threshold: 0.25
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3105
           1       0.67      0.61      0.64       580

    accuracy                           0.89      3685
   macro avg       0.80      0.78      0.79      3685
wei

## NN V2

### MinMax

In [122]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scal = scaler.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

In [123]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))


#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7fa4c8060e10>

In [126]:
y_prod_pred = classifier.predict(X_test)

In [127]:
thresholds = np.arange(0.15, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

LG with threshold: 0.15
              precision    recall  f1-score   support

           0       0.96      0.85      0.90      3105
           1       0.49      0.80      0.61       580

    accuracy                           0.84      3685
   macro avg       0.73      0.82      0.76      3685
weighted avg       0.89      0.84      0.85      3685

LG with threshold: 0.2
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3105
           1       0.62      0.72      0.67       580

    accuracy                           0.89      3685
   macro avg       0.79      0.82      0.80      3685
weighted avg       0.90      0.89      0.89      3685

LG with threshold: 0.25
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3105
           1       0.67      0.61      0.64       580

    accuracy                           0.89      3685
   macro avg       0.80      0.78      0.79      3685
wei