# Data Description

In [1]:
RANDOM_STATE=3

# Imports

In [2]:
# Supress Notebook Warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [3]:
df_raw = pd.read_csv('data/online_shoppers_intention.csv', delimiter=',')
df_raw.tail(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


### Check for null values

In [4]:
# Show counter for null values
df_raw.isnull().sum()

Administrative             14
Administrative_Duration    14
Informational              14
Informational_Duration     14
ProductRelated             14
ProductRelated_Duration    14
BounceRates                14
ExitRates                  14
PageValues                  0
SpecialDay                  0
Month                       0
OperatingSystems            0
Browser                     0
Region                      0
TrafficType                 0
VisitorType                 0
Weekend                     0
Revenue                     0
dtype: int64

In [5]:
df_raw.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.317798,80.906176,0.503979,34.506387,31.763884,1196.037057,0.022152,0.043003,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.322754,176.860432,1.270701,140.825479,44.490339,1914.372511,0.048427,0.048527,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,185.0,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,599.76619,0.003119,0.025124,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.5,0.0,0.0,38.0,1466.479902,0.016684,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [6]:
# Rows with negative duration
print('Problematic Rows Count:',df_raw[df_raw['Informational_Duration'] < 0].count()[1])

Problematic Rows Count: 33


## Cleaning Data

Try Oversampling / Undersampling

https://www.kaggle.com/saurav9786/ensemble-techniques

In [7]:
# Take rows without null & negative value
df = df_raw[((df_raw['Administrative_Duration'] >= 0) & (df_raw['Informational_Duration'] >= 0) & (df_raw['ProductRelated_Duration'] >= 0))].copy()

In [8]:
# Binary Encoding
from sklearn.preprocessing import label_binarize

df['Revenue'] = label_binarize(df['Revenue'], classes=[False,True])
df['Weekend'] = label_binarize(df['Weekend'], classes=[False,True])

In [9]:
df_raw.tail()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


In [10]:
df.VisitorType.unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [11]:
df['Administrative_Visited'] = np.where(df['Administrative'] > 1, 1,0)
df['Informational_Visited'] = np.where(df['Administrative'] > 1, 1,0)
df['ProductRelated_Visited'] = np.where(df['Administrative'] > 1, 1,0)

### Intervals

In [12]:
# Administrative
column_name = 'Administrative'
interval_step = 3
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0    8189
 1    2255
 2    1057
 3     483
 4     186
 5      78
 6      20
 7       9
-1       6
Name: Administrative_Bins, dtype: int64

In [13]:
# ProductRelated
column_name = 'ProductRelated'
interval_step = 30
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0     8235
 1     2329
 2      826
 3      380
 4      201
 5      120
 10      70
 6       53
 7       46
 8       22
-1        1
Name: ProductRelated_Bins, dtype: int64

### Create dummy columns
Drop is manual, because we often want to keep the first column and drop the last one, which is often group of nonlabeled elements ('Others').

In [14]:
df = pd.get_dummies(df, columns=['VisitorType'], drop_first=False)
df = df.drop(['VisitorType_Other'],axis=1)

In [15]:
df = pd.get_dummies(df, columns=['TrafficType'], drop_first=False)
df = df.drop(['TrafficType_20'],axis=1)

In [16]:
df = pd.get_dummies(df, columns=['OperatingSystems'], drop_first=False)
df = df.drop(['OperatingSystems_8'],axis=1)

In [17]:
df = pd.get_dummies(df, columns=['Browser'], drop_first=False)
df = df.drop(['Browser_13'],axis=1)

In [18]:
df['IsSpecialDate'] = np.where(df['SpecialDay'] > 1, 1,0)

In [19]:
df = pd.get_dummies(df, columns=['Region'], drop_first=False)
df = df.drop(['Region_9'],axis=1)

In [20]:
df['Month']= df['Month'].astype('category').cat.codes
df['Month'].value_counts()

6    3357
7    2995
5    1884
1    1727
8     549
9     448
0     433
3     431
4     288
2     171
Name: Month, dtype: int64

## Test Train Split

### Target Variable

In [21]:
TARGET_VARIABLE = 'Revenue'
df[TARGET_VARIABLE].value_counts()

0    10375
1     1908
Name: Revenue, dtype: int64

In [22]:
X = df.drop(labels=[TARGET_VARIABLE], axis=1)
y = df[TARGET_VARIABLE]

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

In [30]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
strat_kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=RANDOM_STATE)

def print_acc_score(y_test, predictions):
    print("Accuracy: {:.2%}".format(accuracy_score(y_test, predictions))) 

def print_cv_score(cv_score):
    print("Accuracy: %0.2f (+/- %0.2f with 95 % confidence)" % (cv_score.mean(), cv_score.std() * 2))
    
    #print("Precision: ",round(precision_score(y_test,y_pred),2),"Recall: ",round(recall_score(y_test,y_pred),2))

In [25]:
#print_acc_score(y_test, predictions)

## Evaluation Matrix

In [71]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score

In [100]:
evaluation_matrix_columns = ['Model',
                             'Evaluation',
                             'Accuracy',
                             'False_Precision',
                             'True_Precision',
                             'True_Recall',
                             'True_F1_Score']

evaluation_matrix = pd.DataFrame(columns = evaluation_matrix_columns)

In [101]:
evaluation_matrix

Unnamed: 0,Model,Evaluation,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score


In [112]:
float_precision = 4
total_accuracy  = round(100 * accuracy_score(y_test, y_pred), float_precision)
false_precision = round(100 * precision_score(y_test,y_pred, pos_label=0, average='binary'), float_precision)
true_precision  = round(100 * precision_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
true_recall     = round(100 * recall_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
true_f1_beta    = round(100 * fbeta_score(y_test, y_pred, beta=1.5), float_precision)

In [113]:
float_precision = 4
model_evaluation_dict = {'Model':dummy_clf_01.__class__.__name__,
                             'Evaluation':'Stratified_Cross_Validation',
                             'Accuracy':total_accuracy,
                             'False_Precision':false_precision,
                             'True_Precision':true_precision,
                             'True_Recall':true_recall,
                             'True_F1_Score':true_f1_beta}

In [114]:
evaluation_matrix.append(model_evaluation_dict, ignore_index=True)

Unnamed: 0,Model,Evaluation,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score
0,DummyClassifier,Stratified_Cross_Validation,84.26,84.26,0.0,0.0,0.0


In [None]:
#def evaluate_model(model_name='default_name',evaluation,model, y_true):    

# Classifiers

## Dummy Classifier

In [26]:
from sklearn.dummy import DummyClassifier
dummy_clf_01 = DummyClassifier(strategy="most_frequent")
dummy_clf_01.fit(X, y)
y_pred = dummy_clf_01.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

# print(confusion_matrix(y_test,predictions))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      3105
           1       0.00      0.00      0.00       580

    accuracy                           0.84      3685
   macro avg       0.42      0.50      0.46      3685
weighted avg       0.71      0.84      0.77      3685



In [28]:
dummy_clf_02 = DummyClassifier(strategy="stratified")
dummy_clf_02.fit(X, y)
predictions = dummy_clf_02.predict(X_test)
predictions
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      3105
           1       0.17      0.16      0.16       580

    accuracy                           0.75      3685
   macro avg       0.51      0.51      0.51      3685
weighted avg       0.74      0.75      0.74      3685



In [34]:
accuracy_score(y_test, predictions)

0.9063772048846676

## Decision Tree Classifier

Hence the Y variable has a high class imbalance. Hence accuracy will not be a reliable model performance measure.

FN is very critical for this business case because a false negative is a customer who will potentially subscribe for a loan but who has been classified as 'will not subscribe'. Hence the most relevant model performance measure is recall

In [24]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [31]:
predictions = dtree.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
from sklearn.metrics import classification_report, confusion_matrix

# print(confusion_matrix(y_test,predictions))
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      3105
           1       0.56      0.55      0.56       580

    accuracy                           0.86      3685
   macro avg       0.74      0.73      0.74      3685
weighted avg       0.86      0.86      0.86      3685



In [31]:
from sklearn.ensemble import RandomForestClassifier

ran_forest = RandomForestClassifier(n_estimators=500, min_samples_split=3, random_state=102)
ran_forest.fit(X_train, y_train)

predictions = ran_forest.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      3105
           1       0.81      0.53      0.64       580

    accuracy                           0.91      3685
   macro avg       0.86      0.75      0.79      3685
weighted avg       0.90      0.91      0.90      3685



In [33]:
from sklearn.metrics import precision_score, accuracy_score
print('precision on the evaluation set: ', precision_score(y_test, predictions))
print('accuracy on the evaluation set: ', accuracy_score(y_test, predictions))

precision on the evaluation set:  0.8100263852242744
accuracy on the evaluation set:  0.9063772048846676


In [36]:
print(confusion_matrix(y_test, predictions))

[[3033   72]
 [ 273  307]]


In [37]:
from sklearn.naive_bayes import GaussianNB
gaussiannb= GaussianNB()
gaussiannb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [38]:
gaussiannbpred = gaussiannb.predict(X_test)
probs = gaussiannb.predict(X_test)

print(confusion_matrix(y_test, predictions ))
print(round(accuracy_score(y_test, predictions),2)*100)
#GAUSIAN = (cross_val_score(gaussiannb, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())

[[3033   72]
 [ 273  307]]
91.0


In [39]:
from sklearn.metrics import precision_score, accuracy_score
print('precision on the evaluation set: ', precision_score(y_test, predictions))
print('accuracy on the evaluation set: ', accuracy_score(y_test, predictions))

precision on the evaluation set:  0.8100263852242744
accuracy on the evaluation set:  0.9063772048846676


In [40]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      3105
           1       0.81      0.53      0.64       580

    accuracy                           0.91      3685
   macro avg       0.86      0.75      0.79      3685
weighted avg       0.90      0.91      0.90      3685



In [41]:
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(X_train, y_train)
gbkpred = gbk.predict(X_test)
print(confusion_matrix(y_test, predictions ))
print(round(accuracy_score(y_test, predictions),2)*100)
#GBKCV = (cross_val_score(gbk, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())

[[3033   72]
 [ 273  307]]
91.0


In [42]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      3105
           1       0.81      0.53      0.64       580

    accuracy                           0.91      3685
   macro avg       0.86      0.75      0.79      3685
weighted avg       0.90      0.91      0.90      3685



## BaggingClassifier

In [43]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)

In [44]:
bagging.fit(X_train, y_train)

predictions = bagging.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3105
           1       0.72      0.11      0.20       580

    accuracy                           0.85      3685
   macro avg       0.79      0.55      0.56      3685
weighted avg       0.83      0.85      0.81      3685



## Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear')

In [54]:
logmodel=LogisticRegression(solver='liblinear', C=19.1, penalty='l1')

In [55]:
logmodel.fit(X, y)

LogisticRegression(C=19.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
predictions = logmodel.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [57]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[3044   61]
 [ 354  226]]
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      3105
           1       0.79      0.39      0.52       580

    accuracy                           0.89      3685
   macro avg       0.84      0.69      0.73      3685
weighted avg       0.88      0.89      0.87      3685



### Predict_proba
Returns
Tarray-like of shape (n_samples, n_classes)
Returns the probability of the sample for each class in the model, where classes are ordered as they are in self.classes_

In [50]:
# TODO: Set thresholds
predictions = logmodel.predict_proba(X_test)
predictions

array([[0.92214268, 0.07785732],
       [0.98403275, 0.01596725],
       [0.66212347, 0.33787653],
       ...,
       [0.95918516, 0.04081484],
       [0.91248202, 0.08751798],
       [0.83544891, 0.16455109]])

In [51]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(logmodel, X, y, cv=10)
score.mean()

0.882434807038375

In [58]:
# Type of penalty - Lasso(l1) or Ridge(l2)
penalties = ['l1','l2']
C_values = np.linspace(0.2, 20, 41)

In [59]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Cross-Validation
cross_valid = StratifiedKFold(n_splits=10)

# Hyperparameter Tuning
params = {'penalty': penalties, 'C': C_values}

logmodel2 = LogisticRegression(solver='liblinear')
grid = GridSearchCV(estimator=logmodel2, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cross_valid)
grid.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='de...
             param_grid={'C': array([ 0.2  ,  0.695,  1.19 ,  1.685,  2.18 ,  2.675,  3.17 ,  3.665,
        4.16 ,  4.655,  5.15 ,  5.645,  6.14 ,  6.635,  7.13 ,  7.625,
        8.12 ,  8.615,  9.11 ,  9.605, 10.1  , 10.595, 11.09 , 11.585,
       12.08 , 12.575, 13.07 , 13.565, 14.06 , 14.5

In [60]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'C': 1.19, 'penalty': 'l1'}
0.8830043757934606
LogisticRegression(C=1.19, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


## NN with Scaler

## Zscore Scaler - {-1;1}

In [61]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scal = sc.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

In [62]:
import keras as K

Using TensorFlow backend.


In [63]:

X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

In [64]:
from keras import Sequential
from keras.layers import Dense

In [65]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [66]:
#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [67]:
#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f4de6572ad0>

In [69]:
eval_model=classifier.evaluate(X_train, y_train)
eval_model



[0.16848199560659435, 0.9294021725654602]

In [74]:
predictions=classifier.predict(X_test)
predictions =(predictions>0.5)
predictions

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [ True]])

In [75]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      3105
           1       0.66      0.56      0.61       580

    accuracy                           0.89      3685
   macro avg       0.79      0.75      0.77      3685
weighted avg       0.88      0.89      0.88      3685



In [76]:
predictions=classifier.predict(X_test)
predictions =(predictions>0.25)
predictions

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [ True]])

In [78]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91      3105
           1       0.53      0.71      0.61       580

    accuracy                           0.86      3685
   macro avg       0.74      0.80      0.76      3685
weighted avg       0.88      0.86      0.86      3685



## NN with MinMax

In [79]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scal = scaler.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

In [80]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))


#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f4de03283d0>

In [81]:
predictions=classifier.predict(X_test)
predictions =(predictions>0.5)
predictions

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3105
           1       0.67      0.62      0.65       580

    accuracy                           0.89      3685
   macro avg       0.80      0.78      0.79      3685
weighted avg       0.89      0.89      0.89      3685

