# Data Description

In [1]:
RANDOM_STATE=3

# Imports

In [2]:
# Supress Notebook Warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [3]:
df_raw = pd.read_csv('data/online_shoppers_intention.csv', delimiter=',')
df_raw.tail(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


### Check for null values

In [4]:
# Show counter for null values
df_raw.isnull().sum()

Administrative             14
Administrative_Duration    14
Informational              14
Informational_Duration     14
ProductRelated             14
ProductRelated_Duration    14
BounceRates                14
ExitRates                  14
PageValues                  0
SpecialDay                  0
Month                       0
OperatingSystems            0
Browser                     0
Region                      0
TrafficType                 0
VisitorType                 0
Weekend                     0
Revenue                     0
dtype: int64

In [5]:
df_raw.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.317798,80.906176,0.503979,34.506387,31.763884,1196.037057,0.022152,0.043003,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.322754,176.860432,1.270701,140.825479,44.490339,1914.372511,0.048427,0.048527,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,185.0,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,599.76619,0.003119,0.025124,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.5,0.0,0.0,38.0,1466.479902,0.016684,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [6]:
# Rows with negative duration
print('Problematic Rows Count:',df_raw[df_raw['Informational_Duration'] < 0].count()[1])

Problematic Rows Count: 33


## Cleaning Data

Try Oversampling / Undersampling

https://www.kaggle.com/saurav9786/ensemble-techniques

In [7]:
# remove rows with null & negative value
df = df_raw[((df_raw['Administrative_Duration'] >= 0) & (df_raw['Informational_Duration'] >= 0) & (df_raw['ProductRelated_Duration'] >= 0))].copy()

## Data Transformation

In [8]:
df['Month']= df['Month'].astype('category').cat.codes
df['Month'].value_counts()

6    3357
7    2995
5    1884
1    1727
8     549
9     448
0     433
3     431
4     288
2     171
Name: Month, dtype: int64

### Binary Encoding

In [9]:
# Binary Encoding
from sklearn.preprocessing import label_binarize

# From Label to Boolean
df['Revenue'] = label_binarize(df['Revenue'], classes=[False,True])
df['Weekend'] = label_binarize(df['Weekend'], classes=[False,True])

In [10]:
df_raw.tail()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
12325,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0.0,0.0,0.0,0.0,5.0,465.75,0.0,0.021333,0.0,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0.0,0.0,0.0,0.0,6.0,184.25,0.083333,0.086667,0.0,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4.0,75.0,0.0,0.0,15.0,346.0,0.0,0.021053,0.0,0.0,Nov,2,2,3,11,Returning_Visitor,False,False
12329,0.0,0.0,0.0,0.0,3.0,21.25,0.0,0.066667,0.0,0.0,Nov,3,2,1,2,New_Visitor,True,False


## Create Boolean Cols from Cardinal

In [11]:
# Boolean - Visited the type of page
df['Administrative_Visited'] = np.where(df['Administrative'] > 1, 1,0)
df['Informational_Visited'] = np.where(df['Informational'] > 1, 1,0)
df['ProductRelated_Visited'] = np.where(df['ProductRelated'] > 1, 1,0)

In [12]:
df['IsSpecialDate'] = np.where(df['SpecialDay'] > 1, 1,0)

### Intervals

In [13]:
# Administrative
column_name = 'Administrative'
interval_step = 3
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0    8189
 1    2255
 2    1057
 3     483
 4     186
 5      78
 6      20
 7       9
-1       6
Name: Administrative_Bins, dtype: int64

In [14]:
# ProductRelated
column_name = 'ProductRelated'
interval_step = 30
new_column_name = column_name + '_Bins'
interval = range(0, int(max(df[column_name])), interval_step)
max_number_of_intervals =  10

# Create Column
df[new_column_name] = pd.cut(df[column_name], interval, right=False)
df[new_column_name] = df[new_column_name].astype('category').cat.codes
df.loc[(df[new_column_name].astype('category').cat.codes >= max_number_of_intervals),new_column_name] = max_number_of_intervals
df[new_column_name].value_counts()

 0     8235
 1     2329
 2      826
 3      380
 4      201
 5      120
 10      70
 6       53
 7       46
 8       22
-1        1
Name: ProductRelated_Bins, dtype: int64

### Create dummy columns
Drop is manual, because we often want to keep the first column and drop the last one, which is often group of nonlabeled elements ('Others').

In [15]:
df.VisitorType.unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [16]:
df = pd.get_dummies(df, columns=['VisitorType'], drop_first=False)
df = df.drop(['VisitorType_Other'],axis=1)

In [17]:
df = pd.get_dummies(df, columns=['TrafficType'], drop_first=False)
df = df.drop(['TrafficType_20'],axis=1)

In [18]:
df = pd.get_dummies(df, columns=['OperatingSystems'], drop_first=False)
df = df.drop(['OperatingSystems_8'],axis=1)

In [19]:
df = pd.get_dummies(df, columns=['Browser'], drop_first=False)
df = df.drop(['Browser_13'],axis=1)

In [20]:
df = pd.get_dummies(df, columns=['Region'], drop_first=False)
df = df.drop(['Region_9'],axis=1)

In [21]:
df.dtypes.unique()

array([dtype('float64'), dtype('int8'), dtype('int64'), dtype('uint8')],
      dtype=object)

In [22]:
df.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'Weekend', 'Revenue', 'Administrative_Visited', 'Informational_Visited',
       'ProductRelated_Visited', 'IsSpecialDate', 'Administrative_Bins',
       'ProductRelated_Bins', 'VisitorType_New_Visitor',
       'VisitorType_Returning_Visitor', 'TrafficType_1', 'TrafficType_2',
       'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6',
       'TrafficType_7', 'TrafficType_8', 'TrafficType_9', 'TrafficType_10',
       'TrafficType_11', 'TrafficType_12', 'TrafficType_13', 'TrafficType_14',
       'TrafficType_15', 'TrafficType_16', 'TrafficType_17', 'TrafficType_18',
       'TrafficType_19', 'OperatingSystems_1', 'OperatingSystems_2',
       'OperatingSystems_3', 'OperatingSystems_4', 'OperatingSystems_5',
       'OperatingSystems_6', 'OperatingSystems

# Test Train Split

We see that the class ratio is unbalanced more than 5:1 for the customer which haven't generated revenu.  We should use the method of stratification to make sure that classes in both train and test sets are in same ratio.

In [23]:
TARGET_VARIABLE = 'Revenue'
TEST_SIZE = 0.3
df[TARGET_VARIABLE].value_counts()


0    10375
1     1908
Name: Revenue, dtype: int64

In [24]:
X = df.drop(labels=[TARGET_VARIABLE], axis=1)
y = df[TARGET_VARIABLE]

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE
                                                   ,stratify=y)

In [26]:
#print_acc_score(y_test, predictions)

## Choosing Metrics - Evaluation Matrix
### Metrics
- Recall
- Precision
- F1 Score
To gather the evaluation results from different models into one df.

In [27]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, roc_auc_score

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

In [29]:
evaluation_matrix_columns = ['Model',
                             'EvaluationInfo',
                             'AreaUnderCurve',
                             'Accuracy',
                             'False_Precision',
                             'True_Precision',
                             'True_Recall',
                             'True_F1_Score']

evaluation_matrix = pd.DataFrame(columns = evaluation_matrix_columns)
evaluation_matrix

Unnamed: 0,Model,EvaluationInfo,AreaUnderCurve,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score


In [30]:
# Evaluate Model and Add results to the matrix
def evaluate_model(model, y_test, y_pred, evaluation_info=''):
    global evaluation_matrix
    float_precision = 3
    total_accuracy  = round(100 * accuracy_score(y_test, y_pred), float_precision)
    false_precision = round(100 * precision_score(y_test,y_pred, pos_label=0, average='binary'), float_precision)
    true_precision  = round(100 * precision_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
    true_recall     = round(100 * recall_score(y_test,y_pred, pos_label=1, average='binary'), float_precision)
    AUC     = round(roc_auc_score(y_test,y_pred), float_precision)
    true_f1_beta    = round(100 * fbeta_score(y_test, y_pred, beta=1.5), float_precision)
    
    model_evaluation_dict = {'Model':model.__class__.__name__,
                             'EvaluationInfo':evaluation_info,
                             'AreaUnderCurve': AUC,
                             'Accuracy':total_accuracy,
                             'False_Precision':false_precision,
                             'True_Precision':true_precision,
                             'True_Recall':true_recall,
                             'True_F1_Score':true_f1_beta}
    evaluation_matrix = evaluation_matrix.append(model_evaluation_dict, ignore_index=True)

In [31]:
from sklearn.metrics import plot_roc_curve

#logmodel_disp = plot_roc_curve(logmodel, X_test, y_test)
#ax = plt.gca()
#gbk_disp = plot_roc_curve(gbk, X_test, y_test, ax=ax, alpha=0.8)
#logmodel_disp.plot(ax=ax, alpha=0.8)

In [32]:
evaluation_matrix

Unnamed: 0,Model,EvaluationInfo,AreaUnderCurve,Accuracy,False_Precision,True_Precision,True_Recall,True_F1_Score


## TODO - Cross Validation

In [33]:
# Train
from sklearn.ensemble import RandomForestClassifier
ran_forest = RandomForestClassifier(n_estimators=500, min_samples_split=3, random_state=RANDOM_STATE,bootstrap=True)
ran_forest.fit(X_train, y_train)

# Test
y_pred = ran_forest.predict(X_test)
evaluate_model(ran_forest,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      3113
           1       0.79      0.51      0.62       572

    accuracy                           0.90      3685
   macro avg       0.85      0.74      0.78      3685
weighted avg       0.90      0.90      0.89      3685



In [34]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Create Split For CV with same ratios of classes in sets
strat_kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=RANDOM_STATE)

def print_acc_score(y_test, predictions):
    print("Accuracy: {:.2%}".format(accuracy_score(y_test, predictions))) 

def print_cv_score(cv_score):
    print("Accuracy: %0.2f (+/- %0.2f with 95 % confidence)" % (cv_score.mean(), cv_score.std() * 2))
    
    #print("Precision: ",round(precision_score(y_test,y_pred),2),"Recall: ",round(recall_score(y_test,y_pred),2))

In [35]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ran_forest, X, y, 
                            cv=strat_kfold, scoring='recall')
cv_scores

array([0.51570681, 0.56282723, 0.58115183, 0.45144357, 0.51968504])

In [36]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ran_forest, X, y, 
                            cv=strat_kfold, scoring='recall')
cv_scores

array([0.51570681, 0.56282723, 0.58115183, 0.45144357, 0.51968504])

In [37]:
recall_score(y_test,y_pred)

0.5052447552447552

In [38]:
recall_score(y_test,y_pred, pos_label=1, average='binary')

0.5052447552447552

In [39]:
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Accuracy: 0.53 (+/- 0.09)


In [40]:
## TODO Implement Cross validation
# cross_val_score(gaussiannb, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy')

## TODO - Metrics Commentary

Hence the Y variable has a high class imbalance. Hence accuracy will not be a reliable model performance measure.

FN is very critical for this business case because a false negative is a customer who will potentially subscribe for a loan but who has been classified as 'will not subscribe'. Hence the most relevant model performance measure is recall

### TODO - Oversampling / Undersampling

https://www.kaggle.com/saurav9786/ensemble-techniques

# Baseline Model

## Dummy Classifier
- Most basic models that are used as a baseline models for the clasification.

### Most Frequent
- Is taking most frequent class for prediction

In [None]:
from sklearn.dummy import DummyClassifier


dummy_clf_mf = DummyClassifier(strategy="most_frequent")
dummy_clf_mf.fit(X, y)
y_pred = dummy_clf_mf.predict(X_test)

In [None]:
evaluate_model(dummy_clf_mf,y_test,y_pred,'most_frequent')
print(classification_report(y_test, y_pred))

### Stratified
- Keep the ratio of classes 

In [None]:
dummy_clf_st = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
dummy_clf_st.fit(X, y)
y_pred = dummy_clf_st.predict(X_test)

evaluate_model(dummy_clf_st,y_test,y_pred,'stratified')
print(classification_report(y_test, y_pred))

### TODO: Comment Baseline Model

In [None]:
accuracy_score(y_test, y_pred)

# Clasification Models

## Decision Tree Classifier
- (https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# Train
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=RANDOM_STATE)
dtree.fit(X_train, y_train)

# Test
y_pred = dtree.predict(X_test)
evaluate_model(dtree,y_test,y_pred,'gini')
print(classification_report(y_test, y_pred))

In [None]:
# Train
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=RANDOM_STATE)
dtree.fit(X_train, y_train)

# Test
y_pred = dtree.predict(X_test)
evaluate_model(dtree,y_test,y_pred,'entropy')
print(classification_report(y_test, y_pred))

In [None]:
evaluation_matrix.tail(2)

## TODO - Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

## TODO - RF GridSearch

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV

## GaussianNB

In [None]:
# Train
from sklearn.naive_bayes import GaussianNB
gaussiannb= GaussianNB()
gaussiannb.fit(X_train, y_train)

# Test
y_pred = gaussiannb.predict(X_test)
evaluate_model(gaussiannb,y_test,y_pred,'')
print(classification_report(y_test, y_pred))

## Gradient Boosting Classifier

In [None]:
# Train
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier(learning_rate=0.01, n_estimators=500,random_state=RANDOM_STATE)
gbk.fit(X_train, y_train)

# Test
y_pred = gbk.predict(X_test)
evaluate_model(gbk,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

## KNeighbors Classifier

In [None]:
# Train
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),n_estimators=500,
                            max_samples=0.9, max_features=0.9,bootstrap=True,
                            bootstrap_features=False)
bagging.fit(X_train, y_train)

# Test
y_pred = bagging.predict(X_test)
evaluate_model(bagging,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

In [None]:
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train, y_train)
# Test
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

## Bagging Aggregation - GBC

In [None]:
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(GradientBoostingClassifier(),n_estimators=20,
                            max_samples=0.9, max_features=0.9,bootstrap=True,
                            bootstrap_features=False)
bagging.fit(X_train, y_train)

In [None]:
y_pred = bagging.predict(X_test)
evaluate_model(bagging,y_test,y_pred,'n_estimators=50')
print(classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
# Train
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear', C=19.1, penalty='l1',random_state=RANDOM_STATE)
logmodel.fit(X, y)

# Test
y_pred = logmodel.predict(X_test)
evaluate_model(logmodel,y_test,y_pred,'C=19.1, penalty=l1')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

### Predict_proba
Returns
Tarray-like of shape (n_samples, n_classes)
Returns the probability of the sample for each class in the model, where classes are ordered as they are in self.classes_

In [None]:
# Get probabilities for Revenue = 1
y_prob_pred = logmodel.predict_proba(X_test)[:,1]
y_prob_pred

In [None]:
y_pred = y_prob_pred > 0.5

### LG: Probability Threshold Optimization

In [None]:
thresholds = np.arange(0.2, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(logmodel, X, y, cv=10, scoring='roc_auc')
score.mean()

### LG: Hyper Parameter Tuning

In [None]:
# Type of penalty - Lasso(l1) or Ridge(l2)
penalties = ['l1','l2']
C_values = np.linspace(0.2, 20, 10)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Cross-Validation
cross_valid = StratifiedKFold(n_splits=10)

# Hyperparameter Tuning
params = {'penalty': penalties, 'C': C_values}

logmodel2 = LogisticRegression(solver='liblinear')
grid = GridSearchCV(estimator=logmodel2, param_grid=params, scoring='recall', n_jobs=-1, cv=cross_valid)
grid.fit(X,y)

In [None]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

## GridSearch - RF

In [44]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Cross-Validation
cross_valid = StratifiedKFold(n_splits=10)

n_estimators_list = [50,500]
                #, 125, 250, 500]
bootstrap_list = [False,True]
# Hyperparameter Tuning
params = {'n_estimators': n_estimators_list, 'bootstrap': bootstrap_list}

# Grid Search
ran_forest = RandomForestClassifier(random_state=RANDOM_STATE)
grid = GridSearchCV(estimator=ran_forest, param_grid=params, scoring='recall', n_jobs=-1, cv=cross_valid)
grid.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                        

In [45]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'bootstrap': False, 'n_estimators': 500}
0.5198732433177183
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)


In [51]:
grid.cv_results_['params']

[{'bootstrap': False, 'n_estimators': 50},
 {'bootstrap': False, 'n_estimators': 500},
 {'bootstrap': True, 'n_estimators': 50},
 {'bootstrap': True, 'n_estimators': 500}]

In [53]:
grid.cv_results_['rank_test_score']

array([2, 1, 4, 3], dtype=int32)

## Neural Network

### Z - score Scaler - {-1;1}

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scal = sc.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

### Compile NN model

In [None]:
import keras as K
from keras import Sequential
from keras.layers import Dense

In [None]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='tanh', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [None]:
#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [None]:
#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

In [None]:
eval_model = classifier.evaluate(X_train, y_train)
eval_model

In [None]:
y_prod_pred = classifier.predict(X_test)

In [None]:
thresholds = np.arange(0.15, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

## NN V2

### MinMax

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scal = scaler.fit_transform(df.drop(labels=['Revenue'], axis=1))
X_scal
X_train, X_test, y_train, y_test = train_test_split(X_scal, y, test_size=0.3, random_state=3)

In [None]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal', input_dim=X_scal.shape[1]))
#Second  Hidden Layer
classifier.add(Dense(12, activation='relu', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))


#Compiling the neural network
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

#Fitting the data to the training dataset
classifier.fit(X_train,y_train, batch_size=10, epochs=100)

In [None]:
y_prod_pred = classifier.predict(X_test)

In [None]:
thresholds = np.arange(0.15, 0.9, 0.05)
for threshold in thresholds:
    y_pred = y_prob_pred > threshold
    print(f'LG with threshold: {threshold}')
    #print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

In [None]:
# Train
from sklearn.ensemble import RandomForestClassifier
ran_forest = RandomForestClassifier(n_estimators=500, min_samples_split=3, random_state=RANDOM_STATE,
                                   bootstrap=True,)
ran_forest.fit(X_train, y_train)

# Test
y_pred = ran_forest.predict(X_test)
evaluate_model(ran_forest,y_test,y_pred,'n_estimators=500')
print(classification_report(y_test, y_pred))

In [None]:
## DTC