In [1]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

from statistics import mean 

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
import pickle
# Importing the required packages 

In [3]:
pickle_in = open("X_train_rusb.pickle","rb")
X_train = pickle.load(pickle_in)

pickle_in = open("y_train_rusb.pickle","rb")
y_train = pickle.load(pickle_in)

pickle_in = open("X_test_rusb.pickle","rb")
X_test = pickle.load(pickle_in)

pickle_in = open("y_test_rusb.pickle","rb")
y_test = pickle.load(pickle_in)


In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
scaler = MinMaxScaler()

In [6]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [7]:

from sklearn import preprocessing
from scipy.io.arff import loadarff
from sklearn.ensemble import RandomForestClassifier
import numpy as np

## XGBoost Classifier

In [8]:
import xgboost as xgb
boost = xgb.XGBClassifier(max_depth=9,
                          subsample=0.9,
                          objective='multi:softmax',
                          num_class = 3,
                          min_child_weight=2,
                          colsample_bytree=0.7,
                          n_estimators=100,
                          learning_rate=0.08,
                          n_jobs = -1)
boost.fit(X_train,y_train)
boost_pred = boost.predict(X_test)


In [9]:
from sklearn.metrics import accuracy_score 
accuracy_score(y_test,boost_pred)

0.9961909608722034

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,boost_pred))  
print(classification_report(y_test,boost_pred))  
print(accuracy_score(y_test, boost_pred))


[[127217    565]
 [   408 127255]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    127782
           1       1.00      1.00      1.00    127663

    accuracy                           1.00    255445
   macro avg       1.00      1.00      1.00    255445
weighted avg       1.00      1.00      1.00    255445

0.9961909608722034


## Decision Tree

In [11]:
from sklearn import tree
tree_model = tree.DecisionTreeClassifier(random_state=10)
tree_model.fit(X_train,y_train)
y_pred = tree_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)


In [12]:
print("Accuracy score for Decision Tree:",accuracy_score(y_test,y_pred))
print("Depth of the Original Tree :" + str(tree_model.get_depth()))
print("No of leaves in the Original Tree :" + str(tree_model.get_n_leaves()))
print(cm)

Accuracy score for Decision Tree: 0.8316506488676623
Depth of the Original Tree :38
No of leaves in the Original Tree :1102
[[127436    346]
 [ 42658  85005]]


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

#### Using GridSearchCV to find the best parameters

In [14]:
tree_model = DecisionTreeClassifier(random_state=10)
params = {'splitter': ['best', 'random'], 'max_depth': [10,20,30], 'criterion': ['gini', 'entropy']}
grid_search_cv = GridSearchCV(tree_model, params, cv= 5)
grid_search_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=10),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 30],
                         'splitter': ['best', 'random']})

In [15]:
grid_search_cv.best_score_

0.9988440313595575

In [16]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=10)

In [17]:
tree_model = tree.DecisionTreeClassifier(random_state=10, criterion='entropy', max_depth=20, )

In [18]:
tree_model.fit(X_train,y_train)
y_pred = tree_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [19]:
print("Accuracy score for Decision Tree:",accuracy_score(y_test,y_pred))
print("Depth of the Original Tree :" + str(tree_model.get_depth()))
print("No of leaves in the Original Tree :" + str(tree_model.get_n_leaves()))
print(cm)

Accuracy score for Decision Tree: 0.9913719195913014
Depth of the Original Tree :20
No of leaves in the Original Tree :515
[[126752   1030]
 [  1174 126489]]


In [20]:
print(classification_report(y_test,y_pred))  

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    127782
           1       0.99      0.99      0.99    127663

    accuracy                           0.99    255445
   macro avg       0.99      0.99      0.99    255445
weighted avg       0.99      0.99      0.99    255445



### Hyperparameter Tuning : Experiment with optimizing precision and recall 

In [30]:
params = [{'splitter': ['best', 'random'], 'max_depth': [10,20,25,30], 'criterion': ['gini', 'entropy']}]
tree_model = DecisionTreeClassifier(random_state=10)
print('Tuning hyper-parameters for Precision')
grid_search = GridSearchCV(estimator = tree_model,
                           param_grid = params,
                           scoring='precision_weighted',
                           cv = 5,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
print("Best parameters set")
print(grid_search.best_params_)
print()
print(classification_report(y_test, y_pred))
print("Grid scores on development set:")
print()
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
print()

Tuning hyper-parameters for Precision
Best parameters set
{'criterion': 'entropy', 'max_depth': 20, 'splitter': 'best'}

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    127782
           1       0.99      0.99      0.99    127663

    accuracy                           0.99    255445
   macro avg       0.99      0.99      0.99    255445
weighted avg       0.99      0.99      0.99    255445

Grid scores on development set:

0.994 (+/-0.001) for {'criterion': 'gini', 'max_depth': 10, 'splitter': 'best'}
0.963 (+/-0.015) for {'criterion': 'gini', 'max_depth': 10, 'splitter': 'random'}
0.999 (+/-0.000) for {'criterion': 'gini', 'max_depth': 20, 'splitter': 'best'}
0.995 (+/-0.004) for {'criterion': 'gini', 'max_depth': 20, 'splitter': 'random'}
0.999 (+/-0.000) for {'criterion': 'gini', 'max_depth': 25, 'splitter': 'best'}
0.997 (+/-0.001) for {'criterion': 'gini', 'max_depth': 25, 'splitter': 'random'}
0.998 (+/-0.000) for {'criterion'

In [31]:
grid_search.best_score_

0.9988445133410903

In [32]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[126752   1030]
 [  1174 126489]]
