In [1]:
import numpy as np
import pandas
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import imblearn

In [2]:
#read the file into a dataframe

data = pandas.read_csv('DataSetForPhishingVSBenignUrl.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier

#Set up features and target sets
X = data.drop('URL_Type_obf_Type', axis=1)
y = data['URL_Type_obf_Type']

#Impute missing values
X = np.where(np.isinf(X), 1e10, X)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

#Split train and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Train model
depths = [1, 3, 6, 9, 12, 15, 18]
criteria = ['gini', 'entropy']

for criterion in criteria:
    for depth in depths:
        dtc = DecisionTreeClassifier(max_depth=depth, criterion=criterion)
        ada = AdaBoostClassifier(dtc)
        ada.fit(X_train, y_train)
        y_pred = ada.predict(X_test)

        #Accuracy
        print(f'Tree Depth: {depth}, Criteria Measure: {criterion}')
        print('Misclassification examples: %d' % (y_test != y_pred).sum())
        print('Train accuracy: %.3f' % ada.score(X_train, y_train))
        print('Test accuracy: %.3f' % ada.score(X_test, y_test))
        print()

Tree Depth: 1, Criteria Measure: gini
Misclassification examples: 2465
Train accuracy: 0.669
Test accuracy: 0.664

Tree Depth: 3, Criteria Measure: gini
Misclassification examples: 2214
Train accuracy: 0.710
Test accuracy: 0.698

Tree Depth: 6, Criteria Measure: gini
Misclassification examples: 641
Train accuracy: 0.935
Test accuracy: 0.913

Tree Depth: 9, Criteria Measure: gini
Misclassification examples: 201
Train accuracy: 0.997
Test accuracy: 0.973

Tree Depth: 12, Criteria Measure: gini
Misclassification examples: 153
Train accuracy: 1.000
Test accuracy: 0.979

Tree Depth: 15, Criteria Measure: gini
Misclassification examples: 145
Train accuracy: 1.000
Test accuracy: 0.980

Tree Depth: 18, Criteria Measure: gini
Misclassification examples: 130
Train accuracy: 1.000
Test accuracy: 0.982

Tree Depth: 1, Criteria Measure: entropy
Misclassification examples: 2782
Train accuracy: 0.635
Test accuracy: 0.621

Tree Depth: 3, Criteria Measure: entropy
Misclassification examples: 2009
Train

In [None]:
# The results from AdaBoost is much better compared to just Deision Tree Classifier from last week. At tree depth 6, DTC had around 75% testing accuracy.
# With Adaboost, the accuracy at depth 6 is 90%+. The improvement in result is significant. At depth 9 onward, the accuracy tops out to 97%

# However, this comes at a cost of runtime and resources. Adaboost took significantly longer to fit compared to DTC. 
# This is due to the way Adaboost iterates several shallow trees as a part of this learning process. 

#Overall Adaboost is a much more accuracy classifier compared to DTC. The downside is Adaboost use more resources. 