In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import os

import warnings
warnings.filterwarnings("ignore")


df=pd.read_csv(r'C:\Users\isarachchand\\Documents\git\apf\datasets\cyber_risk\data\malign_websites_dataset.csv')


In [2]:
#Prep the data

100 * df['Type'].value_counts()/len(df)   #variable imbalance

#unique categories for each categorical column

for i in df.select_dtypes(include='object').columns:
    print(f"{i} -> {df[i].nunique()}")
    
df['CHARSET'].value_counts()

# Top 5 categories kept

def CHARSET_CLEANER(x):
    if x not in ['UTF-8','ISO-8859-1','utf-8','us-ascii','iso-8859-1']:
        return "OTHERS"
    else:
        return x

df['CHARSET'] = df['CHARSET'].apply(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()

# Top 5 categories kept

def SERVER_CLEANER(x):
    if x not in ['Apache','nginx','None','Microsoft-HTTPAPI/2.0','cloudflare-nginx']:
        return "OTHERS"
    else:
        return x
    
df['SERVER'] = df['SERVER'].apply(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts()[:11]

def STATE_CLEANER(x):
    if x not in ['CA','None','NY','WA','Barcelona','FL']:
        return "OTHERS"
    else:
        return x

df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].apply(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()

def DATE_CLEANER(x):
    if x == 'None':
        return "Absent"
    else:
        return "Present"
df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(DATE_CLEANER)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(DATE_CLEANER)

df.drop(['URL','WHOIS_COUNTRY','CONTENT_LENGTH'],axis=1,inplace=True)
# change null values to 0

df = df.fillna(0)

le = LabelEncoder()
for column in ['CHARSET','SERVER', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']:
    df[column] = le.fit_transform(df[column].astype(str))



URL -> 1781
CHARSET -> 9
SERVER -> 239
WHOIS_COUNTRY -> 49
WHOIS_STATEPRO -> 182
WHOIS_REGDATE -> 891
WHOIS_UPDATED_DATE -> 594


In [3]:
models_list = ['Random Forest', 'XGB Classifier', 'K Nearest Neighbour', 'Decision Tree']
accuracy_list = []
train_accuracy_list = []
train_acc_list = []
recall_list = []
precision_list = []
f1_score_list = []


In [4]:
# Split data
y = df['Type']
X = df.drop('Type', axis=1)

# Split data
y = df['Type']
X = df.drop('Type', axis=1)

best_index = 1
partition_count = 5

# get train test split
partition_size = math.ceil(len(X) / partition_count)
test_start = best_index * partition_size
test_end = test_start + partition_size
test_x = X[test_start:test_end]
test_y = y[test_start:test_end]
train_x = pd.concat([X[:test_start],X[test_end:]])
train_y =  pd.concat([y[:test_start], y[test_end:]])


## Hyperparameter Optimisation with Random Forest

In [5]:
space={'max_depth': hp.quniform("max_depth", 10, 180, 1), # 120, 180
        'min_sample_leaf' : hp.uniform('min_samples_leaf',1,5),
        'min_samples_split':hp.uniform('min_samples_split',2,6),
        'n_estimators': hp.uniform('n_estimators', 200, 900),
       'max_features': hp.uniform('max_features', 1, 5)
    }

def objective(space):
    
    clf = RandomForestClassifier(
        n_estimators=int(space['n_estimators']), max_depth=int(space['max_depth']),
        min_samples_split=int(space['min_samples_split']),
        min_samples_leaf=int(space['min_sample_leaf']),
        max_features=int(space['max_features']), random_state=2)

    clf.fit(train_x, train_y)
    pred = clf.predict(test_x)
    
    #training accuracy
    y_train_pred = clf.predict(train_x)
    
    #calculate metrics
    f1s = f1_score(pred, test_y)

    return {'loss': -f1s, 'status': STATUS_OK}


trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print("Best hyperparameters are: {}".format(best_hyperparams))

100%|██████████| 100/100 [01:18<00:00,  1.27trial/s, best loss: -0.7647058823529412]
Best hyperparameters are: {'max_depth': 111.0, 'max_features': 4.387342497187583, 'min_samples_leaf': 2.0770919480139733, 'min_samples_split': 2.3553702014514015, 'n_estimators': 413.4884362317157}


In [6]:
# best model
best_model = RandomForestClassifier(n_estimators=int(best_hyperparams['n_estimators']),
                                    max_depth=int(best_hyperparams['max_depth']),
                                    min_samples_split=int(best_hyperparams['min_samples_split']),
                                    min_samples_leaf=int(best_hyperparams['min_samples_leaf']),
                                    max_features=int(best_hyperparams['max_features']), random_state=2)


best_model.fit(train_x, train_y)

y_train_pred = best_model.predict(train_x)
y_test_pred = best_model.predict(test_x)



train_accuracy = accuracy_score(y_train_pred, train_y)
accuracy = accuracy_score(y_test_pred, test_y)
recall = recall_score(test_y, y_test_pred, average='macro')
precision = precision_score(test_y, y_test_pred, average='macro')
f1score = f1_score(test_y, y_test_pred, average='macro')

accuracy_list.append(accuracy)
recall_list.append(recall)
precision_list.append(precision)
f1_score_list.append(f1score)
train_acc_list.append(train_accuracy)

print("Accuracy : ", accuracy)
print("Training accuracy", train_accuracy)
print("Recall : ", recall)
print("Precision : ", precision)
print("F1 Score : ", f1score)
    

Accuracy :  0.9551820728291317
Training accuracy 0.9915730337078652
Recall :  0.8154908922506947
Precision :  0.9587542087542087
F1 Score :  0.8699690402476781


## Hyperparameter optimisation XGBClassifier

In [7]:

space2={'max_depth': hp.quniform("max_depth", 10, 180, 1), # 120, 180
        'gamma': hp.uniform ('gamma', 1,25),
        #'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        #'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        #'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        #'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.uniform('n_estimators', 200, 900), # 100, 200, 300
        #"n_estimators": hp.quniform('n_estimators', 100,200,1),
        #'seed': 0
    }


def objective2(space2):
    clf_model = xgb.XGBClassifier(
        n_estimators=int(space2['n_estimators']), gamma=space2['gamma'], max_depth=int(space2['max_depth']), random_state=2)

    evaluation = [(train_x, train_y), (test_x, test_y)]

    clf_model.fit(train_x, train_y,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    pred = clf_model.predict(test_x)
    f1s = f1_score(pred, test_y)

    return {'loss': -f1s, 'status': STATUS_OK}


trials2 = Trials()

best_hyperparams2 = fmin(fn = objective2,
                        space = space2,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials2)

print("The best hyperparameters are : ","\n")
print(best_hyperparams2)


100%|██████████| 100/100 [00:08<00:00, 11.89trial/s, best loss: -0.8450704225352113]
The best hyperparameters are :  

{'gamma': 4.082224695979065, 'max_depth': 142.0, 'n_estimators': 893.4900161366173}


In [8]:
# best model
best_model = xgb.XGBClassifier(n_estimators=int(best_hyperparams2['n_estimators']),
                           gamma=best_hyperparams2['gamma'],
                           max_depth=int(best_hyperparams2['max_depth']), random_state=2)


best_model.fit(train_x, train_y)

y_train_pred = best_model.predict(train_x)
y_test_pred = best_model.predict(test_x)



train_accuracy = accuracy_score(y_train_pred, train_y)
accuracy = accuracy_score(y_test_pred, test_y)
recall = recall_score(test_y, y_test_pred, average='macro')
precision = precision_score(test_y, y_test_pred, average='macro')
f1score = f1_score(test_y, y_test_pred, average='macro')

accuracy_list.append(accuracy)
recall_list.append(recall)
precision_list.append(precision)
f1_score_list.append(f1score)
train_acc_list.append(train_accuracy)

print("Accuracy : ", accuracy)
print("Train accuracy: ", train_accuracy)
print("Recall : ", recall)
print("Precision : ", precision)
print("F1 Score : ", f1score)
    

Accuracy :  0.957983193277311
Train accuracy:  0.9824438202247191
Recall :  0.8489117011423278
Precision :  0.9346634615384615
F1 Score :  0.8855598059538821


## K Nearest Neighbour

In [9]:
from sklearn.neighbors import KNeighborsClassifier

spaceknn={'n_neighbors': hp.uniform('n_neighbors', 1,100)}

def objective(spaceknn):
    
    clf = KNeighborsClassifier(n_neighbors=int(spaceknn['n_neighbors']))
        
    clf.fit(train_x, train_y)
    pred = clf.predict(test_x)

    f1s = f1_score(pred, test_y)

    return {'loss': -f1s, 'status': STATUS_OK}


trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = spaceknn,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print("Best hyperparameters are: {}".format(best_hyperparams))


100%|██████████| 100/100 [00:02<00:00, 40.12trial/s, best loss: -0.676056338028169]
Best hyperparameters are: {'n_neighbors': 1.0541818310979505}


In [10]:
# best model
best_model = KNeighborsClassifier(n_neighbors=int(best_hyperparams['n_neighbors']))


best_model.fit(train_x, train_y)

y_train_pred = best_model.predict(train_x)
y_test_pred = best_model.predict(test_x)

train_accuracy = accuracy_score(y_train_pred, train_y)
accuracy = accuracy_score(y_test_pred, test_y)
recall = recall_score(test_y, y_test_pred, average='macro')
precision = precision_score(test_y, y_test_pred, average='macro')
f1score = f1_score(test_y, y_test_pred, average='macro')

accuracy_list.append(accuracy)
recall_list.append(recall)
precision_list.append(precision)
f1_score_list.append(f1score)
train_acc_list.append(train_accuracy)

print("Accuracy : ", accuracy)
print("Train accuracy: ", train_accuracy)
print("Recall : ", recall)
print("Precision : ", precision)
print("F1 Score : ", f1score)
    

Accuracy :  0.9355742296918768
Train accuracy:  1.0
Recall :  0.7831892559431923
Precision :  0.8740061162079511
F1 Score :  0.820143254550632


## Decision Trees

In [11]:
from sklearn.tree import DecisionTreeClassifier

space4dt = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,5)),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'min_samples_leaf' : hp.uniform('min_samples_leaf',1,5),
    'min_samples_split':hp.uniform('min_samples_split',2,6),
    
}

def objective(space4dt):
    clf = DecisionTreeClassifier(
        max_depth=int(space4dt['max_depth']),
        max_features=space4dt['max_features'],
        min_samples_split=int(space4dt['min_samples_split']),
        min_samples_leaf=int(space4dt['min_samples_leaf']),
        criterion=space4dt['criterion'],
        random_state=2)

    clf.fit(train_x, train_y)
    pred = clf.predict(test_x)

    f1s = f1_score(test_y, pred, average='macro')

    return {'loss': -f1s, 'status': STATUS_OK}


trials = Trials()
best_hyperparams = fmin(fn = objective,
                        space = space4dt,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print("Best hyperparameters are: {}".format(best_hyperparams))

100%|██████████| 100/100 [00:01<00:00, 96.89trial/s, best loss: -0.8848572810836962]
Best hyperparameters are: {'criterion': 1, 'max_depth': 9, 'max_features': 3, 'min_samples_leaf': 1.476267608793267, 'min_samples_split': 5.3783066173675005}


In [12]:
# best model

best_model = DecisionTreeClassifier(max_depth=int(best_hyperparams['max_depth']),
                                    max_features=best_hyperparams['max_features'],
                                    min_samples_split=int(best_hyperparams['min_samples_split']),
                                    min_samples_leaf=int(best_hyperparams['min_samples_leaf']),
                                    random_state=2)
#criterion=best_hyperparams['criterion'],

best_model.fit(train_x, train_y)

y_train_pred = best_model.predict(train_x)
y_test_pred = best_model.predict(test_x)

train_accuracy = accuracy_score(y_train_pred, train_y)
accuracy = accuracy_score(y_test_pred, test_y)
recall = recall_score(test_y, y_test_pred, average='macro')
precision = precision_score(test_y, y_test_pred, average='macro')
f1score = f1_score(test_y, y_test_pred, average='macro')

accuracy_list.append(accuracy)
recall_list.append(recall)
precision_list.append(precision)
f1_score_list.append(f1score)
train_acc_list.append(train_accuracy)

print("Accuracy : ", accuracy)
print("Train accuracy: ", train_accuracy)
print("Recall : ", recall)
print("Precision : ", precision)
print("F1 Score : ", f1score)
    

Accuracy :  0.9215686274509803
Train accuracy:  0.9726123595505618
Recall :  0.7540521765977153
Precision :  0.8256976053829408
F1 Score :  0.7837487019730011


## SVM

In [13]:
# from sklearn.svm import SVC

# space4svm = {'C': hp.uniform('C', 0, 20),
#     'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
#     'gamma': hp.uniform('gamma', 0, 20),
#     }

# def objective(space4svm):
    
#     clf = SVC(
#         C=space4svm['C'], kernel=space4svm['kernel'],
#         gamma=int(space4svm['gamma']),
#         random_state=2)

#     clf.fit(train_x, train_y)
#     pred = clf.predict(test_x)

#     accuracy = recall_score(test_y, pred, average='macro')
    
#     print("Accuracy: ", accuracy)
#     print("\n")
#     return {'loss': -accuracy, 'status': STATUS_OK}


# trials = Trials()

# best_hyperparams = fmin(fn = objective,
#                         space = space4svm,
#                         algo = tpe.suggest,
#                         max_evals = 50,
#                         trials = trials)

# print("Best hyperparameters are: {}".format(best_hyperparams))
# best_acc = min([t['result']['loss'] for t in trials.trials])
# accuracy_list.append(-best_acc)

## Logistic Regression

In [14]:
# from sklearn.linear_model import LogisticRegression


# space4lr = {
#     'penalty': hp.choice('penalty', ['none', 'l1', 'l2', 'elasticnet']),
#     'solver': hp.choice('solver', ['liblinear', 'none']),
#     'C': hp.uniform('C', 0, 20)
    
# }

# def objective(space4tlr):

#     clf = LogisticRegression(
#         penalty=space4lr['penalty'],
#         solver=space4lr['solver'], 
#         C=space4lr['C'],
#         random_state=2)

#     clf.fit(train_x, train_y)
#     pred = clf.predict(test_x)

#     accuracy = recall_score(test_y, pred, average='macro')
    
#     print("Accuracy: ", accuracy)
#     print("\n")
#     return {'loss': -accuracy, 'status': STATUS_OK}


# trials = Trials()
# best_hyperparams = fmin(fn = objective,
#                         space = space4lr,
#                         algo = tpe.suggest,
#                         max_evals = 200,
#                         trials = trials)

# print("Best hyperparameters are: {}".format(best_hyperparams))

# best_acc = min([t['result']['loss'] for t in trials.trials])
# accuracy_list.append(-best_acc)

In [15]:
print(models_list)
print(train_acc_list)
print(accuracy_list)
print(recall_list)
print(precision_list)
print(f1_score_list)

['Random Forest', 'XGB Classifier', 'K Nearest Neighbour', 'Decision Tree']
[0.9915730337078652, 0.9824438202247191, 1.0, 0.9726123595505618]
[0.9551820728291317, 0.957983193277311, 0.9355742296918768, 0.9215686274509803]
[0.8154908922506947, 0.8489117011423278, 0.7831892559431923, 0.7540521765977153]
[0.9587542087542087, 0.9346634615384615, 0.8740061162079511, 0.8256976053829408]
[0.8699690402476781, 0.8855598059538821, 0.820143254550632, 0.7837487019730011]


In [16]:
# Data frame with accuracies of models

model_scores = pd.DataFrame({
    'Model Name' : models_list,
    'Accuracy' : accuracy_list, 
    'Training accuracy': train_acc_list,
    'Recall' : recall_list, 
    'Precision' : precision_list, 
    'F1 Score' : f1_score_list
})

file_name = r'C:\Users\isarachchand\Documents\git\apf\output\cyber_risk\model_accuracies.csv'

model_scores.to_csv(file_name, encoding='utf-8', index=False)