In [1]:
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")


# Load the raw dataset into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, and it contains
# a doubled backslash after the user name; both are kept exactly as in the original.
df = pd.read_csv(
    r'C:\Users\isarachchand\\Documents\git\apf\datasets\cyber_risk\data\malign_websites_dataset.csv'
)


In [2]:
#Prep the data

# Percentage share of each target class (shows the class imbalance). As a bare
# expression in the middle of a cell its result is computed but not displayed.
100 * df['Type'].value_counts() / len(df)

# Number of distinct values in every object-dtype column.
object_columns = df.select_dtypes(include='object').columns
for col_name in object_columns:
    print(f"{col_name} -> {df[col_name].nunique()}")

# Frequency of each raw CHARSET value (bare expression, see note above).
df['CHARSET'].value_counts()

# Top 5 categories kept

def CHARSET_CLEANER(x):
    """Collapse rare CHARSET values into a single "OTHERS" bucket.

    The five retained labels are matched exactly, i.e. case-sensitively, so
    'UTF-8' and 'utf-8' (and 'ISO-8859-1' / 'iso-8859-1') stay separate
    categories. Any other value is replaced by "OTHERS".

    Parameters
    ----------
    x : object
        A single value of the CHARSET column.

    Returns
    -------
    object
        ``x`` unchanged when it is one of the retained labels, else "OTHERS".
    """
    kept_charsets = ('UTF-8', 'ISO-8859-1', 'utf-8', 'us-ascii', 'iso-8859-1')
    return x if x in kept_charsets else "OTHERS"

# Bucket rare charsets element-wise, then look at the resulting distribution
# and at the raw SERVER distribution (bare expressions, not displayed mid-cell).
df['CHARSET'] = df['CHARSET'].map(CHARSET_CLEANER)
df['CHARSET'].value_counts()
df['SERVER'].value_counts()

# Top 5 categories kept

def SERVER_CLEANER(x):
    """Collapse rare SERVER values into a single "OTHERS" bucket.

    Five server labels are retained (exact, case-sensitive match); note that
    the literal string 'None' is one of them. Any other value is replaced by
    "OTHERS".

    Parameters
    ----------
    x : object
        A single value of the SERVER column.

    Returns
    -------
    object
        ``x`` unchanged when it is one of the retained labels, else "OTHERS".
    """
    kept_servers = ('Apache', 'nginx', 'None', 'Microsoft-HTTPAPI/2.0', 'cloudflare-nginx')
    return x if x in kept_servers else "OTHERS"
    
# Bucket rare servers element-wise, then look at the resulting distribution and
# at the 11 most frequent raw WHOIS_STATEPRO values (bare expressions, not displayed).
df['SERVER'] = df['SERVER'].map(SERVER_CLEANER)
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts().head(11)

def STATE_CLEANER(x):
    """Collapse rare WHOIS_STATEPRO values into a single "OTHERS" bucket.

    Six labels are retained (exact, case-sensitive match), including the
    literal string 'None'. Any other value is replaced by "OTHERS".

    Parameters
    ----------
    x : object
        A single value of the WHOIS_STATEPRO column.

    Returns
    -------
    object
        ``x`` unchanged when it is one of the retained labels, else "OTHERS".
    """
    kept_states = ('CA', 'None', 'NY', 'WA', 'Barcelona', 'FL')
    return x if x in kept_states else "OTHERS"

# Bucket rare states/provinces element-wise, then look at the resulting
# distribution (bare expression, not displayed mid-cell).
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].map(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()

def DATE_CLEANER(x):
    """Reduce a WHOIS date value to a presence flag.

    Parameters
    ----------
    x : object
        A single value of a WHOIS date column.

    Returns
    -------
    str
        "Absent" when the value is the placeholder string 'None' or a missing
        value (None / NaN / NaT); "Present" for anything else.
    """
    # A genuinely missing cell carries no date either, so it must not be
    # reported as "Present" just because it is not the literal string 'None'.
    if x == 'None' or pd.isna(x):
        return "Absent"
    return "Present"
# Reduce both WHOIS date columns to an "Absent"/"Present" flag.
for date_column in ('WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'):
    df[date_column] = df[date_column].apply(DATE_CLEANER)

# Discard the columns that are not carried into the feature matrix, then
# replace every remaining null by 0.
df = df.drop(columns=['URL', 'WHOIS_COUNTRY', 'CONTENT_LENGTH']).fillna(0)

# Integer-encode the categorical columns. The same encoder object is refitted
# for each column, so afterwards it only remembers the last column's classes.
le = LabelEncoder()
for column in ['CHARSET', 'SERVER', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']:
    df[column] = le.fit_transform(df[column].astype(str))



URL -> 1781
CHARSET -> 9
SERVER -> 239
WHOIS_COUNTRY -> 49
WHOIS_STATEPRO -> 182
WHOIS_REGDATE -> 891
WHOIS_UPDATED_DATE -> 594


In [3]:
# Parallel accumulators: each model cell appends its name to `models_list`
# and its best score to `accuracy_list`.
models_list, accuracy_list = [], []


In [4]:
# Split data
# Separate the target column from the feature matrix.
X = df.drop(columns='Type')
y = df['Type']

# The rows are cut, in their existing order and without shuffling, into
# `partition_count` contiguous folds; fold number `best_index` (0-based) is
# held out as the test set and the remaining rows form the training set.
partition_count = 5
best_index = 1

partition_size = math.ceil(len(X) / partition_count)
test_start = best_index * partition_size
test_end = test_start + partition_size

# Positional slicing, made explicit with .iloc.
test_x = X.iloc[test_start:test_end]
test_y = y.iloc[test_start:test_end]
train_x = pd.concat([X.iloc[:test_start], X.iloc[test_end:]])
train_y = pd.concat([y.iloc[:test_start], y.iloc[test_end:]])

## Hyperparameter Optimisation with Random Forest

In [14]:
models_list.append("Random Forest")

# Random-forest search space. The continuous draws are truncated to int inside
# `objective`. Note that the dict key 'min_sample_leaf' differs from its
# hyperopt label 'min_samples_leaf'; `objective` reads the dict key.
space = {
    'max_depth': hp.quniform("max_depth", 10, 180, 1),  # 120, 180
    'min_sample_leaf': hp.uniform('min_samples_leaf', 1, 5),
    'min_samples_split': hp.uniform('min_samples_split', 2, 6),
    'n_estimators': hp.uniform('n_estimators', 200, 900),  # 100, 200, 300
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
}

def objective(space):
    """Hyperopt objective for the random forest.

    Fits a RandomForestClassifier with the sampled hyperparameters on
    ``train_x``/``train_y``, predicts ``test_x`` and scores the prediction
    against ``test_y`` with macro-averaged recall. The score is printed under
    the label "Accuracy" although it is a recall value.

    Parameters
    ----------
    space : dict
        Sampled values under the keys 'n_estimators', 'max_depth',
        'min_samples_split', 'min_sample_leaf' and 'max_features'. The four
        numeric entries are truncated with ``int``.

    Returns
    -------
    dict
        ``{'loss': -macro_recall, 'status': STATUS_OK}``; the score is negated
        so that a smaller loss means a higher recall.
    """
    forest_params = {
        'n_estimators': int(space['n_estimators']),
        'max_depth': int(space['max_depth']),
        'min_samples_split': int(space['min_samples_split']),
        'min_samples_leaf': int(space['min_sample_leaf']),
        'max_features': space['max_features'],
        'random_state': 2,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(train_x, train_y)

    macro_recall = recall_score(test_y, forest.predict(test_x), average='macro')

    print("Accuracy: ", macro_recall)
    print("\n")
    return {'loss': -macro_recall, 'status': STATUS_OK}


# Run the TPE search over the random-forest space; every evaluation is
# recorded in `trials`.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 200,
                        trials = trials)

# fmin reports hp.choice parameters as the index of the chosen option (here
# max_features would print as 0/1). space_eval maps the result back onto the
# search space so the printed values are the ones the model actually received.
print("Best hyperparameters are: {}".format(space_eval(space, best_hyperparams)))

# Losses are negated macro recall, so the smallest loss is the best trial and
# negating it again yields the best score.
best_acc = min(t['result']['loss'] for t in trials.trials)
accuracy_list.append(-best_acc)

Accuracy:                                              
0.9523809523809523                                     
Accuracy:                                                                         
0.9495798319327731                                                                
Accuracy:                                                                         
0.9495798319327731                                                                
Accuracy:                                                                         
0.9495798319327731                                                                
Accuracy:                                                                         
0.9495798319327731                                                                
Accuracy:                                                                         
0.9495798319327731                                                                
Accuracy:                                                 

Accuracy:                                                                          
0.9495798319327731                                                                 
Accuracy:                                                                          
0.9495798319327731                                                                 
Accuracy:                                                                          
0.9495798319327731                                                                 
Accuracy:                                                                          
0.9467787114845938                                                                 
Accuracy:                                                                          
0.9495798319327731                                                                 
Accuracy:                                                                          
0.9495798319327731                                                          

0.9495798319327731                                                                 
Accuracy:                                                                          
0.9551820728291317                                                                 
Accuracy:                                                                          
0.9495798319327731                                                                 
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9467787114845938                                                                  
Accuracy:                                                             

0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                                           
0.9523809523809523                                                                  
Accuracy:                                                        

0.9523809523809523                                                                  
Accuracy:                                                                           
0.9495798319327731                                                                  
Accuracy:                                                                           
0.9551820728291317                                                                  
Accuracy:                                                                           
0.9495798319327731                                                                  
Accuracy:                                                                           
0.9495798319327731                                                                  
Accuracy:                                                                           
0.9495798319327731                                                                  
Accuracy:                                                        

## Hyperparameter optimisation XGBClassifier

In [7]:
models_list.append("XGBClassifier")

# XGBoost search space: only tree depth, gamma and the number of trees are
# tuned. Candidates that are currently disabled: reg_alpha, reg_lambda,
# colsample_bytree, min_child_weight, an alternative n_estimators range
# (quniform 100-200) and a fixed seed.
space2 = {
    'max_depth': hp.quniform("max_depth", 10, 180, 1),  # 120, 180
    'gamma': hp.uniform('gamma', 1, 25),
    'n_estimators': hp.uniform('n_estimators', 200, 900),  # 100, 200, 300
}


def objective2(space2):
    """Hyperopt objective for the XGBoost classifier.

    Fits an XGBClassifier with the sampled hyperparameters on
    ``train_x``/``train_y`` with AUC-based early stopping, predicts ``test_x``
    and scores the prediction against ``test_y`` with macro-averaged recall.

    Parameters
    ----------
    space2 : dict
        Sampled values under the keys 'n_estimators', 'gamma' and 'max_depth'.
        'n_estimators' and 'max_depth' are truncated with ``int``.

    Returns
    -------
    dict
        ``{'loss': -macro_recall, 'status': STATUS_OK}``; the score is negated
        so that a smaller loss means a higher recall.
    """
    n_estimators = int(space2['n_estimators'])
    max_depth = int(space2['max_depth'])

    # eval_metric and early_stopping_rounds are configured on the estimator:
    # passing them to fit() is deprecated since XGBoost 1.6 and rejected by
    # recent releases.
    clf_model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        gamma=space2['gamma'],
        max_depth=max_depth,
        random_state=2,
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    print(n_estimators)
    print(max_depth)

    # NOTE(review): early stopping monitors the last eval_set entry, which is
    # the test fold that is also used for scoring below.
    evaluation = [(train_x, train_y), (test_x, test_y)]

    clf_model.fit(train_x, train_y, eval_set=evaluation, verbose=False)

    pred2 = clf_model.predict(test_x)
    accuracy2 = recall_score(test_y, pred2, average='macro')

    print("SCORE:", accuracy2)
    return {'loss': -accuracy2, 'status': STATUS_OK}


# Run the TPE search over the XGBoost space; every evaluation is recorded in
# `trials2`.
trials2 = Trials()

best_hyperparams2 = fmin(
    fn=objective2,
    space=space2,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials2,
)

print("The best hyperparameters are : ","\n")
print(best_hyperparams2)

# Losses are negated macro recall, so the smallest loss belongs to the best
# trial and negating it again yields the best score.
best_acc = min(trial['result']['loss'] for trial in trials2.trials)
accuracy_list.append(-best_acc)

423                                                    
67                                                     
SCORE:                                                 
0.8563599876505095                                     
882                                                                               
129                                                                               
SCORE:                                                                            
0.760381290521766                                                                 
412                                                                               
136                                                                               
SCORE:                                                                            
0.8032957702994752                                                                
446                                                                               
125                          

0.7009879592466811                                                                 
771                                                                                
11                                                                                 
SCORE:                                                                             
0.8473294226613153                                                                 
613                                                                                
38                                                                                 
SCORE:                                                                             
0.8335520222290831                                                                 
724                                                                                
22                                                                                 
SCORE:                                                                      

339                                                                                
75                                                                                 
SCORE:                                                                             
0.8733019450447669                                                                 
343                                                                                
153                                                                                
SCORE:                                                                             
0.7163476381599259                                                                 
704                                                                                
62                                                                                 
SCORE:                                                                             
0.7041525162087063                                                          

93                                                                                 
SCORE:                                                                             
0.7009879592466811                                                                 
706                                                                                
34                                                                                 
SCORE:                                                                             
0.8563599876505095                                                                 
783                                                                                
48                                                                                 
SCORE:                                                                             
0.7879360913862303                                                                 
672                                                                         

SCORE:                                                                             
0.7863538129052177                                                                 
453                                                                                
90                                                                                 
SCORE:                                                                             
0.8489117011423278                                                                 
476                                                                                
70                                                                                 
SCORE:                                                                             
0.730125038592158                                                                  
100%|██████████| 100/100 [00:09<00:00, 10.59trial/s, best loss: -0.8733019450447669]
The best hyperparameters are :  

{'gamma': 5.551956259340795, 'max_depth':

## K Nearest Neighbour

In [8]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import cross_val_score

# models_list.append("K Nearest Neighbour")

# def hyperopt_train_test(params):
#     clf = KNeighborsClassifier(**params)
    
#     #accuracy = accuracy_score(pred, test_y)
#     #accuracy2 = recall_score(test_y, pred2, average='macro')
#     return cross_val_score(clf, X, y).mean()

# space4knn = {
#     'n_neighbors': hp.choice('n_neighbors', range(1,100))
# }

# def f(params):
#     acc = hyperopt_train_test(params)
#     return {'loss': -acc, 'status': STATUS_OK}

# trials = Trials()

# best = fmin(f, space4knn, algo=tpe.suggest, max_evals=100, trials=trials)

# print('best: ', best)

# best_acc = min([t['result']['loss'] for t in trials.trials])
# accuracy_list.append(-best_acc)


100%|██████████| 100/100 [00:11<00:00,  8.68trial/s, best loss: -0.9287020426147988]
best:  {'n_neighbors': 1}


In [17]:
from sklearn.neighbors import KNeighborsClassifier

models_list.append("K Nearest Neighbour")

# Candidate neighbourhood sizes 1..99, sampled as a categorical choice.
spaceknn = {
    'n_neighbors': hp.choice('n_neighbors', range(1, 100)),
}

def objective(spaceknn):
    """Hyperopt objective for the k-nearest-neighbour classifier.

    Fits a KNeighborsClassifier with the sampled ``n_neighbors`` on
    ``train_x``/``train_y``, predicts ``test_x`` and scores the prediction
    against ``test_y`` with macro-averaged recall. The score is printed under
    the label "Accuracy" although it is a recall value.

    Parameters
    ----------
    spaceknn : dict
        Sampled values; only the key 'n_neighbors' is read (cast with ``int``).

    Returns
    -------
    dict
        ``{'loss': -macro_recall, 'status': STATUS_OK}``; the score is negated
        so that a smaller loss means a higher recall.
    """
    neighbour_count = int(spaceknn['n_neighbors'])
    knn = KNeighborsClassifier(n_neighbors=neighbour_count)
    knn.fit(train_x, train_y)

    macro_recall = recall_score(test_y, knn.predict(test_x), average='macro')

    print("Accuracy: ", macro_recall)
    print("\n")
    return {'loss': -macro_recall, 'status': STATUS_OK}


# Run the TPE search over the KNN space; every evaluation is recorded in
# `trials`.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = spaceknn,
                        algo = tpe.suggest,
                        max_evals = 200,
                        trials = trials)

# fmin reports an hp.choice parameter as the index into its option list, so
# the raw 'n_neighbors' entry is one less than the value the classifier used
# (the options start at 1). space_eval translates it back to the real value.
print("Best hyperparameters are: {}".format(space_eval(spaceknn, best_hyperparams)))

# Losses are negated macro recall, so the smallest loss is the best trial and
# negating it again yields the best score.
best_acc = min(t['result']['loss'] for t in trials.trials)
accuracy_list.append(-best_acc)

Accuracy:                                              
0.5090305649891942                                     
Accuracy:                                              
0.5090305649891942                                                                
Accuracy:                                                                         
0.5377045384377894                                                                
Accuracy:                                                                         
0.5090305649891942                                                                
Accuracy:                                                                         
0.5090305649891942                                                                
Accuracy:                                                                         
0.5090305649891942                                                                
Accuracy:                                                                         
0.

Accuracy:                                                                          
0.5090305649891942                                                                 
Accuracy:                                                                          
0.6246526705773388                                                                 
Accuracy:                                                                          
0.6246526705773388                                                                 
Accuracy:                                                                          
0.6246526705773388                                                                 
Accuracy:                                                                          
0.5090305649891942                                                                 
Accuracy:                                                                          
0.5090305649891942                                                          

0.5090305649891942                                                                 
Accuracy:                                                                          
0.5090305649891942                                                                 
Accuracy:                                                                          
0.5090305649891942                                                                 
Accuracy:                                                                          
0.5530642173510343                                                                  
Accuracy:                                                                           
0.5636770608212411                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                              

0.5090305649891942                                                                  
Accuracy:                                                                           
0.6750154368632294                                                                  
Accuracy:                                                                           
0.7831892559431923                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                        

0.5090305649891942                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                                           
0.519643408459401                                                                   
Accuracy:                                                                           
0.5090305649891942                                                                  
Accuracy:                                                                           
0.7831892559431923                                                                  
Accuracy:                                                        

## Decision Trees

In [9]:
from sklearn.tree import DecisionTreeClassifier

models_list.append("Decision Tree")

# Decision-tree search space. The two uniform draws are truncated to int inside
# `objective`. Note that the dict key 'min_sample_leaf' differs from its
# hyperopt label 'min_samples_leaf'; `objective` reads the dict key.
space4dt = {
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'max_features': hp.choice('max_features', range(1, 5)),
    'criterion': hp.choice('criterion', ["gini", "entropy"]),
    'min_sample_leaf': hp.uniform('min_samples_leaf', 1, 5),
    'min_samples_split': hp.uniform('min_samples_split', 2, 6),
}

def objective(space4dt):
    """Hyperopt objective for the decision tree.

    Fits a DecisionTreeClassifier with the sampled hyperparameters on
    ``train_x``/``train_y``, predicts ``test_x`` and scores the prediction
    against ``test_y`` with macro-averaged recall. The score is printed under
    the label "Accuracy" although it is a recall value.

    Parameters
    ----------
    space4dt : dict
        Sampled values under the keys 'max_depth', 'max_features',
        'min_samples_split', 'min_sample_leaf' and 'criterion'. 'max_depth',
        'min_samples_split' and 'min_sample_leaf' are truncated with ``int``.

    Returns
    -------
    dict
        ``{'loss': -macro_recall, 'status': STATUS_OK}``; the score is negated
        so that a smaller loss means a higher recall.
    """
    # The leftover debug marker that was printed on every trial has been removed.
    tree = DecisionTreeClassifier(
        max_depth=int(space4dt['max_depth']),
        max_features=space4dt['max_features'],
        min_samples_split=int(space4dt['min_samples_split']),
        min_samples_leaf=int(space4dt['min_sample_leaf']),
        criterion=space4dt['criterion'],
        random_state=2)

    tree.fit(train_x, train_y)
    macro_recall = recall_score(test_y, tree.predict(test_x), average='macro')

    print("Accuracy: ", macro_recall)
    print("\n")
    return {'loss': -macro_recall, 'status': STATUS_OK}


# Run the TPE search over the decision-tree space; every evaluation is
# recorded in `trials`. (A leftover debug print before the search was removed.)
trials = Trials()
best_hyperparams = fmin(fn = objective,
                        space = space4dt,
                        algo = tpe.suggest,
                        max_evals = 200,
                        trials = trials)

# fmin reports hp.choice parameters (max_depth, max_features, criterion) as
# indices into their option lists. space_eval maps them back to the values the
# tree actually received, so the printed summary is not off by one.
print("Best hyperparameters are: {}".format(space_eval(space4dt, best_hyperparams)))

# Losses are negated macro recall, so the smallest loss is the best trial and
# negating it again yields the best score.
best_acc = min(t['result']['loss'] for t in trials.trials)
accuracy_list.append(-best_acc)

dff
df                                                     
Accuracy:                                              
0.7694118555109601                                     
df                                                     
Accuracy:                                                                         
0.5912318616857055                                                                
df                                                                                
Accuracy:                                                                         
0.834671194813214                                                                 
df                                                                                
Accuracy:                                                                        
0.6172043840691571                                                               
df                                                                               
Accuracy:                   

df                                                                                 
Accuracy:                                                                          
0.6454152516208707                                                                 
df                                                                                 
Accuracy:                                                                          
0.8579422661315221                                                                 
df                                                                                 
Accuracy:                                                                          
0.863808274158691                                                                  
df                                                                                 
Accuracy:                                                                          
0.6765977153442421                                                          

Accuracy:                                                                          
0.7100185242358753                                                                 
df                                                                                 
Accuracy:                                                                          
0.8728388391478852                                                                 
df                                                                                 
Accuracy:                                                                          
0.8606437171966657                                                                 
df                                                                                 
Accuracy:                                                                          
0.8606437171966657                                                                 
df                                                                          

0.8850339610991047                                                                 
df                                                                                 
Accuracy:                                                                          
0.8351343007100958                                                                 
df                                                                                 
Accuracy:                                                                          
0.711600802716888                                                                  
df                                                                                 
Accuracy:                                                                           
0.8850339610991047                                                                  
df                                                                                  
Accuracy:                                                                

0.8606437171966657                                                                  
df                                                                                  
Accuracy:                                                                           
0.730125038592158                                                                   
df                                                                                  
Accuracy:                                                                           
0.6765977153442421                                                                  
df                                                                                  
Accuracy:                                                                           
0.8579422661315221                                                                  
df                                                                                  
Accuracy:                                                        

0.8351343007100958                                                                  
df                                                                                  
Accuracy:                                                                           
0.8850339610991047                                                                  
df                                                                                  
Accuracy:                                                                           
0.7359910466193269                                                                  
df                                                                                  
Accuracy:                                                                           
0.8606437171966657                                                                  
df                                                                                  
Accuracy:                                                        

0.8606437171966657                                                                  
df                                                                                  
Accuracy:                                                                           
0.6887928372954616                                                                  
df                                                                                  
Accuracy:                                                                           
0.5                                                                                 
df                                                                                  
Accuracy:                                                                           
0.5                                                                                 
df                                                                                  
Accuracy:                                                        

## SVM

In [10]:
# from sklearn.svm import SVC

# models_list.append("SVM")

# space4svm = {'C': hp.uniform('C', 0, 20),
#     'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
#     'gamma': hp.uniform('gamma', 0, 20),
#     }

# def objective(space4svm):
    
#     clf = SVC(
#         C=space4svm['C'], kernel=space4svm['kernel'],
#         gamma=int(space4svm['gamma']),
#         random_state=2)

#     clf.fit(train_x, train_y)
#     pred = clf.predict(test_x)

#     accuracy = recall_score(test_y, pred, average='macro')
    
#     print("Accuracy: ", accuracy)
#     print("\n")
#     return {'loss': -accuracy, 'status': STATUS_OK}


# trials = Trials()

# best_hyperparams = fmin(fn = objective,
#                         space = space4svm,
#                         algo = tpe.suggest,
#                         max_evals = 50,
#                         trials = trials)

# print("Best hyperparameters are: {}".format(best_hyperparams))
# best_acc = min([t['result']['loss'] for t in trials.trials])
# accuracy_list.append(-best_acc)

## Logistic Regression

In [11]:
# from sklearn.linear_model import LogisticRegression

# models_list.append("Logistic Regression")

# space4lr = {
#     'penalty': hp.choice('penalty', ['none', 'l1', 'l2', 'elasticnet']),
#     'solver': hp.choice('solver', ['liblinear', 'none']),
#     'C': hp.uniform('C', 0, 20)
    
# }

# def objective(space4tlr):

#     clf = LogisticRegression(
#         penalty=space4lr['penalty'],
#         solver=space4lr['solver'], 
#         C=space4lr['C'],
#         random_state=2)

#     clf.fit(train_x, train_y)
#     pred = clf.predict(test_x)

#     accuracy = recall_score(test_y, pred, average='macro')
    
#     print("Accuracy: ", accuracy)
#     print("\n")
#     return {'loss': -accuracy, 'status': STATUS_OK}


# trials = Trials()
# best_hyperparams = fmin(fn = objective,
#                         space = space4lr,
#                         algo = tpe.suggest,
#                         max_evals = 200,
#                         trials = trials)

# print("Best hyperparameters are: {}".format(best_hyperparams))

# best_acc = min([t['result']['loss'] for t in trials.trials])
# accuracy_list.append(-best_acc)

In [12]:
# Show the collected model names, then the best score recorded for each.
for collected in (models_list, accuracy_list):
    print(collected)

['Random Forest', 'XGBClassifier', 'K Nearest Neighbour', 'Decision Tree']
[-0.8154908922506947, -0.8733019450447669, -0.9287020426147988, -0.8850339610991047]


In [13]:
# Data frame with accuracies of models

# NOTE(review): absolute, machine-specific output path.
file_name = r'C:\Users\isarachchand\Documents\git\apf\output\cyber_risk\model_accuracies.csv'

# One row per tuned model. The 'Accuracy' column holds the values appended to
# `accuracy_list`, which the model cells compute as macro-averaged recall.
# Both lists must have the same length for the frame to be built.
model_scores = pd.DataFrame({'Model Name': models_list, 'Accuracy': accuracy_list})

model_scores.to_csv(file_name, encoding='utf-8', index=False)