In [9]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

import os

import warnings
warnings.filterwarnings("ignore")


# Location of the malign-websites CSV. The original author's path is kept as the
# default so existing runs are unaffected; set the MALIGN_WEBSITES_CSV environment
# variable to point at the file on any other machine.
DATA_PATH = os.environ.get(
    'MALIGN_WEBSITES_CSV',
    r'C:\Users\isarachchand\\Documents\git\apf\datasets\cyber_risk\data\malign_websites_dataset.csv',
)
df=pd.read_csv(DATA_PATH)

In [10]:
# Share of each target class as a percentage of all rows (variable imbalance).
df['Type'].value_counts() * 100 / len(df)

#unique categories for each categorical column
object_columns = df.select_dtypes(include='object').columns
for col_name in object_columns:
    print(f"{col_name} -> {df[col_name].nunique()}")

# Frequency of each CHARSET value before any cleaning.
df['CHARSET'].value_counts()

# Top 5 categories kept

def CHARSET_CLEANER(x):
    """Collapse a CHARSET value onto the retained categories.

    Returns ``x`` unchanged when it is one of the five retained spellings
    (the comparison is case-sensitive, so 'UTF-8' and 'utf-8' stay distinct);
    any other value is replaced by "OTHERS".
    """
    kept = ('UTF-8', 'ISO-8859-1', 'utf-8', 'us-ascii', 'iso-8859-1')
    return x if x in kept else "OTHERS"

# Fold every charset outside the retained list into "OTHERS", then inspect.
charset_cleaned = df['CHARSET'].map(CHARSET_CLEANER)
df['CHARSET'] = charset_cleaned
df['CHARSET'].value_counts()

# Frequency of each SERVER value before cleaning.
df['SERVER'].value_counts()

# Top 5 categories kept

def SERVER_CLEANER(x):
    """Collapse a SERVER value onto the retained categories.

    Returns ``x`` unchanged when it exactly matches one of the five retained
    server strings (including the literal text 'None'); any other value is
    replaced by "OTHERS".
    """
    kept = ('Apache', 'nginx', 'None', 'Microsoft-HTTPAPI/2.0', 'cloudflare-nginx')
    return x if x in kept else "OTHERS"
    
# Fold every server string outside the retained list into "OTHERS", then inspect.
server_cleaned = df['SERVER'].map(SERVER_CLEANER)
df['SERVER'] = server_cleaned
df['SERVER'].value_counts()

# The eleven most frequent WHOIS_STATEPRO values.
df['WHOIS_STATEPRO'].value_counts().head(11)

def STATE_CLEANER(x):
    """Collapse a WHOIS_STATEPRO value onto the retained categories.

    Returns ``x`` unchanged when it exactly matches one of the six retained
    values (including the literal text 'None'); any other value is replaced
    by "OTHERS".
    """
    kept = ('CA', 'None', 'NY', 'WA', 'Barcelona', 'FL')
    return x if x in kept else "OTHERS"

# Fold every state/province outside the retained list into "OTHERS", then inspect.
state_cleaned = df['WHOIS_STATEPRO'].map(STATE_CLEANER)
df['WHOIS_STATEPRO'] = state_cleaned
df['WHOIS_STATEPRO'].value_counts()

def DATE_CLEANER(x):
    """Flag whether a WHOIS date field carries a value.

    Parameters
    ----------
    x : scalar
        A raw cell value from a WHOIS date column.

    Returns
    -------
    str
        "Absent" when ``x`` is the literal text 'None' or a missing value
        (None / NaN / NaT / pd.NA); "Present" for anything else.
    """
    # Missing cells reach this function as NaN rather than as the text 'None'
    # (depending on the pandas version, read_csv may also parse 'None' itself
    # as NaN). Without the isna check those rows would be labelled "Present".
    # isna is evaluated first so pd.NA never reaches the == comparison.
    if pd.isna(x) or x == 'None':
        return "Absent"
    return "Present"
# Reduce both WHOIS date columns to a Present/Absent flag.
for date_column in ('WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'):
    df[date_column] = df[date_column].apply(DATE_CLEANER)

# Remove the columns that are not used as features.
df.drop(columns=['URL','WHOIS_COUNTRY','CONTENT_LENGTH'], inplace=True)

# change null values to 0
df = df.fillna(0)

# Integer-encode the categorical columns. The single encoder is re-fitted for
# each column, so afterwards `le` only holds the mapping of the last column.
le = LabelEncoder()
categorical_columns = ['CHARSET','SERVER', 'WHOIS_STATEPRO', 'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']
for column in categorical_columns:
    as_text = df[column].astype(str)
    df[column] = le.fit_transform(as_text)



URL -> 1781
CHARSET -> 9
SERVER -> 239
WHOIS_COUNTRY -> 49
WHOIS_STATEPRO -> 182
WHOIS_REGDATE -> 891
WHOIS_UPDATED_DATE -> 594


In [11]:
# Separate the target column from the feature columns.
X = df.drop(columns='Type')
y = df['Type']

partition_count = 5   # number of contiguous row partitions
best_index = 1        # zero-based index of the partition held out for testing

# get train test split
# Rows are taken in their existing order (no shuffling): one contiguous
# partition becomes the test set and everything around it the training set.
partition_size = math.ceil(len(X) / partition_count)
test_start = partition_size * best_index
test_end = test_start + partition_size

held_out = slice(test_start, test_end)
test_x, test_y = X.iloc[held_out], y.iloc[held_out]
train_x = pd.concat([X.iloc[:test_start], X.iloc[test_end:]])
train_y = pd.concat([y.iloc[:test_start], y.iloc[test_end:]])

In [14]:
# Hyperopt search space handed to fmin(). objective() casts the integer-valued
# entries with int() before using them.
# Each key is defined exactly once: a repeated key in a dict literal is silently
# overwritten by the later entry, so only the 100-200 range for 'n_estimators'
# was ever in effect.
space = {
    'max_depth': hp.quniform("max_depth", 10, 180, 1), # 120, 180
    'gamma': hp.uniform('gamma', 1, 25),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
    'seed': 0,
}

# Disabled earlier variant of the classifier construction, kept for reference.
# It also passed reg_alpha, min_child_weight and colsample_bytree; note that
# int(space['colsample_bytree']) truncates every value below 1 to 0.
#
#     clf = xgb.XGBClassifier(
#         n_estimators=int(space['n_estimators']), max_depth=int(space['max_depth']), gamma=space['gamma'],
#         reg_alpha=int(space['reg_alpha']), min_child_weight=int(space['min_child_weight']),
#         colsample_bytree=int(space['colsample_bytree']))
def objective(space):
    """Hyperopt objective: fit an XGBoost classifier and score it on the test split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Only 'n_estimators', 'gamma'
        and 'max_depth' are read here; any other keys are ignored.
        NOTE(review): the module-level search space also samples reg_alpha,
        reg_lambda, colsample_bytree, min_child_weight and seed, none of which
        reach the model - confirm whether that is intended.

    Returns
    -------
    dict
        {'loss': -macro_recall, 'status': STATUS_OK}. The macro-averaged
        recall is negated so that a higher recall gives a lower loss.
    """
    n_estimators = int(space['n_estimators'])
    max_depth = int(space['max_depth'])

    clf = xgb.XGBClassifier(
        n_estimators=n_estimators, gamma=space['gamma'], max_depth=max_depth)

    print("Number of Estimators: ", n_estimators)
    print("Maximum depth: ", max_depth)

    # The test split is part of eval_set and is also what the score below is
    # computed on, so the reported score is not an independent estimate.
    evaluation = [(train_x, train_y), (test_x, test_y)]

    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the estimator constructor rather than on fit() -
    # confirm against the installed version before upgrading.
    clf.fit(train_x, train_y,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(test_x)
    # The metric is macro-averaged recall, not accuracy; label it as such.
    macro_recall = recall_score(test_y, pred, average='macro')

    print("\nMacro recall: ", macro_recall)
    return {'loss': -macro_recall, 'status': STATUS_OK}


# Collects the result of every evaluated point.
trials = Trials()

# Search settings: TPE suggestions, 100 evaluations of objective() over `space`.
search_kwargs = dict(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)
best_hyperparams = fmin(**search_kwargs)

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

Number of Estimators:                                  
191                                                    
Maximum depth:                                         
105                                                    
                                                       
SCORE:
0.7163476381599259                                     
Number of Estimators:                                                             
108                                                                               
Maximum depth:                                                                    
25                                                                                
                                                                                  
SCORE:
0.7041525162087063                                                                
Number of Estimators:                                                             
132                                                                  

Maximum depth:                                                                     
88                                                                                 
                                                                                   
SCORE:
0.7009879592466811                                                                 
Number of Estimators:                                                              
125                                                                                
Maximum depth:                                                                     
174                                                                                
                                                                                   
SCORE:
0.8563599876505095                                                                 
Number of Estimators:                                                              
179                                                           

Maximum depth:                                                                     
70                                                                                 
                                                                                   
SCORE:
0.8017134918184625                                                                 
Number of Estimators:                                                              
142                                                                                
Maximum depth:                                                                     
94                                                                                 
                                                                                   
SCORE:
0.8473294226613153                                                                 
Number of Estimators:                                                              
158                                                           

Maximum depth:                                                                     
22                                                                                 
                                                                                   
SCORE:
0.86268910157456                                                                   
Number of Estimators:                                                              
115                                                                              
Maximum depth:                                                                   
12                                                                               
                                                                                 
SCORE:
0.8382988576721211                                                               
Number of Estimators:                                                            
110                                                                       

Maximum depth:                                                                     
34                                                                                 
                                                                                   
SCORE:
0.8017134918184625                                                                 
Number of Estimators:                                                              
131                                                                                
Maximum depth:                                                                     
10                                                                                 
                                                                                   
SCORE:
0.760381290521766                                                                  
Number of Estimators:                                                              
135                                                           

Maximum depth:                                                                     
52                                                                                 
                                                                                   
SCORE:
0.8579422661315221                                                                 
Number of Estimators:                                                              
168                                                                                
Maximum depth:                                                                     
133                                                                                
                                                                                   
SCORE:
0.8611068230935474                                                                 
Number of Estimators:                                                              
164                                                           

Maximum depth:                                                                     
54                                                                                 
                                                                                   
SCORE:
0.7635458474837913                                                                 
Number of Estimators:                                                              
154                                                                                
Maximum depth:                                                                     
81                                                                                 
                                                                                   
SCORE:
0.7041525162087063                                                                 
Number of Estimators:                                                              
166                                                           