In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from hyperopt import space_eval

#algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import os

import warnings
warnings.filterwarnings("ignore")


# Absolute location of the malign_websites_dataset CSV.
# NOTE(review): this path is specific to one Windows user profile; the
# notebook will not load data on any other machine without editing it.
dataset_path = r'C:\Users\isarachchand\\Documents\git\apf\datasets\cyber_risk\data\malign_websites_dataset.csv'

# Load the whole dataset into the working frame used by every later cell.
df = pd.read_csv(dataset_path)


In [None]:
# Prepare the data

# Percentage of rows falling in each class of the target column 'Type'
# (class-imbalance check). The value is computed but not stored.
df['Type'].value_counts() * 100 / len(df)

# Number of distinct values in every object-typed column.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    distinct_count = df[column_name].nunique()
    print(f"{column_name} -> {distinct_count}")

# Frequency of each charset label, used to pick the categories kept below.
df['CHARSET'].value_counts()

# Top 5 categories kept

def CHARSET_CLEANER(x):
    """Collapse infrequent charset labels into a single "OTHERS" bucket.

    Parameters
    ----------
    x : object
        A single value from the CHARSET column.

    Returns
    -------
    object
        ``x`` unchanged when it is one of the five retained labels,
        otherwise the string "OTHERS". Matching is case-sensitive, so
        'UTF-8' and 'utf-8' remain two distinct categories.
    """
    retained = ('UTF-8', 'ISO-8859-1', 'utf-8', 'us-ascii', 'iso-8859-1')
    return x if x in retained else "OTHERS"

# Bucket every charset outside the retained set into "OTHERS".
df['CHARSET'] = df['CHARSET'].map(CHARSET_CLEANER)

# Distribution after bucketing, then the raw server distribution that
# motivates the server categories kept below.
df['CHARSET'].value_counts()
df['SERVER'].value_counts()

# Top 5 categories kept

def SERVER_CLEANER(x):
    """Collapse infrequent server labels into a single "OTHERS" bucket.

    Parameters
    ----------
    x : object
        A single value from the SERVER column.

    Returns
    -------
    object
        ``x`` unchanged when it exactly matches one of the five retained
        labels (the literal string 'None' is one of them), otherwise the
        string "OTHERS".
    """
    retained = ('Apache', 'nginx', 'None', 'Microsoft-HTTPAPI/2.0', 'cloudflare-nginx')
    if x in retained:
        return x
    return "OTHERS"
    
# Bucket every server outside the retained set into "OTHERS".
df['SERVER'] = df['SERVER'].map(SERVER_CLEANER)

# Distribution after bucketing, then the eleven most frequent WHOIS states
# that motivate the state categories kept below.
df['SERVER'].value_counts()
df['WHOIS_STATEPRO'].value_counts().head(11)

def STATE_CLEANER(x):
    """Collapse infrequent WHOIS state/province labels into "OTHERS".

    Parameters
    ----------
    x : object
        A single value from the WHOIS_STATEPRO column.

    Returns
    -------
    object
        ``x`` unchanged when it exactly matches one of the six retained
        labels (the literal string 'None' is one of them), otherwise the
        string "OTHERS".
    """
    retained = ('CA', 'None', 'NY', 'WA', 'Barcelona', 'FL')
    return x if x in retained else "OTHERS"

# Bucket every state/province outside the retained set into "OTHERS",
# then inspect the resulting distribution.
df['WHOIS_STATEPRO'] = df['WHOIS_STATEPRO'].map(STATE_CLEANER)
df['WHOIS_STATEPRO'].value_counts()

def DATE_CLEANER(x):
    """Reduce a WHOIS date value to a presence flag.

    Parameters
    ----------
    x : object
        A single value from a WHOIS date column.

    Returns
    -------
    str
        "Absent" when ``x`` equals the literal string 'None'; "Present" for
        every other value (including real nulls such as ``None`` or NaN,
        which do not compare equal to 'None').
    """
    return "Absent" if x == 'None' else "Present"
# Reduce both WHOIS date columns to an "Absent"/"Present" flag.
for date_column in ('WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'):
    df[date_column] = df[date_column].map(DATE_CLEANER)

# Remove URL, WHOIS_COUNTRY and CONTENT_LENGTH from the frame.
df = df.drop(columns=['URL', 'WHOIS_COUNTRY', 'CONTENT_LENGTH'])

# Replace every remaining null value with 0.
df = df.fillna(0)

# Integer-encode the categorical columns. One encoder object is refit for
# each column, so after the loop `le` only holds the classes of the last one.
le = LabelEncoder()
categorical_columns = ['CHARSET', 'SERVER', 'WHOIS_STATEPRO',
                       'WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']
for column in categorical_columns:
    as_text = df[column].astype(str)
    df[column] = le.fit_transform(as_text)



In [None]:
# Parallel accumulators: model names and their accuracies, exported as a
# CSV at the end of the notebook.
models_list, accuracy_list = [], []


In [None]:
# Separate the target column from the feature columns.
y = df['Type']
X = df.drop(columns='Type')

# The rows are cut, in file order, into `partition_count` contiguous blocks;
# the block numbered `best_index` (zero-based) is held out as the test set
# and all remaining rows form the training set. No shuffling is applied.
best_index = 1
partition_count = 5

partition_size = math.ceil(len(X) / partition_count)
test_start = best_index * partition_size
test_end = test_start + partition_size
test_rows = slice(test_start, test_end)

# Positional row selection for the held-out block.
test_x, test_y = X.iloc[test_rows], y.iloc[test_rows]

# Everything before and after the held-out block, stitched back together.
train_x = pd.concat([X.iloc[:test_start], X.iloc[test_end:]])
train_y = pd.concat([y.iloc[:test_start], y.iloc[test_end:]])

In [None]:
# Search space: hyperopt first picks one candidate classifier, then samples
# that candidate's hyper-parameters. Parameter keys carry the 'model__'
# prefix because the objective sets them through a Pipeline whose only step
# is named 'model'.

# Grid of regularisation strengths shared by SVC and LogisticRegression.
c_grid = np.arange(0.005, 1.0, 0.01)

knn_candidate = {
    'model': KNeighborsClassifier(),
    'params': {
        'model__n_neighbors': hp.choice('knc.n_neighbors', range(2, 10)),
        'model__algorithm': hp.choice('knc.algorithm',
                                      ['auto', 'ball_tree', 'kd_tree']),
        'model__metric': hp.choice('knc.metric', ['chebyshev', 'minkowski']),
    },
}

# NOTE(review): 'degree' is sampled although 'poly' is not among the kernels
# offered here — presumably it has no effect; confirm against the SVC docs.
svc_candidate = {
    'model': SVC(),
    'params': {
        'model__C': hp.choice('C', c_grid),
        'model__kernel': hp.choice('kernel', ['linear', 'rbf', 'sigmoid']),
        'model__degree': hp.choice('degree', [2, 3, 4]),
        'model__gamma': hp.uniform('gamma', 0.001, 1000),
    },
}

# NOTE(review): whether the string 'none' is an accepted penalty depends on
# the installed scikit-learn version — confirm for the target environment.
logreg_candidate = {
    'model': LogisticRegression(verbose=0),
    'params': {
        'model__penalty': hp.choice('lr.penalty', ['none', 'l2']),
        'model__C': hp.choice('lr.C', c_grid),
    },
}

xgb_candidate = {
    'model': xgb.XGBClassifier(eval_metric='logloss', verbosity=0),
    'params': {
        'model__max_depth': hp.choice('xgb.max_depth',
                                      range(5, 30, 1)),
        'model__learning_rate': hp.quniform('xgb.learning_rate',
                                            0.01, 0.5, 0.01),
        'model__n_estimators': hp.choice('xgb.n_estimators',
                                         range(5, 50, 1)),
        'model__reg_lambda': hp.uniform('xgb.reg_lambda', 0, 1),
        'model__reg_alpha': hp.uniform('xgb.reg_alpha', 0, 1),
    },
}

# Disabled candidates, kept for reference exactly as originally written.
# NOTE(review): before re-enabling, observe that their keys lack the
# 'model__' prefix used by the active candidates, the key 'min_sample_leaf'
# is spelled differently from its label 'min_samples_leaf', and both blocks
# reuse the same hyperopt labels (e.g. 'max_depth').
#     {
#     'model': RandomForestClassifier(), # Default params
#     'params': {
#     'max_depth': hp.quniform("max_depth", 10, 180, 1),
#     'min_sample_leaf' : hp.uniform('min_samples_leaf',1,5),
#     'min_samples_split':hp.uniform('min_samples_split',2,6),
#     'n_estimators': hp.uniform('n_estimators', 200, 900),
#     'max_features':hp.choice('max_features',['sqrt', 'log2'])
#     }
#     },
#     {
#     'model': DecisionTreeClassifier(),
#     'params': {
#         'max_depth': hp.choice('max_depth', range(1,20)),
#         'max_features': hp.choice('max_features', range(1,5)),
#         'criterion': hp.choice('criterion', ["gini", "entropy"]),
#         'min_sample_leaf' : hp.uniform('min_samples_leaf',1,5),
#         'min_samples_split':hp.uniform('min_samples_split',2,6),
#         }
#     }

space = hp.choice('classifiers', [
    knn_candidate,
    svc_candidate,
    logreg_candidate,
    xgb_candidate,
])

In [None]:
# Defining Objective function whose loss we have to minimize
def objective(args):
    """Hyperopt objective: cross-validate one sampled model configuration.

    Parameters
    ----------
    args : dict
        One sample drawn from the search space. ``args['model']`` is the
        candidate estimator and ``args['params']`` maps ``'model__<name>'``
        keys to the sampled hyper-parameter values.

    Returns
    -------
    dict
        ``{'loss': 1 - median fold score, 'status': STATUS_OK}``; hyperopt
        minimises ``loss``.
    """
    # Imported locally so this cell works without touching the imports cell.
    from sklearn.base import clone

    # Work on a fresh copy: the estimator instance inside the search space is
    # shared by every trial, and Pipeline.set_params would otherwise write
    # this trial's values onto that shared object.
    estimator = clone(args["model"])

    # Single-step pipeline so the 'model__'-prefixed keys resolve.
    pipe = Pipeline(steps=[
        ('model', estimator)
    ])
    pipe.set_params(**args['params'])

    # 5-fold cross-validation on the training partition. A fold whose fit
    # raises is scored 0.0 (the worst possible score) so that an invalid
    # configuration can never look better than one that actually trains.
    score = cross_val_score(pipe, train_x, train_y, cv=5, n_jobs=-1, error_score=0.0)
    print(f"Model Name: {estimator}: ", score)

    # fmin minimises, so convert the median fold score into a loss.
    return {'loss': 1 - np.median(score), 'status': STATUS_OK}



In [None]:
# A Trials object keeps the record of every evaluation fmin performs.
trials = Trials()

# fmin repeatedly samples a configuration from `space`, evaluates it with
# `objective`, and uses TPE to steer later samples. It returns the sampled
# indices/values of the lowest-loss trial, keyed by hyperopt label.
best_classifier = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

# Translate that label-keyed result back into the concrete
# {'model': ..., 'params': ...} entry of the search space.
best_params = space_eval(space, best_classifier)

In [None]:
from sklearn.base import clone
from sklearn.metrics import classification_report

# best_params is {'model': estimator, 'params': {'model__<name>': value}}.
# The tuned values must be applied explicitly: fitting best_params['model']
# as-is would train whatever parameters that shared instance happens to
# hold, not the ones hyperopt selected. The 'model__' prefix only exists for
# the Pipeline used during the search, so strip it for the bare estimator.
param_prefix = 'model__'
tuned_params = {
    (name[len(param_prefix):] if name.startswith(param_prefix) else name): value
    for name, value in best_params['params'].items()
}

# Training the best model (on a fresh copy configured with the tuned values)
model = clone(best_params['model']).set_params(**tuned_params)
model.fit(train_x, train_y)

# Predicting with the best model
y_pred_train = model.predict(train_x)
y_pred_test = model.predict(test_x)

# Classification Report (training partition first, then held-out partition)
estimator_name = str(model).split('(')[0]
print('Training Classification Report for estimator: ',
      estimator_name)
print('\n', classification_report(train_y, y_pred_train))
print('\n', classification_report(test_y, y_pred_test))

# Record the result so the accuracy table exported at the end of the
# notebook is not empty. Note: re-running this cell appends another row.
models_list.append(estimator_name)
accuracy_list.append(accuracy_score(test_y, y_pred_test))

In [None]:
# print(models_list)
# print(accuracy_list)

In [None]:
# Tabulate the recorded model names against their accuracies.
score_columns = {
    'Model Name': models_list,
    'Accuracy': accuracy_list,
}
model_scores = pd.DataFrame(score_columns)

# Destination of the exported table.
# NOTE(review): absolute, user-specific Windows path — the export fails on
# any machine where this directory does not exist.
file_name = r'C:\Users\isarachchand\Documents\git\apf\output\cyber_risk\model_accuracies.csv'

# Write the table as UTF-8 CSV without the row index.
model_scores.to_csv(file_name, index=False, encoding='utf-8')