In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

In [2]:
# Dataset source : "https://www.kaggle.com/datasets/
#                   shashwatwork/web-page-phishing-detection-dataset?select=dataset_phishing.csv"

# Path to file
path_file = 'C:/Users/ASUS/Downloads/Documents/dataset_phishing.csv'

# Read File
df = pd.read_csv(path_file)

# Display the DataFrame
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [3]:
X = df.drop(columns = ['url', 'status']).copy()
y, _ = pd.factorize(df['status'])

In [4]:
# Split Data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Print the sizes of data 
print("x_train shape : ", x_train.shape, "y_train shape : ", y_train.shape)
print("x_test shape : ", x_test.shape, "y_test shape : ", y_test.shape)

x_train shape :  (9144, 87) y_train shape :  (9144,)
x_test shape :  (2286, 87) y_test shape :  (2286,)


In [7]:
def check_stats(data):
    
    print("is there null value in any column : ", data.isna().any().sum())
    print('**' * 20)
    
    # this condition is copied from this notebook : with a modification on the threshold (my choice)
    # https://www.kaggle.com/code/unstructuredrahul/deep-learning-pytorch-binary-classification
    likely_categorical = {}
    for var in data.iloc[:,1:].columns:
        likely_categorical[var] = 1. * data[var].nunique() / data[var].count() < 0.003
        
    print("number of categorical var : ", len(likely_categorical))
    print('**' * 20)
    print("number of continuous var : ", len(data.columns) - len(likely_categorical))
    print('**' * 20)
    print("all categorical variables : ", likely_categorical)


check_stats(df)

is there null value in any column :  0
****************************************
number of categorical var :  88
****************************************
number of continuous var :  1
****************************************
all categorical variables :  {'length_url': False, 'length_hostname': False, 'ip': True, 'nb_dots': True, 'nb_hyphens': True, 'nb_at': True, 'nb_qm': True, 'nb_and': True, 'nb_or': True, 'nb_eq': True, 'nb_underscore': True, 'nb_tilde': True, 'nb_percent': True, 'nb_slash': True, 'nb_star': True, 'nb_colon': True, 'nb_comma': True, 'nb_semicolumn': True, 'nb_dollar': True, 'nb_space': True, 'nb_www': True, 'nb_com': True, 'nb_dslash': True, 'http_in_path': True, 'https_token': True, 'ratio_digits_url': False, 'ratio_digits_host': False, 'punycode': True, 'port': True, 'tld_in_path': True, 'tld_in_subdomain': True, 'abnormal_subdomain': True, 'nb_subdomains': True, 'prefix_suffix': True, 'random_domain': True, 'shortening_service': True, 'path_extension': True, 'nb_r

Note that all variables considered categorical are of type int (confusing), 
the only variables with type categorical or object are status (target var) and url.

In [8]:
# Compute the AUC score
from sklearn.metrics import roc_curve, auc

def auc_score(y_true, y_hat):
    fpr, tpr, _ = roc_curve(y_true, y_hat)
    score = auc(fpr, tpr)
    return score

In [11]:
# Defining the hyperparameters to optimize
grid_parameters = [
    { # XGBoost
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15,20,25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3],
        'subsample': [0.7, 0.8, 0.9]
    },
    
    { # Random Forest
        'max_depth':[3, 5, 10, 13],
        'n_estimators':[100, 200, 400, 600, 900],
        'max_features':[2, 4, 6, 8, 10]
    },
    {# KNN
        'n_neighbors' : [5,10,20,30]
    }
]

In [16]:
from sklearn.model_selection import GridSearchCV

# Search for best params, get the auc score
def grid_search_customize(x_train, x_test, y_train, y_test, model,
                   params, cv = 5, scoring_test = auc_score, scoring_fit = 'balanced_accuracy'):
    
    
    grid_search = GridSearchCV(
        estimator = model,
        param_grid = params,
        cv = cv,
        n_jobs = -1,
        scoring = scoring_fit,
        verbose = 2
    )
    
    b_params = grid_search.fit(x_train, y_train).best_params_
    
    preds = grid_search.best_estimator_.predict(x_test)
    score = scoring_test(y_test, preds)
    
    return b_params, score

In [17]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 

# Define estimators
models = [('xgbc', XGBClassifier()), ('rfc', RandomForestClassifier()), ('knnc', KNeighborsClassifier())]

In [19]:
def get_best_params(models, x_train, x_test, y_train, y_test,
                   params):
    
    models_best_params = {}
    # Loop over models
    for i, model in enumerate(models):
        models_best_params[model[0]], score = grid_search_customize(x_train, x_test,
                                                         y_train, y_test, model[1], params[i])

        print('the score of : ', model[0], '==>', score)
        print('**' * 20)
    
    return models_best_params



params_dict = get_best_params(models, x_train, x_test, y_train, y_test,
                              grid_parameters)

print(params_dict)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
the score of :  xgbc ==> 0.9702171018937372
****************************************
Fitting 5 folds for each of 100 candidates, totalling 500 fits
the score of :  rfc ==> 0.9653562518133929
****************************************
Fitting 5 folds for each of 4 candidates, totalling 20 fits
the score of :  knnc ==> 0.8208777319554482
****************************************
{'xgbc': {'colsample_bytree': 0.7, 'max_depth': 15, 'n_estimators': 700, 'reg_alpha': 1.1, 'reg_lambda': 1.3, 'subsample': 0.7}, 'rfc': {'max_depth': 13, 'max_features': 6, 'n_estimators': 900}, 'knnc': {'n_neighbors': 5}}


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [20]:
params_dict

{'xgbc': {'colsample_bytree': 0.7,
  'max_depth': 15,
  'n_estimators': 700,
  'reg_alpha': 1.1,
  'reg_lambda': 1.3,
  'subsample': 0.7},
 'rfc': {'max_depth': 13, 'max_features': 6, 'n_estimators': 900},
 'knnc': {'n_neighbors': 5}}