In [1]:
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
# Import pandas and numpy
import pandas as pd
import numpy as np
# Import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.linear_model import LogisticRegression
import wandb
import time
# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=7)

In [3]:
# Load the pre-split feature matrices and label vectors from the working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load .npy files that come from a trusted source.
X_train = np.load("X_train.npy", allow_pickle=True)
y_train = np.load("y_train.npy", allow_pickle=True)
X_test = np.load("X_test.npy", allow_pickle=True)
y_test = np.load("y_test.npy", allow_pickle=True)

In [4]:
def evaluation_pipeline(model, train_data, test_data, name):
    """Fit ``model``, score it on the test split, and report the results.

    A new Weights & Biases run called ``name`` is started in the
    "PhishingWebsites" project. Accuracy (in percent), macro-averaged
    precision and recall, and the elapsed fit time in seconds are logged to
    that run. The accuracy and a per-class classification report are also
    printed, labelled with ``name``. The W&B run is not finished here.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train_data : tuple
        ``(X_train, y_train)`` pair used for fitting.
    test_data : tuple
        ``(X_test, y_test)`` pair used for scoring.
    name : str
        Name of the W&B run; also used to label the printed output.

    Returns
    -------
    None
    """
    wandb.init(project="PhishingWebsites", name=name)

    X_train, y_train = train_data
    X_test, y_test = test_data

    # Time only the fit; perf_counter is monotonic, so the measured duration
    # cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start

    prediction = model.predict(X_test)

    # Compute every metric exactly once and reuse it for logging and printing.
    accuracy = accuracy_score(y_test, prediction) * 100.0
    precision, recall, _, _ = precision_recall_fscore_support(
        y_test, prediction, average='macro'
    )

    wandb.log({"accuracy": accuracy,
               "precision": precision,
               "recall": recall,
               "training_time": training_time})

    # Label the output with the run name: this function is also used for tuned
    # models, so a fixed "default hyperparameter values" text would be wrong.
    print("Accuracy score of the {0} classifier {1:.2f}%".format(name, accuracy))
    print("\n")
    print("----Classification report of the {0} classifier----".format(name))
    print("\n")
    # NOTE(review): target_names are applied in sorted-label order, so this assumes
    # the smaller label value encodes phishing sites -- confirm against the dataset.
    print(classification_report(y_test, prediction, target_names=["Phishing Websites", "Normal Websites"]))

In [5]:
# Baseline: logistic regression with scikit-learn's default hyperparameters,
# fitted on the training split and scored on the test split.
logistic_regresion = LogisticRegression()
evaluation_pipeline(
    model=logistic_regresion,
    train_data=(X_train, y_train),
    test_data=(X_test, y_test),
    name="logistic_regression",
)

Accuracy score of the Logistic Regression classifier with default hyperparameter values 92.46%


----Classification report of the Logistic Regression classifier with default hyperparameter value----


                   precision    recall  f1-score   support

Phishing Websites       0.93      0.90      0.91      3924
  Normal Websites       0.92      0.94      0.93      4920

         accuracy                           0.92      8844
        macro avg       0.92      0.92      0.92      8844
     weighted avg       0.92      0.92      0.92      8844



In [6]:
# Candidate values for each LogisticRegression hyperparameter to be searched.
penalty = ["l1", "l2"]            # regularization type
C = [0.8, 0.9, 1.0]               # inverse regularization strength
tol = [0.01, 0.001, 0.0001]       # stopping tolerance
max_iter = [100, 150, 200, 250]   # cap on solver iterations

# Search space keyed by the estimator's constructor argument names.
param_dict = {
    "penalty": penalty,
    "C": C,
    "tol": tol,
    "max_iter": max_iter,
}

In [7]:
# Randomized search over `param_dict` with 5-fold cross-validation.
# The estimator is created here with solver="liblinear" because the search space
# contains the "l1" penalty, which the "lbfgs" solver (scikit-learn's default since
# 0.22) rejects; with the default solver those candidates would fail to fit.
# random_state pins which candidates are sampled, so the result does not depend on
# how much of the global NumPy random stream earlier cells have consumed.
random_model = RandomizedSearchCV(
    estimator=LogisticRegression(solver="liblinear"),
    param_distributions=param_dict,
    cv=5,
    random_state=7,
)

random_model_result = random_model.fit(X_train, y_train)

# best_score_ is a fraction; it is scaled to percent for display only.
best_score, best_params = random_model_result.best_score_, random_model_result.best_params_
print("Best score: %.2f using %s" % (best_score*100., best_params))

Best score: 92.44 using {'tol': 0.0001, 'penalty': 'l1', 'max_iter': 250, 'C': 0.8}


In [8]:
# Record the hyperparameters selected by the randomized search in the W&B config
# rather than hand-copied values, so the logged configuration cannot drift from
# what the search actually found.
# NOTE(review): wandb.config is attached to whichever run is active when this cell
# executes -- presumably the run started by the preceding evaluation_pipeline call;
# confirm that is the run these values should be recorded on.
config = wandb.config

config.tol = best_params["tol"]
config.penalty = best_params["penalty"]
config.C = best_params["C"]

In [9]:
# Refit with the hyperparameters stored in the W&B config and evaluate on the test split.
# solver="liblinear" is set explicitly: it supports both the "l1" and "l2" penalties,
# whereas the default "lbfgs" solver of scikit-learn >= 0.22 raises on penalty="l1".
logistic_regresion = LogisticRegression(
    tol=config.tol,
    penalty=config.penalty,
    max_iter=250,
    C=config.C,
    solver="liblinear",
)
evaluation_pipeline(logistic_regresion, (X_train, y_train), (X_test, y_test), "Logistic_Regression_Random_Search")

Accuracy score of the Logistic Regression classifier with default hyperparameter values 92.48%


----Classification report of the Logistic Regression classifier with default hyperparameter value----


                   precision    recall  f1-score   support

Phishing Websites       0.93      0.90      0.91      3924
  Normal Websites       0.92      0.94      0.93      4920

         accuracy                           0.92      8844
        macro avg       0.93      0.92      0.92      8844
     weighted avg       0.92      0.92      0.92      8844

