In [1]:
# Fix error AttributeError: 'super' object has no attribute '__sklearn_tags__'
# https://stackoverflow.com/questions/79290968/super-object-has-no-attribute-sklearn-tags
!pip install scikit-learn==1.3.1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from google.colab import drive
from sklearn.preprocessing import StandardScaler
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
merged_df_html = pd.read_csv("/content/merged_df_html.csv")
merged_df_features = pd.read_csv("/content/merged_df_features.csv")

In [3]:
def preprocess_html(train_df):
  columns_to_remove = [
    'phishing', 'rec_id', 'ExtFavicon',
    ]

  # Split labels and features
  labels = train_df['phishing']
  features = train_df.drop(columns_to_remove, axis=1)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

  columns_to_scale = [
    'total_forms', 'total_hyperlinks'
  ]

  # Standardize the features
  scaler = StandardScaler()
  X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
  X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

  selector = SelectKBest(mutual_info_classif, k=25)
  X_new = selector.fit_transform(X_train, y_train)
  selected_feature_indices = selector.get_support(indices=True)
  selected_feature_names = X_train.columns[selected_feature_indices]

  X_train = X_train[selected_feature_names]
  X_test = X_test[selected_feature_names]
  return X_train, y_train, X_test, y_test



In [4]:
html_train_X, html_train_y, html_test_X, html_test_y = preprocess_html(merged_df_html)


In [5]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "k-NN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "k-NN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42)
}

In [6]:
def evaluate_models(models, X_train, y_train, X_test, y_test):
  for model_name, model in models.items():
      # Train the model
      model.fit(X_train, y_train)

      # Predict on the training set
      y_train_pred = model.predict(X_train)
      # Calculate training accuracy
      train_accuracy = accuracy_score(y_train, y_train_pred)

      # Predict on the test set
      y_test_pred = model.predict(X_test)
      # Calculate testing accuracy
      test_accuracy = accuracy_score(y_test, y_test_pred)
      print(f"ID of {model_name}: {id(model)}")  # Print the ID

      # Print the accuracies for each model
      print(f"{model_name} - Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

In [7]:
def evaluate_models_2(models, X_train, y_train, X_test, y_test):
    print(f"{'Model':<20} {'Set':<10} {'Pre (%)':<10} {'Recall (%)':<10} {'F1-Score (%)':<15} {'AUC (%)':<10} {'ACC (%)':<10}")
    print("-" * 80)

    for model_name, model in models.items():

        # Evaluate on test set
        y_test_pred = model.predict(X_test)
        y_test_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        test_precision = precision_score(y_test, y_test_pred) * 100
        test_recall = recall_score(y_test, y_test_pred) * 100
        test_f1 = f1_score(y_test, y_test_pred) * 100
        test_auc = roc_auc_score(y_test, y_test_prob) * 100 if y_test_prob is not None else "-"
        test_accuracy = accuracy_score(y_test, y_test_pred) * 100

        # Print test set metrics
        print(f"{model_name:<20} {'Test':<10} {test_precision:<10.2f} {test_recall:<10.2f} {test_f1:<15.2f} {test_auc:<10} {test_accuracy:<10.2f}")

In [8]:
evaluate_models(models, html_train_X, html_train_y, html_test_X, html_test_y)

ID of Decision Tree: 133527859288944
Decision Tree - Train Accuracy: 0.9830, Test Accuracy: 0.9341
ID of k-NN (k=1): 133527859289856
k-NN (k=1) - Train Accuracy: 0.9643, Test Accuracy: 0.9224
ID of k-NN (k=3): 133527859282704
k-NN (k=3) - Train Accuracy: 0.9397, Test Accuracy: 0.9102
ID of Gaussian Naive Bayes: 133527859291536
Gaussian Naive Bayes - Train Accuracy: 0.8271, Test Accuracy: 0.8244
ID of Logistic Regression: 133527859286544
Logistic Regression - Train Accuracy: 0.8683, Test Accuracy: 0.8646
ID of MLP Neural Network: 133527859287600
MLP Neural Network - Train Accuracy: 0.9636, Test Accuracy: 0.9390
ID of Random Forest: 133527859287312
Random Forest - Train Accuracy: 0.9830, Test Accuracy: 0.9526
ID of XGBoost: 133527859286976
XGBoost - Train Accuracy: 0.9752, Test Accuracy: 0.9468


In [9]:
evaluate_models_2(models, html_train_X, html_train_y, html_test_X, html_test_y)

Model                Set        Pre (%)    Recall (%) F1-Score (%)    AUC (%)    ACC (%)   
--------------------------------------------------------------------------------
Decision Tree        Test       91.25      91.11      91.18           94.2712681187075 93.41     
k-NN (k=1)           Test       91.67      87.17      89.36           91.21750957523614 92.24     
k-NN (k=3)           Test       85.89      90.93      88.34           95.73015330804962 91.02     
Gaussian Naive Bayes Test       71.49      88.24      78.99           88.8179134806562 82.44     
Logistic Regression  Test       82.22      81.39      81.80           92.18412064443018 86.46     
MLP Neural Network   Test       92.46      91.13      91.79           97.77512094431452 93.90     
Random Forest        Test       94.48      92.73      93.60           98.76648852952817 95.26     
XGBoost              Test       93.60      92.08      92.83           98.47102278912459 94.68     


In [10]:
model_1 =  XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_1.fit(html_train_X, html_train_y)

# Predictions
y_pred = model_1.predict(html_test_X)

# Evaluation
accuracy = accuracy_score(html_test_y, y_pred)
precision = precision_score(html_test_y, y_pred)
recall = recall_score(html_test_y, y_pred)
f1 = f1_score(html_test_y, y_pred)
conf_matrix = confusion_matrix(html_test_y, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9468125
Precision: 0.9359714673913043
Recall: 0.9208020050125313
F1 Score: 0.928324770487661
Confusion Matrix:
[[9638  377]
 [ 474 5511]]


In [11]:
def preprocess_features(train_df):
  columns_to_remove = [
      'phishing', 'rec_id',
    'qty_/_domain', 'qty_?_domain', 'qty_=_domain', 'qty_@_domain',
    'qty_&_domain', 'qty_!_domain', 'qty_ _domain', 'qty_~_domain',
    'qty_,_domain', 'qty_+_domain', 'qty_*_domain', 'qty_#_domain',
    'qty_$_domain', 'qty_%_domain',
      'time_response'	,'domain_spf'	,'asn_ip'	,'time_domain_activation'	,
      'time_domain_expiration'	,'qty_ip_resolved'	,'qty_nameservers',
      'qty_mx_servers',	'ttl_hostname'	,'tls_ssl_certificate',
      'qty_redirects',	'url_google_index',	'domain_google_index',	'url_shortened'
]

   # Split labels and features
  labels = train_df['phishing']
  features = train_df.drop(columns_to_remove, axis=1)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

  scaler = StandardScaler()
  X_train  = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, y_train, X_test, y_test



In [12]:
features_train_X, features_train_y, features_test_X, features_test_y = preprocess_features(merged_df_features)


In [13]:
models2 = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "k-NN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "k-NN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42)
}

In [14]:
evaluate_models(models2, features_train_X, features_train_y, features_test_X, features_test_y)

ID of Decision Tree: 133527854693968
Decision Tree - Train Accuracy: 0.9836, Test Accuracy: 0.8818
ID of k-NN (k=1): 133527854951984
k-NN (k=1) - Train Accuracy: 0.9781, Test Accuracy: 0.8902
ID of k-NN (k=3): 133527854941568
k-NN (k=3) - Train Accuracy: 0.9365, Test Accuracy: 0.8947
ID of Gaussian Naive Bayes: 133527854951792
Gaussian Naive Bayes - Train Accuracy: 0.7076, Test Accuracy: 0.7078
ID of Logistic Regression: 133527854944112
Logistic Regression - Train Accuracy: 0.8456, Test Accuracy: 0.8462
ID of MLP Neural Network: 133527854944496
MLP Neural Network - Train Accuracy: 0.9474, Test Accuracy: 0.9098
ID of Random Forest: 133527854944544
Random Forest - Train Accuracy: 0.9836, Test Accuracy: 0.9145
ID of XGBoost: 133527854944448
XGBoost - Train Accuracy: 0.9323, Test Accuracy: 0.9150


In [15]:
evaluate_models_2(models2, features_train_X, features_train_y, features_test_X, features_test_y)

Model                Set        Pre (%)    Recall (%) F1-Score (%)    AUC (%)    ACC (%)   
--------------------------------------------------------------------------------
Decision Tree        Test       84.73      83.33      84.02           88.36984966835186 88.18     
k-NN (k=1)           Test       85.30      85.24      85.27           88.25303388525944 89.02     
k-NN (k=3)           Test       86.37      85.20      85.78           93.33439662505845 89.47     
Gaussian Naive Bayes Test       83.41      27.01      40.81           87.75447984896942 70.78     
Logistic Regression  Test       82.86      74.09      78.23           91.95659894641825 84.62     
MLP Neural Network   Test       87.57      88.34      87.96           96.4696590425941 90.98     
Random Forest        Test       89.11      87.80      88.45           96.74086044926298 91.45     
XGBoost              Test       89.31      87.69      88.49           97.2249493358518 91.50     


In [16]:
model_2 =  XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_2.fit(features_train_X, features_train_y)

# Predictions
y_pred = model_2.predict(features_test_X)

# Evaluation
accuracy = accuracy_score(features_test_y, y_pred)
precision = precision_score(features_test_y, y_pred)
recall = recall_score(features_test_y, y_pred)
f1 = f1_score(features_test_y, y_pred)
conf_matrix = confusion_matrix(features_test_y, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.91496875
Precision: 0.8931284677763551
Recall: 0.8768856855514583
F1 Score: 0.8849325495834567
Confusion Matrix:
[[18816  1252]
 [ 1469 10463]]


In [17]:
merged_df = pd.merge(merged_df_html, merged_df_features, on='rec_id', how='outer')

# Handle phishing label (assuming both phishing labels should match)
# If they don't match, prioritize one, or handle conflicts as needed
merged_df['phishing'] = merged_df['phishing_x'].combine_first(merged_df['phishing_y'])

# Drop duplicate phishing columns
merged_df.drop(columns=['phishing_x', 'phishing_y'], inplace=True)


In [18]:
def preprocess_merged_df(train_df):
  columns_to_remove = [
      'phishing', 'rec_id',
    'qty_/_domain', 'qty_?_domain', 'qty_=_domain', 'qty_@_domain',
    'qty_&_domain', 'qty_!_domain', 'qty_ _domain', 'qty_~_domain',
    'qty_,_domain', 'qty_+_domain', 'qty_*_domain', 'qty_#_domain',
    'qty_$_domain', 'qty_%_domain',
      'time_response'	,'domain_spf'	,'asn_ip'	,'time_domain_activation'	,
      'time_domain_expiration'	,'qty_ip_resolved'	,'qty_nameservers',
      'qty_mx_servers',	'ttl_hostname'	,'tls_ssl_certificate',
      'qty_redirects',	'url_google_index',	'domain_google_index',	'url_shortened', 'ExtFavicon'
]

   # Split labels and features
  labels = train_df['phishing']
  features = train_df.drop(columns_to_remove, axis=1)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
  feature_names = X_train.columns

  feature_columns_to_scale = merged_df_features.columns.difference(columns_to_remove)
  html_columns_to_scale = [
    'total_forms', 'total_hyperlinks'
  ]
  columns_to_scale = feature_columns_to_scale.tolist() + html_columns_to_scale

  # Standardize the features
  scaler = StandardScaler()
  X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
  X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

  imputer = SimpleImputer(strategy='most_frequent')
  X_train = imputer.fit_transform(X_train)
  X_test = imputer.transform(X_test)

  # Convert back to DataFrame with original column names
  X_train = pd.DataFrame(X_train, columns=feature_names)
  X_test = pd.DataFrame(X_test, columns=feature_names)

  selector = SelectKBest(mutual_info_classif, k=40)
  X_new = selector.fit_transform(X_train, y_train)
  selected_feature_indices = selector.get_support(indices=True)
  selected_feature_names = X_train.columns[selected_feature_indices]

  X_train = X_train[selected_feature_names]
  X_test = X_test[selected_feature_names]


  return X_train, y_train, X_test, y_test

In [19]:
merged_train_X, merged_train_y, merged_test_X, merged_test_y = preprocess_merged_df(merged_df)

In [20]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(merged_train_X, merged_train_y)

predictions = xgb_model.predict(merged_test_X)

accuracy = accuracy_score(merged_test_y, predictions)
precision = precision_score(merged_test_y, predictions)
recall = recall_score(merged_test_y, predictions)
f1 = f1_score(merged_test_y, predictions)
conf_matrix = confusion_matrix(merged_test_y, predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9273333333333333
Precision: 0.9138246041412911
Recall: 0.8899762752075919
F1 Score: 0.9017427884615385
Confusion Matrix:
[[21380  1132]
 [ 1484 12004]]


In [21]:

voting_model = VotingClassifier(
    estimators=[('xgboost_1', model_1), ('rf_1', models['Random Forest']),
                ('gaussian_1', models['Gaussian Naive Bayes']), ('logistic_1', models['Logistic Regression']),

                ('rf_2', models['Random Forest']), ('gaussian_2', models['Gaussian Naive Bayes']),
                ('logistic_2', models['Logistic Regression']), ('xgboost_2', model_2)
                ],
    voting='hard'  # Use 'soft' for probability averaging if needed
)

# Train the voting classifier
voting_model.fit(merged_train_X, merged_train_y)

# Make predictions
y_pred = voting_model.predict(merged_test_X)

accuracy = accuracy_score(merged_test_y, y_pred)
precision = precision_score(merged_test_y, y_pred)
recall = recall_score(merged_test_y, y_pred)
f1 = f1_score(merged_test_y, y_pred)
conf_matrix = confusion_matrix(merged_test_y, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)




Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.89425
Precision: 0.9338531863404141
Recall: 0.7724644128113879
F1 Score: 0.8455264759586123
Confusion Matrix:
[[21774   738]
 [ 3069 10419]]
