In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from google.colab import drive
from sklearn.preprocessing import StandardScaler
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the read_csv calls visible in this notebook load from /content,
# not /content/drive -- confirm whether the mount is still required.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the two pre-merged datasets from CSV; both are expected to carry the
# 'rec_id' and 'phishing' columns used by the preprocessing functions below.
merged_df_html = pd.read_csv("/content/merged_df_html.csv")
merged_df_features = pd.read_csv("/content/merged_df_features.csv")

In [None]:
# Merge all dataframes into one
# NOTE(review): df_html_* / df_features_* are not defined in any cell visible in
# this notebook, so on a fresh kernel this cell would raise NameError. It also
# rebinds the two frames that were just loaded from CSV. Presumably this is the
# step that originally produced those CSVs -- confirm, then drop or guard it.
merged_df_html = pd.concat([df_html_0 , df_html_10000, df_html_20000, df_html_30000], ignore_index=True)
merged_df_features = pd.concat([df_features_0, df_features_10000, df_features_20000, df_features_30000], ignore_index=True)


In [None]:
def preprocess_html(train_df, k=25, test_size=0.4, random_state=42):
  """Split, scale and feature-select the HTML dataset.

  Steps:
    1. Drop 'phishing', 'rec_id' and 'ExtFavicon' from the predictors and use
       'phishing' as the label.
    2. Split into train/test with ``train_test_split``.
    3. Standardize 'total_forms' and 'total_hyperlinks' (scaler fitted on the
       training split only).
    4. Keep the ``k`` columns with the highest mutual information with the
       label, computed on the training split only.

  Args:
    train_df: DataFrame containing the columns named above.
    k: Number of features to keep; capped at the number of available columns.
    test_size: Fraction of rows held out for testing.
    random_state: Seed used for both the split and the mutual-information
      estimate, so repeated runs select the same features.

  Returns:
    (X_train, y_train, X_test, y_test) with X_* as DataFrames restricted to
    the selected columns.
  """
  # Local import keeps this cell self-contained without touching the import cell.
  from functools import partial

  columns_to_remove = [
    'phishing', 'rec_id', 'ExtFavicon',
    ]

  # Split labels and features
  labels = train_df['phishing']
  features = train_df.drop(columns_to_remove, axis=1)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)
  # Explicit copies: the column assignments below must not be treated by pandas
  # as writes into a slice of the caller's frame.
  X_train = X_train.copy()
  X_test = X_test.copy()

  columns_to_scale = [
    'total_forms', 'total_hyperlinks'
  ]

  # Standardize the features
  scaler = StandardScaler()
  X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
  X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

  # mutual_info_classif is randomized unless seeded; fix the seed so the
  # selected feature set is reproducible. Cap k so a narrow frame cannot raise.
  score_func = partial(mutual_info_classif, random_state=random_state)
  selector = SelectKBest(score_func, k=min(k, X_train.shape[1]))
  selector.fit(X_train, y_train)
  selected_feature_indices = selector.get_support(indices=True)
  selected_feature_names = X_train.columns[selected_feature_indices]

  X_train = X_train[selected_feature_names]
  X_test = X_test[selected_feature_names]
  return X_train, y_train, X_test, y_test



In [None]:
# Build the train/test split for the HTML dataset (note the return order: X_train, y_train, X_test, y_test).
html_train_X, html_train_y, html_test_X, html_test_y = preprocess_html(merged_df_html)


In [None]:
# Candidate classifiers keyed by the display name used in the printed reports.
# These instances are fitted in place by evaluate_models and then reused
# (already fitted) by evaluate_models_2.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "k-NN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "k-NN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42)
}

In [None]:
def evaluate_models(models, X_train, y_train, X_test, y_test):
  """Fit every estimator in ``models`` and print its train/test accuracy.

  The estimators are fitted in place, so after this call the objects stored in
  ``models`` are trained on (X_train, y_train). For each entry two lines are
  printed: the Python ``id`` of the estimator object, then both accuracies.
  Nothing is returned.
  """
  for model_name, estimator in models.items():
      estimator.fit(X_train, y_train)

      # Accuracy on the data the estimator was just trained on.
      train_accuracy = accuracy_score(y_train, estimator.predict(X_train))
      # Accuracy on the held-out data.
      test_accuracy = accuracy_score(y_test, estimator.predict(X_test))

      print(f"ID of {model_name}: {id(estimator)}")
      print(f"{model_name} - Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

In [None]:
def evaluate_models_2(models, X_train, y_train, X_test, y_test):
    """Print a table of test-set metrics for classifiers that are already fitted.

    This function never calls ``fit``: every estimator in ``models`` must have
    been trained beforehand (e.g. by ``evaluate_models``). ``X_train`` and
    ``y_train`` are accepted for signature parity with ``evaluate_models`` but
    are not used.

    Args:
        models: Mapping of display name -> fitted classifier.
        X_train, y_train: Unused.
        X_test, y_test: Held-out data the metrics are computed on.

    Prints one row per model with precision, recall, F1, ROC-AUC and accuracy,
    all as percentages with two decimals. AUC is shown as "-" for estimators
    without ``predict_proba``. Returns None.
    """
    print(f"{'Model':<20} {'Set':<10} {'Pre (%)':<10} {'Recall (%)':<10} {'F1-Score (%)':<15} {'AUC (%)':<10} {'ACC (%)':<10}")
    print("-" * 80)

    for model_name, model in models.items():

        # Evaluate on test set
        y_test_pred = model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class.
        y_test_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        test_precision = precision_score(y_test, y_test_pred) * 100
        test_recall = recall_score(y_test, y_test_pred) * 100
        test_f1 = f1_score(y_test, y_test_pred) * 100
        # Pre-format AUC as text so the numeric case gets two decimals like the
        # other columns (the raw float overflowed the 10-character column) while
        # the "-" placeholder still fits the same format spec.
        test_auc = f"{roc_auc_score(y_test, y_test_prob) * 100:.2f}" if y_test_prob is not None else "-"
        test_accuracy = accuracy_score(y_test, y_test_pred) * 100

        # Print test set metrics
        print(f"{model_name:<20} {'Test':<10} {test_precision:<10.2f} {test_recall:<10.2f} {test_f1:<15.2f} {test_auc:<10} {test_accuracy:<10.2f}")

In [None]:
# Fit and score every candidate model on the HTML split. This fits the estimators
# in `models` in place. (A variant of this call on the features_* split was left
# disabled here; those frames are only created further down the notebook.)
evaluate_models(models, html_train_X, html_train_y, html_test_X, html_test_y)

ID of Decision Tree: 138002685832512
Decision Tree - Train Accuracy: 0.9829, Test Accuracy: 0.9363
ID of k-NN (k=1): 138002685828528
k-NN (k=1) - Train Accuracy: 0.9641, Test Accuracy: 0.9230
ID of k-NN (k=3): 138002685828960
k-NN (k=3) - Train Accuracy: 0.9395, Test Accuracy: 0.9097
ID of Gaussian Naive Bayes: 138002685833808
Gaussian Naive Bayes - Train Accuracy: 0.8167, Test Accuracy: 0.8150
ID of Logistic Regression: 138002685831504
Logistic Regression - Train Accuracy: 0.8686, Test Accuracy: 0.8646
ID of MLP Neural Network: 138002685834768
MLP Neural Network - Train Accuracy: 0.9616, Test Accuracy: 0.9361
ID of Random Forest: 138002685835008
Random Forest - Train Accuracy: 0.9829, Test Accuracy: 0.9533



stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


ID of XGBoost: 138002685828000
XGBoost - Train Accuracy: 0.9760, Test Accuracy: 0.9469


In [None]:
# Detailed test metrics for the HTML split. evaluate_models_2 does not fit anything,
# so this relies on `models` having been fitted by the evaluate_models call above.
evaluate_models_2(models, html_train_X, html_train_y, html_test_X, html_test_y)

Model                Set        Pre (%)    Recall (%) F1-Score (%)    AUC (%)    ACC (%)   
--------------------------------------------------------------------------------
Decision Tree        Test       91.47      91.51      91.49           94.56351813132432 93.63     
k-NN (k=1)           Test       91.76      87.25      89.45           91.28424322580457 92.30     
k-NN (k=3)           Test       85.98      90.66      88.26           95.76871534803058 90.97     
Gaussian Naive Bayes Test       69.55      89.89      78.43           88.68231487355432 81.50     
Logistic Regression  Test       82.22      81.42      81.82           92.20628789480774 86.46     
MLP Neural Network   Test       93.27      89.37      91.28           97.78979567407453 93.61     
Random Forest        Test       94.56      92.87      93.70           98.8290029450394 95.33     
XGBoost              Test       93.51      92.20      92.85           98.44060475702486 94.69     


In [None]:
# Standalone XGBoost classifier trained on the HTML split.
model_1 = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_1.fit(html_train_X, html_train_y)

# Hard class predictions for the held-out rows.
y_pred = model_1.predict(html_test_X)

# Score the predictions; every metric takes (y_true, y_pred).
accuracy, precision, recall, f1 = (
    metric(html_test_y, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(html_test_y, y_pred)

# One item per output line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)



stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


Accuracy: 0.946875
Precision: 0.9350957464836468
Recall: 0.9219715956558062
F1 Score: 0.928487295978462
Confusion Matrix:
[[9632  383]
 [ 467 5518]]


In [None]:
def preprocess_features(train_df):
  """Split the feature dataset 60/40 and standardize every remaining column.

  The label is the 'phishing' column. The identifier, the label, the
  per-symbol domain counters and a set of additional columns listed below are
  removed from the predictors. The scaler is fitted on the training split only.

  Returns:
    (X_train, y_train, X_test, y_test); X_* are the arrays produced by
    StandardScaler, y_* keep the type returned by train_test_split.
  """
  id_and_label = ['phishing', 'rec_id']
  domain_symbol_counts = [
      'qty_/_domain', 'qty_?_domain', 'qty_=_domain', 'qty_@_domain',
      'qty_&_domain', 'qty_!_domain', 'qty_ _domain', 'qty_~_domain',
      'qty_,_domain', 'qty_+_domain', 'qty_*_domain', 'qty_#_domain',
      'qty_$_domain', 'qty_%_domain',
  ]
  other_excluded = [
      'time_response', 'domain_spf', 'asn_ip', 'time_domain_activation',
      'time_domain_expiration', 'qty_ip_resolved', 'qty_nameservers',
      'qty_mx_servers', 'ttl_hostname', 'tls_ssl_certificate',
      'qty_redirects', 'url_google_index', 'domain_google_index', 'url_shortened',
  ]
  excluded = id_and_label + domain_symbol_counts + other_excluded

  target = train_df['phishing']
  predictors = train_df.drop(columns=excluded)

  X_train, X_test, y_train, y_test = train_test_split(
      predictors, target, test_size=0.4, random_state=42)

  # Learn mean/variance from the training rows, then apply to both splits.
  scaler = StandardScaler().fit(X_train)
  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, y_train, X_test, y_test



In [None]:
# Build the train/test split for the feature dataset (return order: X_train, y_train, X_test, y_test).
features_train_X, features_train_y, features_test_X, features_test_y = preprocess_features(merged_df_features)


In [None]:
# Same estimator line-up and hyperparameters as `models`, created as separate
# instances so fitting on the feature split does not overwrite the estimators
# fitted on the HTML split.
models2 = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "k-NN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "k-NN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42)
}

In [None]:
# Fit and score every candidate model on the feature split (fits the estimators in `models2` in place).
evaluate_models(models2, features_train_X, features_train_y, features_test_X, features_test_y)

ID of Decision Tree: 138002673878352
Decision Tree - Train Accuracy: 0.9836, Test Accuracy: 0.8818
ID of k-NN (k=1): 138002673886752
k-NN (k=1) - Train Accuracy: 0.9781, Test Accuracy: 0.8902
ID of k-NN (k=3): 138002673883728
k-NN (k=3) - Train Accuracy: 0.9365, Test Accuracy: 0.8947
ID of Gaussian Naive Bayes: 138002673884976
Gaussian Naive Bayes - Train Accuracy: 0.7076, Test Accuracy: 0.7078
ID of Logistic Regression: 138002673889200
Logistic Regression - Train Accuracy: 0.8456, Test Accuracy: 0.8465
ID of MLP Neural Network: 138002673886704
MLP Neural Network - Train Accuracy: 0.9474, Test Accuracy: 0.9098
ID of Random Forest: 138002673885024
Random Forest - Train Accuracy: 0.9836, Test Accuracy: 0.9145



stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


ID of XGBoost: 138002673885120
XGBoost - Train Accuracy: 0.9335, Test Accuracy: 0.9158


In [None]:
# html_train_df = pd.DataFrame(html_train_X, columns=html_train_X.columns, index=html_train_y.index)
# html_train_df['phishing'] = html_train_y  # Add back the target variable
# html_test_df = pd.DataFrame(html_test_X, columns=html_test_X.columns, index=html_test_y.index)
# html_test_df['phishing'] = html_test_y  # Add back the target variable


In [None]:
# Detailed test metrics for the feature split. evaluate_models_2 does not fit anything,
# so this relies on `models2` having been fitted by the earlier evaluate_models call.
evaluate_models_2(models2, features_train_X, features_train_y, features_test_X, features_test_y)

Model                Set        Pre (%)    Recall (%) F1-Score (%)    AUC (%)    ACC (%)   
--------------------------------------------------------------------------------
Decision Tree        Test       84.73      83.33      84.02           88.36984966835186 88.18     
k-NN (k=1)           Test       85.30      85.24      85.27           88.25303388525944 89.02     
k-NN (k=3)           Test       86.37      85.20      85.78           93.33439662505845 89.47     
Gaussian Naive Bayes Test       83.41      27.01      40.81           87.75447984896942 70.78     
Logistic Regression  Test       82.88      74.15      78.27           91.95791361833727 84.65     
MLP Neural Network   Test       87.57      88.34      87.96           96.4696590425941 90.98     
Random Forest        Test       89.11      87.80      88.45           96.74086044926298 91.45     
XGBoost              Test       89.34      87.91      88.62           97.26090924614273 91.58     


In [None]:
# Standalone XGBoost classifier trained on the feature split.
model_2 = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_2.fit(features_train_X, features_train_y)

# Hard class predictions for the held-out rows.
y_pred = model_2.predict(features_test_X)

# Score the predictions; every metric takes (y_true, y_pred).
accuracy, precision, recall, f1 = (
    metric(features_test_y, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(features_test_y, y_pred)

# One item per output line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)


stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


Accuracy: 0.9158125
Precision: 0.8933742122296031
Recall: 0.8791485082132082
F1 Score: 0.8862042747317732
Confusion Matrix:
[[18816  1252]
 [ 1442 10490]]


In [None]:
# Outer-join the HTML and feature datasets on rec_id, then collapse the two
# suffixed label columns into a single 'phishing' column: take the HTML-side
# label and fall back to the feature-side label where the former is missing.
# NOTE(review): the printed result shows rec_id values occurring more than once
# (e.g. 1 and 2), so rec_id is apparently not unique on at least one side.
# Duplicated rows could land on both sides of a later train/test split --
# confirm whether that is intended.
merged_df = (
    merged_df_html
    .merge(merged_df_features, on='rec_id', how='outer')
    .assign(phishing=lambda frame: frame['phishing_x'].combine_first(frame['phishing_y']))
    .drop(columns=['phishing_x', 'phishing_y'])
)

# Output the final DataFrame
print(merged_df)

       rec_id  script_files_ratio  css_files_ratio  image_files_ratio  \
0           1            0.000000         0.000000           0.571429   
1           1            0.000000         0.000000           0.571429   
2           2            0.142857         0.071429           0.142857   
3           2            0.142857         0.071429           0.142857   
4           3            0.031250         0.010417           0.052083   
...       ...                 ...              ...                ...   
89995   79996                 NaN              NaN                NaN   
89996   79997                 NaN              NaN                NaN   
89997   79998                 NaN              NaN                NaN   
89998   79999                 NaN              NaN                NaN   
89999   80000                 NaN              NaN                NaN   

       anchor_files_ratio  empty_anchor_ratio  null_hyperlink_ratio  \
0                0.142857                 0.0       

In [None]:
def preprocess_merged_df(train_df, k=40, test_size=0.4, random_state=42, features_df=None):
  """Split, scale, impute and feature-select the merged HTML + feature dataset.

  Steps:
    1. Drop the identifier, the label and the excluded columns listed below;
       use 'phishing' as the label.
    2. Split into train/test with ``train_test_split``.
    3. Standardize every column of ``features_df`` that was not excluded, plus
       'total_forms' and 'total_hyperlinks' (scaler fitted on train only).
    4. Fill missing values with the per-column most frequent training value.
    5. Keep the ``k`` columns with the highest mutual information with the
       label, computed on the training split only.

  Args:
    train_df: Merged DataFrame containing all columns referenced above.
    k: Number of features to keep; capped at the number of available columns.
    test_size: Fraction of rows held out for testing.
    random_state: Seed for the split and the mutual-information estimate.
    features_df: Frame whose columns define the "feature dataset" columns to
      scale. Defaults to the notebook-level ``merged_df_features``.

  Returns:
    (X_train, y_train, X_test, y_test). X_* are DataFrames that keep the row
    index of the split, so they stay aligned with y_*.
  """
  # Local import keeps this cell self-contained without touching the import cell.
  from functools import partial

  columns_to_remove = [
      'phishing', 'rec_id',
      'qty_/_domain', 'qty_?_domain', 'qty_=_domain', 'qty_@_domain',
      'qty_&_domain', 'qty_!_domain', 'qty_ _domain', 'qty_~_domain',
      'qty_,_domain', 'qty_+_domain', 'qty_*_domain', 'qty_#_domain',
      'qty_$_domain', 'qty_%_domain',
      'time_response', 'domain_spf', 'asn_ip', 'time_domain_activation',
      'time_domain_expiration', 'qty_ip_resolved', 'qty_nameservers',
      'qty_mx_servers', 'ttl_hostname', 'tls_ssl_certificate',
      'qty_redirects', 'url_google_index', 'domain_google_index', 'url_shortened', 'ExtFavicon'
  ]

  if features_df is None:
    # Backward-compatible fallback to the frame loaded earlier in the notebook.
    features_df = merged_df_features

  # Split labels and features
  labels = train_df['phishing']
  features = train_df.drop(columns_to_remove, axis=1)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)
  # Explicit copies: the column assignments below must not be treated by pandas
  # as writes into a slice of the caller's frame.
  X_train = X_train.copy()
  X_test = X_test.copy()
  feature_names = X_train.columns

  feature_columns_to_scale = features_df.columns.difference(columns_to_remove)
  html_columns_to_scale = [
    'total_forms', 'total_hyperlinks'
  ]
  columns_to_scale = feature_columns_to_scale.tolist() + html_columns_to_scale

  # Standardize the features
  scaler = StandardScaler()
  X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
  X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

  # The imputer returns bare arrays. Rebuild the frames with the original column
  # names AND the original row index; a fresh RangeIndex would no longer line up
  # with y_train / y_test, which keep the index of the split.
  imputer = SimpleImputer(strategy='most_frequent')
  X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_names, index=X_train.index)
  X_test = pd.DataFrame(imputer.transform(X_test), columns=feature_names, index=X_test.index)

  # mutual_info_classif is randomized unless seeded; fix the seed so the
  # selected feature set is reproducible. Cap k so a narrow frame cannot raise.
  score_func = partial(mutual_info_classif, random_state=random_state)
  selector = SelectKBest(score_func, k=min(k, X_train.shape[1]))
  selector.fit(X_train, y_train)
  selected_feature_indices = selector.get_support(indices=True)
  selected_feature_names = X_train.columns[selected_feature_indices]

  X_train = X_train[selected_feature_names]
  X_test = X_test[selected_feature_names]

  return X_train, y_train, X_test, y_test

In [None]:
# Build the train/test split for the merged dataset (return order: X_train, y_train, X_test, y_test).
merged_train_X, merged_train_y, merged_test_X, merged_test_y = preprocess_merged_df(merged_df)

In [None]:
# Standalone XGBoost classifier trained on the merged split.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(merged_train_X, merged_train_y)

# Hard class predictions for the held-out rows.
predictions = xgb_model.predict(merged_test_X)

# Score the predictions; every metric takes (y_true, y_pred).
accuracy, precision, recall, f1 = (
    metric(merged_test_y, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(merged_test_y, predictions)

# One item per output line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)


stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


Accuracy: 0.9265833333333333
Precision: 0.9139628979311398
Recall: 0.8876037959667853
F1 Score: 0.9005905141610562
Confusion Matrix:
[[21385  1127]
 [ 1516 11972]]


In [None]:
# Hard-voting ensemble evaluated on the merged split.
#
# VotingClassifier fits *clones* of the listed estimators on the data passed to
# fit(); whatever the originals were trained on earlier is not reused. The
# previous estimator list named every model twice: the 'rf_2' / 'gaussian_2' /
# 'logistic_2' entries pointed at the same `models[...]` objects as the '_1'
# entries, and model_2 is constructed with the same arguments as model_1. Each
# pair was therefore the same configuration fitted on the same data, casting the
# same vote twice at twice the training cost. Each configuration is now listed
# once; doubling every vote does not change which class has the majority.
#
# NOTE(review): with an even number of voters a 2-2 split is possible, and
# scikit-learn breaks hard-voting ties in favour of the lower class label.
# Consider an odd number of diverse estimators or voting='soft'.
voting_model = VotingClassifier(
    estimators=[
        ('xgboost_1', model_1),
        ('rf_1', models['Random Forest']),
        ('gaussian_1', models['Gaussian Naive Bayes']),
        ('logistic_1', models['Logistic Regression']),
    ],
    voting='hard'  # Use 'soft' for probability averaging if needed
)

# Train the voting classifier
voting_model.fit(merged_train_X, merged_train_y)

# Make predictions
y_pred = voting_model.predict(merged_test_X)

accuracy = accuracy_score(merged_test_y, y_pred)
precision = precision_score(merged_test_y, y_pred)
recall = recall_score(merged_test_y, y_pred)
f1 = f1_score(merged_test_y, y_pred)
conf_matrix = confusion_matrix(merged_test_y, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)





stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba

stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.10/dist-packages/numba/cuda/cudadrv/driver.py", line 295, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER

Accuracy: 0.90
Accuracy: 0.8956666666666667
Precision: 0.932456452186278
Recall: 0.777876631079478
F1 Score: 0.8481810832659661
Confusion Matrix:
[[21752   760]
 [ 2996 10492]]
