In [1]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb

In [2]:
# Read the phishing dataset and drop the 'url' column, which is not used
# as a model input in this notebook.
data = pd.read_csv("modified_phishing_dataset.csv").drop(columns=['url'])

In [3]:
# Count rows that exactly repeat an earlier row (all columns compared).
data.duplicated().sum()

174

In [4]:
# The dataset contains duplicate rows; keep only the first occurrence of each.
data = data.drop_duplicates()

In [5]:
# Re-count duplicate rows to confirm the deduplication step left none behind.
data.duplicated().sum()

0

In [6]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

(11256, 88)

In [7]:
# Separate the label column ('status') from the predictor columns.
y = data['status']
X = data.drop(columns=['status'])

In [8]:
from sklearn.feature_selection import RFE

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

# Fit a LightGBM classifier on all available features.
# NOTE(review): scikit-learn's RFE refits clones of the estimator it is given,
# so this initial fit is presumably not required by the feature-selection
# step that follows — confirm before removing.
clf = lgb.LGBMClassifier(max_depth=7, learning_rate=0.5)
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4899
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434


In [10]:
# Recursive feature elimination on the training split: the classifier is
# refit repeatedly, pruning features until 30 remain.
rfe = RFE(estimator=clf, n_features_to_select=30)
rfe.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4899
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4899
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] 

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4891
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGB

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4887
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4883
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGB

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4843
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4836
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 58
[LightGBM] [Info] [binar

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4813
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4811
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 48
[LightGBM] [Info] [binar

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4755
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434
[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4740
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 38
[LightGBM] [Info] [binar

In [11]:
# Map RFE's boolean keep-mask back onto the column labels and show them.
selected_features = X.columns[rfe.get_support()]
print(selected_features)

Index(['length_url', 'length_hostname', 'nb_dots', 'nb_hyphens', 'nb_slash',
       'nb_www', 'ratio_digits_url', 'length_words_raw', 'char_repeat',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'nb_hyperlinks',
       'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_extRedirection',
       'ratio_extErrors', 'links_in_tags', 'safe_anchor',
       'domain_registration_length', 'domain_age', 'web_traffic',
       'google_index', 'page_rank'],
      dtype='object')


In [12]:
# Restrict both splits to the columns chosen by RFE.
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]


In [13]:
# Fit a fresh LightGBM classifier (learning_rate=0.5, max_depth=7) on the
# reduced feature set only.
clf_selected = lgb.LGBMClassifier(max_depth=7, learning_rate=0.5)
clf_selected.fit(X_train_selected, y_train)

[LightGBM] [Info] Number of positive: 4438, number of negative: 4566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4094
[LightGBM] [Info] Number of data points in the train set: 9004, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492892 -> initscore=-0.028434
[LightGBM] [Info] Start training from score -0.028434


In [14]:
# Score the reduced-feature model on each split.

# Training split (measures fit to data the model has already seen).
y_pred_train = clf_selected.predict(X_train_selected)
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)

# Held-out test split.
y_pred_test = clf_selected.predict(X_test_selected)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)



In [15]:
# Print both accuracies, one per line, rounded to three decimals.
print(
    "LightGBM: Accuracy on training Data: {:.3f}".format(acc_train),
    "LightGBM: Accuracy on test Data: {:.3f}".format(acc_test),
    sep="\n",
)

LightGBM: Accuracy on training Data: 1.000
LightGBM: Accuracy on test Data: 0.967


In [16]:
# Serialize the reduced-feature classifier to disk for later reuse.
joblib.dump(value=clf_selected, filename='model_feature_selected.joblib')

['model_feature_selected.joblib']