In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

df = pd.read_csv("dataset_phishing.csv")

# Step 2: EDA
print(df.shape)
print(df.info())
print(df.describe())
print(df['status'].value_counts())
print(df.columns.tolist())

(11430, 89)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430 non-null  int64  
 13  nb_percent         

In [4]:
# Unstack and sort correlation pairs

corr = df.corr()
high_corr = corr[(corr > 0.8) | (corr < -0.8)]
high_corr_pairs = high_corr.unstack().dropna().sort_values(ascending=False)
print("\nTop correlated feature pairs (abs > 0.8):")
display(high_corr_pairs)

  corr = df.corr()



Top correlated feature pairs (abs > 0.8):


length_url            length_url              1.000000
brand_in_subdomain    brand_in_subdomain      1.000000
ratio_extRedirection  ratio_extRedirection    1.000000
nb_extCSS             nb_extCSS               1.000000
ratio_extHyperlinks   ratio_extHyperlinks     1.000000
                                                ...   
nb_and                nb_eq                   0.906404
longest_word_host     avg_word_host           0.816313
avg_word_host         longest_word_host       0.816313
shortest_word_host    avg_word_host           0.800014
avg_word_host         shortest_word_host      0.800014
Length: 90, dtype: float64

In [21]:
desc_stats = df.describe()
display(desc_stats.value_counts(normalize=True))


length_url    length_hostname  ip            nb_dots       nb_hyphens    nb_at         nb_qm         nb_and        nb_or    nb_eq         nb_underscore  nb_tilde      nb_percent    nb_slash      nb_star       nb_colon      nb_comma      nb_semicolumn  nb_dollar     nb_space      nb_www        nb_com        nb_dslash     http_in_path  https_token   ratio_digits_url  ratio_digits_host  punycode      port          tld_in_path   tld_in_subdomain  abnormal_subdomain  nb_subdomains  prefix_suffix  random_domain  shortening_service  path_extension  nb_redirection  nb_external_redirection  length_words_raw  char_repeat   shortest_words_raw  shortest_word_host  shortest_word_path  longest_words_raw  longest_word_host  longest_word_path  avg_words_raw  avg_word_host  avg_word_path  phish_hints   domain_in_brand  brand_in_subdomain  brand_in_path  suspecious_tld  statistical_report  nb_hyperlinks  ratio_intHyperlinks  ratio_extHyperlinks  ratio_nullHyperlinks  nb_extCSS     ratio_intRedirection  

In [6]:
# checking the outliers of the dataset

numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_df.corr()
corr_matrix = numeric_df.corr()
outlier = {}

for column in numeric_df.columns:
    Q1 = numeric_df[column].quantile(0.25)
    Q3 = numeric_df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = numeric_df[(numeric_df[column] < lower_bound) | (numeric_df[column] > upper_bound)]
    outliercount = outliers.shape[0]
    
    if outliercount > 0:
        outlier[column] = outliercount

# Display features with outliers
print("Features with outliers:")
for feature, count in outlier.items():
    print(f"{feature}:   {count} outliers")


Features with outliers:
length_url:   620 outliers
length_hostname:   775 outliers
ip:   1721 outliers
nb_dots:   567 outliers
nb_hyphens:   1371 outliers
nb_at:   245 outliers
nb_qm:   1555 outliers
nb_and:   761 outliers
nb_eq:   1564 outliers
nb_underscore:   1695 outliers
nb_tilde:   76 outliers
nb_percent:   355 outliers
nb_slash:   401 outliers
nb_star:   8 outliers
nb_colon:   197 outliers
nb_comma:   24 outliers
nb_semicolumn:   248 outliers
nb_dollar:   11 outliers
nb_space:   210 outliers
nb_com:   1327 outliers
nb_dslash:   75 outliers
http_in_path:   150 outliers
ratio_digits_url:   933 outliers
ratio_digits_host:   1503 outliers
punycode:   4 outliers
port:   27 outliers
tld_in_path:   750 outliers
tld_in_subdomain:   573 outliers
abnormal_subdomain:   247 outliers
prefix_suffix:   2314 outliers
random_domain:   952 outliers
shortening_service:   1411 outliers
path_extension:   2 outliers
nb_redirection:   166 outliers
nb_external_redirection:   36 outliers
length_words_ra

In [12]:
# 5: Preparing the x & y cloumn 
X = df.drop(['url', 'status'], axis=1)
y = df['status']

# 6: Spliting the dataset (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7: Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [16]:
print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])
print("Train distribution:", y_train.value_counts(normalize=True))
print("Test distribution:", y_test.value_counts(normalize=True))

Train size: 9144
Test size: 2286
Train distribution: 1    0.501531
0    0.498469
Name: status, dtype: float64
Test distribution: 0    0.506124
1    0.493876
Name: status, dtype: float64
