In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('phishing.csv', index_col='Index')
df

Unnamed: 0_level_0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11049,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11050,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11051,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,-1,1,0,1,-1
11052,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,1,1,1,-1,1,1,1,-1


In [3]:
df.columns

Index(['UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon',
       'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 'AnchorURL',
       'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick',
       'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording',
       'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'LinksPointingToPage',
       'StatsReport', 'class'],
      dtype='object')

In [4]:
df.drop(columns=['PrefixSuffix-', 'PageRank', 'WebsiteTraffic',
        'HTTPSDomainURL', 'RequestURL', 'AnchorURL', 'ServerFormHandler',
        'WebsiteForwarding', 'StatusBarCust', 'UsingPopupWindow', 'IframeRedirection',
        'DNSRecording', 'GoogleIndex', 'LinksPointingToPage', 'StatsReport',], inplace=True)

df

Unnamed: 0_level_0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,SubDomains,HTTPS,DomainRegLen,Favicon,NonStdPort,LinksInScriptTags,InfoEmail,AbnormalURL,DisableRightClick,AgeofDomain,class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1,1,1,1,1,0,1,-1,1,1,-1,1,1,1,-1,-1
1,1,0,1,1,1,-1,-1,-1,1,1,-1,-1,-1,1,1,-1
2,1,0,1,1,1,-1,-1,1,1,1,0,1,1,1,-1,-1
3,1,0,-1,1,1,1,1,-1,1,1,0,1,1,1,-1,1
4,-1,0,-1,1,-1,1,1,-1,1,1,0,-1,-1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11049,1,-1,1,-1,1,1,1,-1,-1,-1,1,-1,1,-1,1,1
11050,-1,1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1,1,-1
11051,1,-1,1,1,1,1,-1,-1,1,1,-1,1,1,1,1,-1
11052,-1,-1,1,1,1,-1,-1,1,-1,1,1,1,1,1,1,-1


In [5]:
df.isnull().any()

UsingIP              False
LongURL              False
ShortURL             False
Symbol@              False
Redirecting//        False
SubDomains           False
HTTPS                False
DomainRegLen         False
Favicon              False
NonStdPort           False
LinksInScriptTags    False
InfoEmail            False
AbnormalURL          False
DisableRightClick    False
AgeofDomain          False
class                False
dtype: bool

In [6]:
df['class'].value_counts()

class
 1    6157
-1    4897
Name: count, dtype: int64

In [7]:
is_phishing = df[df['class'] == 1]
not_phishing = df[df['class'] == -1]

is_phishing_upsampled = resample(is_phishing, n_samples=len(not_phishing))

df_balanced = pd.concat([is_phishing_upsampled, not_phishing])
df_balanced['class'].value_counts()

class
 1    4897
-1    4897
Name: count, dtype: int64

In [8]:
x_data = df_balanced.drop(columns=['class'])
y_data = df_balanced['class']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

In [9]:
linmodel = LinearRegression()

linmodel.fit(x_train, y_train)

y_pred = linmodel.predict(x_test)

y_pred_class = np.where(y_pred > 0.5, 1, -1)

accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy: {accuracy}\n")

report = classification_report(y_test, y_pred_class)
print("Classification Report:")
print(report)

Accuracy: 0.8356304236855538

Classification Report:
              precision    recall  f1-score   support

          -1       0.78      0.94      0.85       961
           1       0.92      0.74      0.82       998

    accuracy                           0.84      1959
   macro avg       0.85      0.84      0.83      1959
weighted avg       0.85      0.84      0.83      1959



In [10]:
logmodel = LogisticRegression(solver='newton-cg')

logmodel.fit(x_train, y_train)

y_pred = logmodel.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}\n")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.8795303726391016

Classification Report:
              precision    recall  f1-score   support

          -1       0.90      0.84      0.87       961
           1       0.86      0.91      0.89       998

    accuracy                           0.88      1959
   macro avg       0.88      0.88      0.88      1959
weighted avg       0.88      0.88      0.88      1959



In [11]:
param_grid = {
    'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
    'weights': ['uniform', 'distance',],
    'metric': ['euclidean', 'manhattan', 'chebyshev'],
}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

params = grid_search.best_params_

print("Best Hyperparameters:", params)

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}


In [12]:
knn = KNeighborsClassifier(n_neighbors=params['n_neighbors'], weights=params['weights'], metric=params['metric'])

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.9060745278203165
Classification Report:
              precision    recall  f1-score   support

          -1       0.92      0.88      0.90       961
           1       0.89      0.93      0.91       998

    accuracy                           0.91      1959
   macro avg       0.91      0.91      0.91      1959
weighted avg       0.91      0.91      0.91      1959

