In [1]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



In [2]:
def mean_score(scoring):
    """Average every entry of a results mapping.

    Parameters
    ----------
    scoring : mapping
        Maps a name to an object that exposes ``.mean()`` (in this notebook,
        the dict of per-fold arrays returned by ``cross_validate``).

    Returns
    -------
    dict
        A new dict with the same keys, in the same order, where each value is
        the result of calling ``.mean()`` on the corresponding input value.
    """
    averaged = {}
    for metric_name, fold_values in scoring.items():
        averaged[metric_name] = fold_values.mean()
    return averaged

In [3]:
# The dataset lives one directory above the current working directory;
# abspath() resolves the relative ".." entry against the working directory.
parent_dir = os.path.abspath(os.pardir)

# Build the full path to dataset.csv in that parent folder and load it.
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

In [4]:
# Show the column labels of the frame that was just read from dataset.csv.
df.columns 

Index(['PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'Result'],
      dtype='object')

In [5]:
# (rows, columns) of the raw frame, before any cleaning is applied.
df.shape

(9713, 11)

In [6]:
# Remove exact repeated rows (keeping the first occurrence), then discard any
# row that still has a missing value in at least one column.
data_no_duplicates = df.drop_duplicates(keep="first")
df = data_no_duplicates.dropna(axis=0, how="any")

In [7]:
# Discard the PhishID column (presumably a record identifier rather than a
# feature -- confirm against the dataset description).
df = df.drop(columns=['PhishID'])

In [8]:
# Shape after removing duplicate rows, rows with missing values and PhishID.
df.shape

(6072, 10)

In [9]:
# Investigate the distinct values found in each column.
# num_of_redirects and checking_cookies are not inspected; neither name
# appears among the columns of this frame.
inspected_columns = [
    'frequency_of_a_tags',
    'frequency_of_alltags',
    'presence_of_iframes',
    'presence_of_popups',
    'right_click_disabling',
    'checking_sfh',
    'request_url',
    'url-of-anchor',
    'links_in_meta_img',
    'Result',
]
per_column_uniques = [df[name].unique() for name in inspected_columns]

# Bind each result to its own name, in the same order as inspected_columns.
(
    unique_frequency_a,
    unique_all,
    unique_iframes,
    unique_popups,
    unique_rightclick,
    unique_sfh,
    unique_request,
    unique_url,
    unique_meta,
    unique_R,
) = per_column_uniques

# One line of output per column, in column order.
for distinct_values in per_column_uniques:
    print(distinct_values)

['0' '1' '-1' 'frequency_of_a_tags']
['0' '-1' '1' 'frequency_of_alltags']
['1' 'presence_of_iframes']
['1' '0' '-1' 'presence_of_popups']
['1' 'right_click_disabling' '-1']
['-1' '1' '0' 'checking_sfh']
['-1' '1' '0' 'request_url']
['-1' '0' '1' 'url-of-anchor']
['-1' '1' '0' 'links_in_meta_img']
['1' 'Result' '-1']


In [10]:
# The only values a cell may hold after cleaning.
validValues = {-1, 0, 1}

# Coerce every column to numbers in a single pass; entries that cannot be
# parsed (e.g. stray header text inside the data) become NaN.
numeric_df = df.apply(pd.to_numeric, errors='coerce')

# A row survives only if every one of its columns holds a valid value.
# NaN never matches, so rows with unparsable entries are removed as well.
rows_all_valid = numeric_df.isin(validValues).all(axis=1)

# With the invalid rows gone, every column can be cast to plain integers.
df = numeric_df[rows_all_valid].astype(int)

In [11]:
# Print the cleaned frame: after the preceding filtering step every remaining
# cell is one of the integers -1, 0 or 1.
print(df)

      frequency_of_a_tags  frequency_of_alltags  presence_of_iframes  \
0                       0                     0                    1   
1                       1                     0                    1   
2                       1                     0                    1   
3                       1                     0                    1   
4                       0                     0                    1   
...                   ...                   ...                  ...   
6875                   -1                     0                    1   
6876                   -1                     0                    1   
6877                   -1                     0                    1   
6878                   -1                     0                    1   
6879                    0                     0                    1   

      presence_of_popups  right_click_disabling  checking_sfh  request_url  \
0                      1                      1          

In [12]:
# Shape after discarding rows that held anything other than -1, 0 or 1.
df.shape

(6071, 10)

In [13]:
# Fixed seed for the row shuffle. Without it the row order differs on every
# run, so any later result that depends on row order cannot be reproduced.
SHUFFLE_SEED = 42

# Shuffle the rows reproducibly.
df = sklearn.utils.shuffle(df, random_state=SHUFFLE_SEED)

# Feature matrix: every column except the target, passed through
# preprocessing.scale().
# NOTE(review): scale() sees all rows here, i.e. before cross_validate() splits
# the data, so rows that later serve as held-out folds also contribute to the
# scaling statistics. Consider fitting the scaler inside each fold instead.
X = df.drop("Result", axis=1).values
X = preprocessing.scale(X)

# Target vector taken from the Result column.
y = df['Result'].values

# Preview the first rows of the shuffled frame.
df.head()

Unnamed: 0,frequency_of_a_tags,frequency_of_alltags,presence_of_iframes,presence_of_popups,right_click_disabling,checking_sfh,request_url,url-of-anchor,links_in_meta_img,Result
4903,0,0,1,1,1,1,1,1,1,-1
5644,-1,0,1,1,1,-1,-1,-1,1,-1
4364,-1,0,1,1,1,-1,1,-1,-1,-1
3817,0,0,1,0,1,-1,1,-1,1,1
5270,0,0,1,1,1,-1,-1,-1,-1,-1


In [14]:
# Metrics requested from cross_validate(); each entry maps a result label to
# the scorer name with the same spelling.
scoring = {metric: metric for metric in ('accuracy', 'recall', 'precision', 'f1')}

# Value passed as cv= to every cross_validate() call below.
fold_count = 10

## SVM-Linear

In [15]:
# Support-vector classifier with a linear kernel, cross-validated on X / y
# with cv=fold_count and the metrics listed in `scoring`.
linear_clf = svm.SVC(kernel='linear')
cross_val_scores = cross_validate(
    linear_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Average each per-fold array down to one number.
linear_svc_clf_score = mean_score(cross_val_scores)

# Report the averaged timings and metrics, one "<label> = <value>" per line.
for label, result_key in (
    ("fit time", "fit_time"),
    ("score time", "score_time"),
    ("accuracy", "test_accuracy"),
    ("recall", "test_recall"),
    ("precision", "test_precision"),
    ("f1", "test_f1"),
):
    print(f"{label} = {linear_svc_clf_score[result_key]}")

fit time = 0.59933021068573
score time = 0.04351842403411865
accuracy = 0.6694114172374924
recall = 0.9404053820941864
precision = 0.6397588200831991
f1 = 0.7614545374608873


## SVM-Polynomial

In [16]:
# Support-vector classifier with a polynomial kernel, cross-validated on
# X / y with cv=fold_count and the metrics listed in `scoring`.
poly_clf = svm.SVC(kernel='poly')
cross_val_scores = cross_validate(
    poly_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Average each per-fold array down to one number.
poly_svc_clf_score = mean_score(cross_val_scores)

# Report the averaged timings and metrics, one "<label> = <value>" per line.
for label, result_key in (
    ("fit time", "fit_time"),
    ("score time", "score_time"),
    ("accuracy", "test_accuracy"),
    ("recall", "test_recall"),
    ("precision", "test_precision"),
    ("f1", "test_f1"),
):
    print(f"{label} = {poly_svc_clf_score[result_key]}")

fit time = 0.6659755229949951
score time = 0.041745972633361814
accuracy = 0.7255787739530043
recall = 0.8314611005692599
precision = 0.7220221522067384
f1 = 0.7726700105742677


## SVM-RBF

In [17]:
# Support-vector classifier with an RBF kernel, cross-validated on X / y
# with cv=fold_count and the metrics listed in `scoring`.
rbf_clf = svm.SVC(kernel='rbf')
cross_val_scores = cross_validate(
    rbf_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Average each per-fold array down to one number.
rbf_svc_clf_score = mean_score(cross_val_scores)

# Report the averaged timings and metrics, one "<label> = <value>" per line.
for label, result_key in (
    ("fit time", "fit_time"),
    ("score time", "score_time"),
    ("accuracy", "test_accuracy"),
    ("recall", "test_recall"),
    ("precision", "test_precision"),
    ("f1", "test_f1"),
):
    print(f"{label} = {rbf_svc_clf_score[result_key]}")


fit time = 0.626070499420166
score time = 0.07366483211517334
accuracy = 0.7387564488858059
recall = 0.8153217181300672
precision = 0.7439773533928746
f1 = 0.7779085639969064


## SVM-Sigmoid

In [18]:
# Support-vector classifier with a sigmoid kernel, cross-validated on X / y
# with cv=fold_count and the metrics listed in `scoring`.
sigmoid_clf = svm.SVC(kernel='sigmoid')
cross_val_scores = cross_validate(
    sigmoid_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Average each per-fold array down to one number.
sigmoid_svc_clf_score = mean_score(cross_val_scores)

# Report the averaged timings and metrics, one "<label> = <value>" per line.
for label, result_key in (
    ("fit time", "fit_time"),
    ("score time", "score_time"),
    ("accuracy", "test_accuracy"),
    ("recall", "test_recall"),
    ("precision", "test_precision"),
    ("f1", "test_f1"),
):
    print(f"{label} = {sigmoid_svc_clf_score[result_key]}")

fit time = 1.0144456386566163
score time = 0.06407933235168457
accuracy = 0.6236237860920836
recall = 0.6614731757805762
precision = 0.6666690677102016
f1 = 0.6630686164588154
