In [1]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



## Function to calculate the scores

In [2]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

## Loading the data
- Load the data and clean it for unique values, later shuffle it with specific seeding

In [3]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

In [3]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset-1.csv")
df = pd.read_csv(filename)

In [4]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset-2.csv")
df = pd.read_csv(filename)

## Data Preprocessing

In [5]:
df.columns

Index(['Length of URL', 'Has IP address', 'Shortening Service',
       'Having @ Symbol', 'Double Slash Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS in Domain', 'Sensitive Words', 'Has Tilde', 'Has Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current Domain Age', 'Matching Domain Name',
       'Length of Domain', 'Google page index', 'Result'],
      dtype='object')

In [6]:
df.shape

(5790, 27)

In [7]:
data_no_duplicates = df.drop_duplicates()
df = data_no_duplicates.dropna()

In [8]:
df.shape

(5790, 27)

In [9]:
df = df.drop('PhishID', axis = 1)

In [10]:
df.rename(columns={'Length of URL':'Length_of_URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words':'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'Current Domain Age':'Current_Domain_Age' , 'Matching Domain Name':'Matching_Domain_Name', 'Length of Domain':'Length_of_Domain', 'Google page index':'Google_page_index','Result':'Result'}, inplace=True)

In [11]:
# Investigate unique values in the each column
unique_frequency_a = df['frequency_of_a_tags'].unique()
unique_all = df['frequency_of_alltags'].unique()
unique_iframes = df['presence_of_iframes'].unique()
unique_popups = df['presence_of_popups'].unique()
unique_rightclick = df['right_click_disabling'].unique()
unique_sfh = df['checking_sfh'].unique()
unique_request = df['request_url'].unique()
unique_url = df['url-of-anchor'].unique()
unique_meta = df['links_in_meta_img'].unique()
unique_R = df['Result'].unique()
unique_IP = df['Has_IP_address'].unique()
unique_SS = df['Shortening_Service'].unique()
unique_HAS = df['Having_@_Symbol'].unique()
unique_DSR = df['Double_Slash_Redirecting'].unique()
unique_PS = df['Prefix-Suffix'].unique()
unique_CTLD = df['CTLD'].unique()
unique_HID = df['HTTPS_in_Domain'].unique()
unique_SW = df['Sensitive_Words'].unique()
unique_HT = df['Has_Tilde'].unique()
unique_HP = df['Has_Port'].unique()
unique_hidden = df['check_hidden_content'].unique()
unique_R = df['Result'].unique()

print(unique_frequency_a)
print(unique_all)
print(unique_iframes)
print(unique_popups)
print(unique_rightclick)
print(unique_sfh)
print(unique_request)
print(unique_url)
print(unique_meta)
print(unique_R)
print(unique_IP)
print(unique_SS)
print(unique_HAS)
print(unique_DSR)
print(unique_PS)
print(unique_CTLD)
print(unique_HID)
print(unique_SW)
print(unique_HT)
print(unique_HP)
print(unique_hidden)
print(unique_R)

['-1' '1' 'frequency_of_a_tags']
['1' '0' '-1' 'frequency_of_alltags']
['1' 'presence_of_iframes']
['1' '-1' 'presence_of_popups']
['1' 'right_click_disabling' '-1']
['-1' '1' 'checking_sfh']
['-1' '1' '0' 'request_url']
['-1' '0' '1' 'url-of-anchor']
['-1' '1' '0' 'links_in_meta_img']
['1' 'Result' '-1']
['1' '-1' 'Has IP address']
['-1' '1' 'Shortening Service']
['1' '-1' 'Having @ Symbol']
['1' '-1' 'Double Slash Redirecting']
['-1' '1' 'Prefix-Suffix']
['1' '-1' '0' 'CTLD']
['-1' 'HTTPS in Domain']
['1' '-1' 'Sensitive Words']
['1' '-1' 'Has Tilde']
['-1' '1' 'Has Port']
['1' '-1' 'check_hidden_content']
['1' 'Result' '-1']


### Remove all the values other than {-1, 0, 1} from the data

In [12]:
validValues = {-1, 0, 1}

for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df = df[df[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
df = df.astype(int)

In [13]:
# Now you can use the DataFrame with valid integer values (1, -1, and 0)
print(df)

      Length_of_URL  Has_IP_address  Shortening_Service  Having_@_Symbol  \
0                 1               1                  -1                1   
1                 1               1                   1                1   
2                 1               1                   1                1   
3                 1               1                   1                1   
4                 1               1                   1                1   
...             ...             ...                 ...              ...   
5785              1               1                   1                1   
5786              1               1                   1                1   
5787              1               1                   1                1   
5788              1               1                   1                1   
5789              1               1                   1                1   

      Double_Slash_Redirecting  Prefix-Suffix  CTLD  HTTPS_in_Domain  \
0              

In [14]:
df.shape

(5789, 26)

### Data Preprocessing
- Shuffle the data, and split it into train and test data

In [15]:
df = sklearn.utils.shuffle(df)
X = df.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
df.head()

Unnamed: 0,Length_of_URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,...,checking_sfh,request_url,url-of-anchor,links_in_meta_img,check_hidden_content,Current_Domain_Age,Matching_Domain_Name,Length_of_Domain,Google_page_index,Result
5324,-1,1,1,1,1,1,0,-1,1,1,...,-1,1,1,0,1,1,-1,1,1,-1
2857,-1,1,1,1,1,1,1,-1,1,1,...,-1,1,1,1,1,1,-1,1,1,1
3397,1,1,1,1,1,1,-1,-1,1,1,...,-1,-1,-1,1,1,1,-1,1,1,-1
1441,1,1,1,1,1,1,1,-1,1,1,...,-1,1,1,1,-1,1,-1,1,1,1
4639,-1,1,1,1,1,-1,0,-1,1,1,...,1,1,-1,1,1,-1,-1,1,1,-1


## Evalution Metrics

In [16]:
scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1'}
fold_count=10

## Random Forest

In [17]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

fit time = 0.3210896968841553
score time = 0.019525671005249025
accuracy = 0.8992945120748695
recall = 0.9247274068091234
precision = 0.9047243747131443
f1 = 0.9145480290157664


In [18]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

fit time = 0.33159356117248534
score time = 0.021239590644836426
accuracy = 0.8982576450269228
recall = 0.9220567836637226
precision = 0.9052715601195324
f1 = 0.9135295153622215


In [19]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

fit time = 0.3129581451416016
score time = 0.019349288940429688
accuracy = 0.8991209040763515
recall = 0.9232472389514161
precision = 0.9056755147805191
f1 = 0.9142807468288948


# Validation Against Virus Total

In [20]:

from sklearn.metrics import  recall_score, precision_score, f1_score

In [21]:
# Load and preprocess the training dataset
training_data = pd.read_csv("../dataset-2.csv")
training_data.columns

Index(['Length of URL', 'Has IP address', 'Shortening Service',
       'Having @ Symbol', 'Double Slash Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS in Domain', 'Sensitive Words', 'Has Tilde', 'Has Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current Domain Age', 'Matching Domain Name',
       'Length of Domain', 'Google page index', 'Result'],
      dtype='object')

In [22]:
training_data.shape

(5790, 27)

In [23]:
data_no_duplicates = training_data.drop_duplicates()
training_data = data_no_duplicates.dropna()

training_data = training_data.drop('PhishID', axis = 1)


training_data.rename(columns={'Length of URL':'Length_of_URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words':'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'Current Domain Age':'Current_Domain_Age' , 'Matching Domain Name':'Matching_Domain_Name', 'Length of Domain':'Length_of_Domain', 'Google page index':'Google_page_index','Result':'Result'}, inplace=True)

In [24]:
validValues = {-1, 0, 1}

for column in training_data.columns:
    training_data[column] = pd.to_numeric(training_data[column], errors='coerce')
    training_data = training_data[training_data[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
training_data = training_data.astype(int)

training_data.shape

(5789, 26)

In [25]:
training_data = sklearn.utils.shuffle(training_data)
X = training_data.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = training_data['Result'].values
training_data.head()

Unnamed: 0,Length_of_URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,...,checking_sfh,request_url,url-of-anchor,links_in_meta_img,check_hidden_content,Current_Domain_Age,Matching_Domain_Name,Length_of_Domain,Google_page_index,Result
5391,-1,1,1,1,1,-1,1,-1,1,1,...,-1,1,-1,1,-1,1,-1,1,1,-1
1001,-1,1,1,1,1,1,1,-1,1,1,...,-1,-1,0,0,1,1,1,1,1,1
4527,-1,1,1,1,1,-1,1,-1,1,1,...,1,1,1,1,1,1,-1,1,1,-1
5521,1,1,1,1,1,-1,0,-1,1,1,...,1,1,1,0,1,1,-1,1,1,-1
1023,-1,1,1,1,1,1,0,-1,1,1,...,-1,-1,-1,0,-1,1,-1,1,1,1


In [26]:
# Train the AdaBoostClassifier on the training dataset
# adaboost_clf = AdaBoostClassifier()
rforest_clf.fit(X, y)

In [27]:
# Load and preprocess the testing dataset
testing_data = pd.read_csv("../Unconfirmed-Data.csv")
testing_data.shape

(235, 27)

In [28]:
data_no_duplicates = testing_data.drop_duplicates()
testing_data = data_no_duplicates.dropna()

testing_data = testing_data.drop('PhishID', axis = 1)

testing_data.rename(columns={'Length of URL':'Length_of_URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words':'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'Current Domain Age':'Current_Domain_Age' , 'Matching Domain Name':'Matching_Domain_Name', 'Length of Domain':'Length_of_Domain', 'Google page index':'Google_page_index','Result':'Result'}, inplace=True)

In [29]:
validValues = {-1, 0, 1}

for column in testing_data.columns:
    testing_data[column] = pd.to_numeric(testing_data[column], errors='coerce')
    testing_data = testing_data[testing_data[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
testing_data = testing_data.astype(int)

testing_data.shape
testing_data = sklearn.utils.shuffle(testing_data)
X_test = testing_data.drop("Result",axis=1).values
X_test = preprocessing.scale(X_test)
y_test = testing_data['Result'].values
testing_data.head()

Unnamed: 0,Length_of_URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,...,checking_sfh,request_url,url-of-anchor,links_in_meta_img,check_hidden_content,Current_Domain_Age,Matching_Domain_Name,Length_of_Domain,Google_page_index,Result
85,0,1,1,1,1,1,1,-1,1,1,...,-1,1,-1,1,1,-1,-1,1,-1,1
34,-1,1,-1,1,-1,1,-1,-1,-1,1,...,-1,-1,1,1,1,1,-1,1,-1,1
74,-1,-1,1,1,1,1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,0,0,0,-1,1
97,1,1,1,1,1,-1,1,-1,1,1,...,-1,-1,-1,-1,-1,-1,-1,1,1,1
107,-1,1,1,1,-1,1,-1,-1,-1,-1,...,-1,-1,1,1,1,1,-1,1,1,1


In [30]:
# Use the trained AdaBoostClassifier to predict on the testing dataset
y_pred = rforest_clf.predict(X_test)

In [31]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy on testing dataset: {accuracy}")
print(f"Recall on testing dataset: {recall}")
print(f"Precision on testing dataset: {precision}")
print(f"F1-score on testing dataset: {f1}")

Accuracy on testing dataset: 0.8170212765957446
Recall on testing dataset: 0.8268398268398268
Precision on testing dataset: 0.9845360824742269
F1-score on testing dataset: 0.8988235294117647
