In [28]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Function to calculate the scores

In [29]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

## Loading the data
- Load the data and clean it for unique values, later shuffle it with specific seeding

In [3]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

In [3]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset-1.csv")
df = pd.read_csv(filename)

In [30]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset-2.csv")
df = pd.read_csv(filename)

## Data Preprocessing

In [31]:
df.columns

Index(['Length of URL', 'Has IP address', 'Shortening Service',
       'Having @ Symbol', 'Double Slash Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS in Domain', 'Sensitive Words', 'Has Tilde', 'Has Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current Domain Age', 'Matching Domain Name',
       'Length of Domain', 'Google page index', 'Result'],
      dtype='object')

In [32]:
df.shape

(5790, 27)

In [33]:
data_no_duplicates = df.drop_duplicates()
df = data_no_duplicates.dropna()

In [34]:
df.shape

(5790, 27)

In [35]:
df = df.drop('PhishID', axis = 1)

In [None]:
df.rename(columns={'Length of URL':'Length_of_URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words':'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'Current Domain Age':'Current_Domain_Age' , 'Matching Domain Name':'Matching_Domain_Name', 'Length of Domain':'Length_of_Domain', 'Google page index':'Google_page_index','Result':'Result'}, inplace=True)

In [38]:
# Investigate unique values in the each column
unique_frequency_a = df['frequency_of_a_tags'].unique()
unique_all = df['frequency_of_alltags'].unique()
unique_iframes = df['presence_of_iframes'].unique()
unique_popups = df['presence_of_popups'].unique()
unique_rightclick = df['right_click_disabling'].unique()
unique_sfh = df['checking_sfh'].unique()
unique_request = df['request_url'].unique()
unique_url = df['url-of-anchor'].unique()
unique_meta = df['links_in_meta_img'].unique()
unique_R = df['Result'].unique()
unique_IP = df['Has_IP_address'].unique()
unique_SS = df['Shortening_Service'].unique()
unique_HAS = df['Having_@_Symbol'].unique()
unique_DSR = df['Double_Slash_Redirecting'].unique()
unique_PS = df['Prefix-Suffix'].unique()
unique_CTLD = df['CTLD'].unique()
unique_HID = df['HTTPS_in_Domain'].unique()
unique_SW = df['Sensitive_Words'].unique()
unique_HT = df['Has_Tilde'].unique()
unique_HP = df['Has_Port'].unique()
unique_hidden = df['check_hidden_content'].unique()
unique_R = df['Result'].unique()

print(unique_frequency_a)
print(unique_all)
print(unique_iframes)
print(unique_popups)
print(unique_rightclick)
print(unique_sfh)
print(unique_request)
print(unique_url)
print(unique_meta)
print(unique_R)
print(unique_IP)
print(unique_SS)
print(unique_HAS)
print(unique_DSR)
print(unique_PS)
print(unique_CTLD)
print(unique_HID)
print(unique_SW)
print(unique_HT)
print(unique_HP)
print(unique_hidden)
print(unique_R)

[-1  1]
[ 1  0 -1]
[1]
[ 1 -1]
[ 1 -1]
[-1  1]
[-1  1  0]
[-1  0  1]
[-1  1  0]
[ 1 -1]
[ 1 -1]
[-1  1]
[ 1 -1]
[ 1 -1]
[-1  1]
[ 1 -1  0]
[-1]
[ 1 -1]
[ 1 -1]
[-1  1]
[ 1 -1]
[ 1 -1]


### Remove all the values other than {-1, 0, 1} from the data

In [37]:
validValues = {-1, 0, 1}

for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df = df[df[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
df = df.astype(int)

In [39]:
# Now you can use the DataFrame with valid integer values (1, -1, and 0)
print(df)

      Length_of_URL  Has_IP_address  Shortening_Service  Having_@_Symbol  \
0                 1               1                  -1                1   
1                 1               1                   1                1   
2                 1               1                   1                1   
3                 1               1                   1                1   
4                 1               1                   1                1   
...             ...             ...                 ...              ...   
5785              1               1                   1                1   
5786              1               1                   1                1   
5787              1               1                   1                1   
5788              1               1                   1                1   
5789              1               1                   1                1   

      Double_Slash_Redirecting  Prefix-Suffix  CTLD  HTTPS_in_Domain  \
0              

In [40]:
df.shape

(5789, 26)

### Data Preprocessing
- Shuffle the data, and split it into train and test data

In [41]:
df = sklearn.utils.shuffle(df)
X = df.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
df.head()

Unnamed: 0,Length_of_URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,...,checking_sfh,request_url,url-of-anchor,links_in_meta_img,check_hidden_content,Current_Domain_Age,Matching_Domain_Name,Length_of_Domain,Google_page_index,Result
1364,-1,1,1,-1,-1,1,0,-1,1,1,...,1,1,1,1,-1,1,-1,1,1,1
4145,-1,1,1,1,1,1,-1,-1,1,1,...,-1,-1,1,-1,-1,1,-1,1,1,-1
2761,-1,1,1,1,1,1,0,-1,1,1,...,-1,-1,0,0,-1,1,-1,1,1,1
3888,1,1,1,1,1,1,-1,-1,1,1,...,-1,-1,-1,-1,1,1,-1,1,1,-1
4739,0,1,1,1,1,1,1,-1,1,1,...,-1,1,1,1,1,-1,1,1,1,-1


## Evalution Metrics

In [42]:
scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1'}
fold_count=10

## Random Forest

In [17]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

fit time = 0.3210896968841553
score time = 0.019525671005249025
accuracy = 0.8992945120748695
recall = 0.9247274068091234
precision = 0.9047243747131443
f1 = 0.9145480290157664


In [18]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

fit time = 0.33159356117248534
score time = 0.021239590644836426
accuracy = 0.8982576450269228
recall = 0.9220567836637226
precision = 0.9052715601195324
f1 = 0.9135295153622215


In [None]:
rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rforest_clf_score['fit_time']}")
print(f"score time = {rforest_clf_score['score_time']}")
print(f"accuracy = {rforest_clf_score['test_accuracy']}")
print(f"recall = {rforest_clf_score['test_recall']}")
print(f"precision = {rforest_clf_score['test_precision']}")
print(f"f1 = {rforest_clf_score['test_f1']}")

## Validation On Virus Total

In [19]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Load the training and test datasets
train_data = pd.read_csv('../dataset-2.csv')
test_data = pd.read_csv('../Sample-Legitimate.csv')

In [20]:
print(train_data.shape)
print(test_data.shape)

(5790, 27)
(267, 27)


In [21]:
PhishID = train_data['PhishID']
testPhishID = test_data[' PhishID']
# train_data = train_data.drop('PhishID', axis = 1)
# test_data = test_data.drop('PhishID', axis = 1)

In [22]:
train_data.drop_duplicates(inplace = True)
train_data.dropna(inplace = True)

test_data.drop_duplicates(inplace = True)
test_data.dropna(inplace = True)

print(train_data.shape)
print(test_data.shape)

(5790, 27)
(136, 27)


In [23]:
train_data.rename(columns={'Length of URL':'Length_of_URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words': 'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'PhishID': 'PhishID', 'frequency_of_a_tags':'frequency_of_a_tags', 'frequency_of_alltags': 'frequency_of_alltags', 'presence_of_iframes':'presence_of_iframes', 'presence_of_popups': 'presence_of_popups', 'right_click_disabling': 'right_click_disabling', 'checking_sfh': 'checking_sfh', 'request_url': 'request_url', 'url-of-anchor':'url-of-anchor', 'links_in_meta_img':'links_in_meta_img', 'check_hidden_content': 'check_hidden_content', 'Current Domain Age':'Current_Domain_Age' , 'Matching Domain Name':'Matching_Domain_Name', 'Length of Domain':'Length_of_Domain', 'Google page index': 'Google_page_index','Result':'Result'}, inplace=True)

test_data.rename(columns={'Length of URL':'Length_of_URL', ' Has IP address':'Has_IP_address', ' Shortening Service':'Shortening_Service', ' Having @ Symbol':'Having_@_Symbol', ' Double Slash Redirecting':'Double_Slash_Redirecting', ' Prefix-Suffix':'Prefix-Suffix', ' CTLD':'CTLD', ' HTTPS in Domain':'HTTPS_in_Domain', ' Sensitive Words':'Sensitive_Words', ' Has Tilde':'Has_Tilde', ' Has Port':'Has_Port', ' PhishID': 'PhishID', ' frequency_of_a_tags':'frequency_of_a_tags',' frequency_of_alltags': 'frequency_of_alltags', ' presence_of_iframes':'presence_of_iframes', ' presence_of_popups':'presence_of_popups' ,' right_click_disabling': 'right_click_disabling', ' checking_sfh': 'checking_sfh', ' request_url': 'request_url', ' url-of-anchor':'url-of-anchor', ' links_in_meta_img':'links_in_meta_img', ' check_hidden_content':'check_hidden_content', ' Current Domain Age':'Current_Domain_Age' , ' Matching Domain Name':'Matching_Domain_Name', ' Length of Domain': 'Length_of_Domain', ' Google page index': 'Google_page_index','Result':'Result'}, inplace=True)

In [24]:
train_data.columns

Index(['Length_of_URL', 'Has_IP_address', 'Shortening_Service',
       'Having_@_Symbol', 'Double_Slash_Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS_in_Domain', 'Sensitive_Words', 'Has_Tilde', 'Has_Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current_Domain_Age', 'Matching_Domain_Name',
       'Length_of_Domain', 'Google_page_index', 'Result'],
      dtype='object')

In [25]:
test_data.columns

Index(['Length_of_URL', 'Has_IP_address', 'Shortening_Service',
       'Having_@_Symbol', 'Double_Slash_Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS_in_Domain', 'Sensitive_Words', 'Has_Tilde', 'Has_Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current_Domain_Age', 'Matching_Domain_Name',
       'Length_of_Domain', 'Google_page_index', 'Result'],
      dtype='object')

In [26]:
# Remove rows with PhishID not in [-1, 0, 1]
validValues = {-1, 0, 1}
for column in train_data.columns:
    if column != 'PhishID':
        train_data[column] = pd.to_numeric(train_data[column], errors='coerce')
        train_data = train_data[train_data[column].isin(validValues)]


# Remove rows with NaN values in the target variable
train_data = train_data.dropna(subset=['Result'])

# Convert all columns to integers 
train_data = train_data.astype(int)

In [27]:
print(train_data.shape)
print(test_data.shape)

(5789, 27)
(136, 27)


In [28]:
# Define features and target variable
features = train_data.columns.difference(['Result', 'PhishID'])
X = train_data[features]
y = train_data['Result']

In [29]:
# Train an AdaBoost classifier
rforest_clf = RandomForestClassifier()
rforest_clf.fit(X, y)

In [30]:
print(train_data.columns)
print(test_data.columns)

Index(['Length_of_URL', 'Has_IP_address', 'Shortening_Service',
       'Having_@_Symbol', 'Double_Slash_Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS_in_Domain', 'Sensitive_Words', 'Has_Tilde', 'Has_Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', 'links_in_meta_img',
       'check_hidden_content', 'Current_Domain_Age', 'Matching_Domain_Name',
       'Length_of_Domain', 'Google_page_index', 'Result'],
      dtype='object')
Index(['Length_of_URL', 'Has_IP_address', 'Shortening_Service',
       'Having_@_Symbol', 'Double_Slash_Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS_in_Domain', 'Sensitive_Words', 'Has_Tilde', 'Has_Port',
       'PhishID', 'frequency_of_a_tags', 'frequency_of_alltags',
       'presence_of_iframes', 'presence_of_popups', 'right_click_disabling',
       'checking_sfh', 'request_url', 'url-of-anchor', '

In [31]:
# Make predictions on the test data
test_predictions = rforest_clf.predict(test_data[features])

In [32]:
# Load 'legitimate_updated.xlsx' and update 'adaboost' column
# Path for Lab PC
file_path = '/home/administrator/Documents/Phishing-Verification/Validation/Legitimate_Updated.xlsx'
# df = pd.read_excel(file_path)
# df['adaboost'] = df['PhishID'].map(phishid_to_result)
df = pd.read_excel(file_path)

In [33]:
# Create a mapping of PhishID to results
phishid_to_result = dict(zip(testPhishID, test_predictions))

In [34]:
# Iterate through each row in the DataFrame and update 'adaboost' column
for index, row in df.iterrows():
    phish_id = row['PhishID']
    if phish_id in phishid_to_result:
        df.at[index, 'Random-Forest'] = phishid_to_result[phish_id]
    else:
        df.at[index, 'Random-Forest'] = 0  # Write 0 when mapping is not found

In [35]:
# Save the updated 'legitimate_updated.xlsx'
df.to_excel(file_path, index=False)

In [36]:
# Display the accuracy on the test data
test_accuracy = accuracy_score(test_data['Result'], test_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9264705882352942
