Import the necessary Libraries

In [49]:
import pandas as pd
import numpy as np

Exploring Kaggle Dataset

In [50]:
# load the training dataset
train = pd.read_parquet("Training.parquet")
train.shape

(7658, 89)

In [51]:
# load the test dataset
test = pd.read_parquet("Testing.parquet")
test.shape

(3772, 89)

Combine the train and test datasets

In [52]:
combined_data = pd.concat([train, test], axis=0, ignore_index=True)
combined_data.shape

(11430, 89)

#### check the columns of the combined dataframe

In [53]:
combined_data.columns

Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 

- The dataset has 89 different columns
- the status column is the target column

#### create a new dataframe that has only the url and status columns

In [54]:
# Create a new DataFrame with only the 'url' and 'status' columns
url_status_df = combined_data[['url', 'status']]
url_status_df.head()

Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,legitimate
1,http://thapthan.ac.th/information/confirmation...,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,phishing
3,https://www.bedslide.com,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,legitimate


#### convert status column to numerical

In [66]:
# Create a copy to avoid the SettingWithCopyWarning
url_status_df = url_status_df.copy()

# Map the 'status' column to numerical values
url_status_df['status'] = url_status_df['status'].map({'legitimate': 0, 'phishing': 1})
url_status_df.head()


Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,
1,http://thapthan.ac.th/information/confirmation...,
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,
3,https://www.bedslide.com,
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,


#### check for duplicates

In [75]:
# Count the number of duplicate URLs
duplicate_count = url_status_df['url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 1


- The url_status_df has one observation duplicated.
- The duplicated row will be dropped after combining with the UCI Irvine dataset

#### inspect the unique values in the status column 

In [56]:
url_status_df['status'].unique()

array([0, 1], dtype=int64)

#### Check the value counts for both the legitimate and the legitimate and the phishing urls

In [57]:
url_status_df.status.value_counts()

1    5715
0    5715
Name: status, dtype: int64

- The dataset is balanced.
- It has 5715 legitimate urls and 5715 phishing urls

## Exploring the UCI Irvine Dataset

#### Loading the dataset

In [58]:
dataset2= pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")
dataset2.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


#### 

#### check the columns of dataset 2

In [59]:
dataset2.columns

Index(['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
       'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
       'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
       'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
       'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
       'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
       'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
       'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
       'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
       'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
       'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
       'NoOfEmptyRef', 'NoOf

#### create a new dataframe that has only the URL and the target variable

In [77]:
# Create a new DataFrame with only the 'url' and 'status' columns
url_label_df = dataset2[['URL', 'label']]
url_label_df.head()

Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1


#### check the shape of the url_label_df

In [78]:
url_label_df.shape

(235795, 2)

- The dataset has 235,795 observations with the target column to determine whether the url is a legitimate or phishing url

#### check for duplicates

In [79]:
# Count the number of duplicate URLs
duplicate_count = url_label_df['URL'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 425


- The dataset has 425 duplicates
- The duplicates will be dropped after combining the two datasets

### Merging both datasets

Here the kaggle dataset and the UCI Irvine dataset are marged before extracting the features for the combined dataset.

The datasets will be merged as follows:
1. Standardize Column Names: Rename similar columns in each dataset to a common name.
2. Concatenate: Use pd.concat to combine the datasets.

In [62]:
# Rename columns in url_label_df for consistency
url_label_df = url_label_df.rename(columns={'URL': 'url', 'label': 'status'})

# Concatenate the two DataFrames
combined_df = pd.concat([url_status_df, url_label_df], ignore_index=True)

combined_df.head()

Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,0
1,http://thapthan.ac.th/information/confirmation...,1
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1
3,https://www.bedslide.com,0
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0


#### check the shape of the combined dataframe

In [64]:
combined_df.shape

(247225, 2)

## Preprocessing

#### Checking whether there are duplicates in the combined dataframe

In [73]:
# Check for duplicates in the 'url' column
duplicates = combined_df.duplicated(subset='url', keep=False)  # keep=False marks all duplicates as True

# Display all rows with duplicate URLs
duplicate_urls = combined_df[duplicates]
print("Duplicate URLs:")
print(duplicate_urls)

Duplicate URLs:
                                                      url  status
261     https://app.box.com/s/x6agocx9zvj049azirk4aw3x...       1
303                         http://vxdse.myfreesites.net/       1
339                     http://repl-mess.myfreesites.net/       1
389                     http://site9423773.92.webydo.com/       1
485     http://www.imcreator.com/viewer/vbid-fa0f29d5-...       1
...                                                   ...     ...
246025                 https://outlook-web-fb782.web.app/       0
246710                    https://orange789.yolasite.com/       0
246848                http://uph0ldlgin.mystrikingly.com/       0
246931  https://objectstorage.ap-singapore-1.oracleclo...       0
247168    https://yellow-river-189b.lhziiz35.workers.dev/       0

[937 rows x 2 columns]


#### Count the Number of Duplicate URLs

In [74]:
# Count the number of duplicate URLs
duplicate_count = combined_df['url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 470


- The dataset has 420 duplicate URLS
- We therefore drop the duplicate URLS before extracting the features

#### Drop Duplicates

In [80]:
# Remove duplicates, keeping the first occurrence
combined_df_no_duplicates = combined_df.drop_duplicates(subset='url', keep='first')

print("DataFrame after removing duplicates:")
print(combined_df_no_duplicates)


DataFrame after removing duplicates:
                                                      url  status
0       https://www.todayshomeowner.com/how-to-make-ho...       0
1       http://thapthan.ac.th/information/confirmation...       1
2       http://app.dialoginsight.com/T/OFC4/L2S/3888/B...       1
3                                https://www.bedslide.com       0
4       https://tabs.ultimate-guitar.com/s/sex_pistols...       0
...                                                   ...     ...
247220                     https://www.skincareliving.com       1
247221                      https://www.winchester.gov.uk       1
247222                    https://www.nononsensedesign.be       1
247223  https://patient-cell-40f5.updatedlogmylogin.wo...       0
247224                 https://www.alternativefinland.com       1

[246755 rows x 2 columns]
