# Malicious URL Detection Using Machine Learning

## Importing Libraries

In [None]:
!pip install tldextract ipywidgets > /dev/null #For URL parsing  & User Interface


In [None]:

# Standard Libraries:
import re # To handle regular expressions
import numpy as np # Numerical operations
import pandas as pd  # For working with DataFrame
import ipaddress # to check ip address for feature extraction

# Visualization tools:
import matplotlib.pyplot as plt   # For Plotting

# URL Parsing:
import tldextract #For extracting domain and suffix from URLs

# import matplotlib ---delete if not needed

# Data Preprocessing
from sklearn import preprocessing  #To label encode y(dependent variable)
from sklearn.model_selection import train_test_split  #To split data into train and test sets
from sklearn.preprocessing import StandardScaler # For scaling features when needed(e.g Logistic Regression)

# Machine Learning Models for Training:
from sklearn.ensemble import RandomForestClassifier   # Model 1
from sklearn.ensemble import HistGradientBoostingClassifier # Model 2
from sklearn.linear_model import LogisticRegression # Model 3

# Metric for measuring model performance
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
precision_score,
recall_score,
ConfusionMatrixDisplay,
classification_report)

# Model Comparison Tool:
import seaborn as sns

#User Interface
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML


## Data Preprocessing

### Importing Dataset
This project uses a public malicious URL dataset from Kaggle. This dataset can be found using the following link: https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset

According to a discussion on Kaggle, the last 96,018 URLs that were sourced from PhishStorm might have had their labels flipped during the making of the original Kaggle set. The benign URLs were marked as phishing and phishing URLs as benign. To fix this issue, I initially tried to correct the wrong labels. However, I eventually decided to drop those URLs that were sourced from PhishStorm. This was done using command-line tools to keep only the first 555173 rows, including the header. This ensures that the corrupted labels do not affect the model's performance.

The trimmed version is accessed from my repository on GitHub: https://raw.githubusercontent.com/T-Geb/Malicious-URL-detection-_-ML/main/trimmed_dataset.csv




In [None]:

# Location of the trimmed dataset CSV in the project's GitHub repository.
DATASET_URL = (
    "https://raw.githubusercontent.com/T-Geb/"
    "Malicious-URL-detection-_-ML/main/trimmed_dataset.csv"
)

# Load the CSV straight from the URL into a DataFrame.
mal_ds = pd.read_csv(DATASET_URL)




---


### Verification of Proper Data Import


To verify that the dataset was imported properly, the first five URLs and their corresponding labels are printed below:

In [None]:
# Show the first five URLs, then the first five labels, to confirm the import worked.
for column in ('url', 'type'):
    print(mal_ds[column].head(5))

### Deleting Rows for Duplicated URL

In [None]:
# Duplicates are judged on the first column only, which holds the URLs.
url_column = mal_ds.columns[0]

# Rows whose URL already appeared earlier (the first occurrence is not flagged).
duplicate_urls = mal_ds[mal_ds.duplicated(subset=url_column)]
total_duplicates = len(duplicate_urls)

print("\n Total row count before duplicate clean-up:\n", len(mal_ds))
print("\nTotal number of duplicate URLs:", total_duplicates)

print("\nClass distribution of duplicated URLs:\n")
print(duplicate_urls['type'].value_counts())

# Keep only the first occurrence of every URL.
print("\nDeleting duplicate rows....\n")
mal_ds = mal_ds.drop_duplicates(subset=url_column, keep='first')

# Row count after the clean-up, for comparison with the count printed above.
print("\nTotal row count after duplicate clean-up:\n", len(mal_ds))





---


The above output shows that there are 10,064 duplicate URLs identified. To ensure consistent data and to prevent data leakage during the split into training and testing sets, these duplicates were dropped, and only the first occurrence of these URLs was retained.





---


### Checking for Missing Data
A check is performed to verify that there is no missing data in both the url and type columns.

A return value of 0 for both indicates that no missing data is found.

If any missing values were found, the dropna() method ensures that these rows are dropped

In [None]:

# Per-column count of missing entries, reported before any row is removed.
count_missing_values = mal_ds.isna().sum()
print("Missing values per column:")
print(count_missing_values)

# Remove every row that has at least one missing value.
mal_ds = mal_ds.dropna()

# Re-count over the whole frame to confirm nothing is left.
remaining_missing = mal_ds.isna().sum().sum()
print(f"\nAfter cleaning missing values: {remaining_missing}")
print(f"Final dataset size: {len(mal_ds)} rows")


### Bar Chart Visualization: Showing Class Imbalance

In [None]:
# Frequency of every label in mal_ds['type']; value_counts returns a Series.
label_counts = mal_ds['type'].value_counts()
total_count = label_counts.sum()  # number of labelled rows overall

# Fixed bar order; a class absent from the data is plotted as 0.
bar_class = ['benign', 'defacement', 'malware', 'phishing']
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:cyan']
bar_counts = [label_counts.get(label, 0) for label in bar_class]

figure, axis = plt.subplots()
axis.bar(bar_class, bar_counts, color=bar_colors)
axis.set_title('Malicious Classification')
axis.set_xlabel('Classification')
axis.set_ylabel('Class Occurance')

# Annotate each bar with its count and its share of all rows.
for x_position, count in enumerate(bar_counts):
    share = (count / total_count) * 100
    axis.text(x_position, count, f'{count} ({share:.2f}%)', ha='center', va='bottom')

print("\n")
plt.show()


The bar chart above shows the class distribution of the dataset after the rows with wrong labels were trimmed. It shows a significant class imbalance: benign URLs dominate at 69.74%, while malware URLs are the smallest class at 4.34%.

This could impact machine learning models as it leads to bias towards the majority class, impacting the detection of malicious URLs, such as defacement, malware, and phishing.

---



## Balancing Classifications

To ensure that the model is not biased towards benign URLs and that detection of the minority classes (defacement, phishing, and malware) is maintained, I have decided to use random sampling to get 23,000 URLs from each class.



In [None]:
# Draw the same number of random rows from every class so the data is balanced.
# The draw is seeded: without a seed each run would train and test on different
# URLs, so results could not be reproduced even though the split and the models
# below all use a fixed random_state.
SAMPLES_PER_CLASS = 23000
SAMPLING_SEED = 42

benign = mal_ds[mal_ds['type'] == 'benign'].sample(n=SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)
defacement = mal_ds[mal_ds['type'] == 'defacement'].sample(n=SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)
phishing = mal_ds[mal_ds['type'] == 'phishing'].sample(n=SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)
malware = mal_ds[mal_ds['type'] == 'malware'].sample(n=SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)

balanced_df = pd.concat([benign, defacement, phishing, malware])

X = balanced_df.iloc[:, 0]  # first column - the URLs: the independent variable
Y = balanced_df.iloc[:, 1]  # second column - the types: the dependent variable

print("Total Sample Size:", balanced_df.shape[0])
print("\nSamples per type:")
print(balanced_df['type'].value_counts())

### Label Encoding - Dependent Variable, Y - Classifications

Since the labels(classes) are strings[benign, defacement, malware, phishing], they need to be converted into numeric values so the machine learning models can process them. I am using label encoding to assign a unique integer to each class.

In [None]:
# Y holds the string labels taken from balanced_df['type'].

le = preprocessing.LabelEncoder()
le.fit(Y)  # learn the label -> integer mapping

# le.classes_ stores the known labels; transforming them yields their integer codes.
class_label = le.classes_
numeric_value = le.transform(class_label)

print("\nShowing Label Mapping:\n")
for label, code in zip(class_label, numeric_value):
    print(f"{label}: {code}")


# Integer-encoded labels used for the split and for training.
y = le.transform(Y)

### Bar Chart of Top 4 Domain-Suffixes in Dataset

This visualization displays the four most common domain suffixes in the dataset. Identifying frequently used suffixes helps see trends that could be used for feature extraction.

I am using the tldextract library to handle domain extraction

In [None]:
# NOTE(review): per the tldextract documentation, `cache_dir` controls the on-disk
# suffix-list cache; fetching the list over the network is governed by
# `suffix_list_urls` instead -- confirm which of the two was intended here.
# In the visible code this instance is not used: get_suffix and get_full_domain
# call the module-level tldextract.extract() directly.
extractor = tldextract.TLDExtract(cache_dir=False) # intended to avoid suffix-list cache/network issues (see note above)

In [None]:
# X holds the URL column of balanced_df.
# get_suffix works on one URL at a time so it can be applied to the whole URL column;
# it is also reused by the suffix-based feature during feature extraction.

def get_suffix(url):
    """Return the lower-cased suffix that tldextract finds in ``url``."""
    parsed = tldextract.extract(url)
    return parsed.suffix.lower()

# Suffix of every URL, then the four most frequent suffixes.
extd_domains = X.apply(get_suffix)
top_domains = extd_domains.value_counts().head(4)
print("Showing the four top domains\n\n" , top_domains)

suffix_name = top_domains.index.tolist()
suffix_counts = top_domains.values
total_count = top_domains.sum()  # combined count of the four suffixes shown
bar_colors = ['tab:red', 'tab:blue', 'tab:orange', 'tab:cyan']

# Bar chart of the four suffixes.
figure, axis = plt.subplots()
axis.bar(suffix_name, suffix_counts, color=bar_colors)
axis.set_title('Top Four Domain-Suffix In Dataset')
axis.set_xlabel('Domain Suffix')
axis.set_ylabel('Count')

# Label each bar with its count and its share of the four-suffix total.
for x_position, count in enumerate(suffix_counts):
    share = (count / total_count) * 100
    axis.text(x_position, count, f'{count} ({share:.2f}%)', ha='center', va='bottom')

plt.show()


The above chart shows that .com domains dominate the dataset. 11,599 URLs also show as having missing suffixes. It also suggests that a lot of the malicious URLs might contain .com as the domain along with the benign URLs. This indicates that the domain suffix alone is not a strong indicator for distinguishing URL types.



---




### Manual Feature Engineering of URLs



#### Feature Extraction Methods

Feature extraction is used to transform the raw URLs into meaningful numeric information that the models can use. My feature extraction focuses on lexical features by analyzing the contents within the URL from different angles. These extractions give the models patterns to recognize during training.
Since all of these methods return numeric values, no additional encoding is needed.

In [None]:
def get_url_length(url):
    """Return the number of characters in ``url``."""
    length = len(url)
    return length

def count_digits(url):
    """Return how many characters of ``url`` are digits (per ``str.isdigit``)."""
    return sum(1 for char in url if char.isdigit())

def equals_count(url):
    """Return the number of '=' characters in ``url``."""
    equals_sign = '='
    return url.count(equals_sign)

def question_count(url):
    """Return the number of '?' characters in ``url``."""
    question_mark = '?'
    return url.count(question_mark)

def hyphen_count(url):
    """Return the number of '-' characters in ``url``."""
    hyphen = '-'
    return url.count(hyphen)

def count_other_special_chars(url):
    """Count the characters of ``url`` that are not ASCII letters, digits, '=', '?' or '-'.

    '=', '?' and '-' are left out because they are counted by their own features.
    """
    return sum(1 for _ in re.finditer(r'[^a-zA-Z0-9=?-]', url))

def https_check(url):
    """Return 1 if ``url`` uses the HTTPS scheme, otherwise 0.

    The scheme is compared case-insensitively and must be followed by "://",
    so a scheme-less URL whose host merely begins with the letters "https"
    (for example "https-login.example.com") is not reported as HTTPS.
    """
    return int(url.lower().startswith("https://"))

# Returns the host part of a URL, e.g. www.123.com.
# Used by count_nums_in_domain and domain_is_ip.
def get_full_domain(url):
    """Return the host that tldextract finds in ``url``.

    Normally this is the fully qualified domain name. tldextract reports an
    empty ``fqdn`` when it recognises no public suffix (IP-address hosts,
    "localhost"), so in that case the bare ``domain`` field is returned
    instead; otherwise the IP-based features would always see an empty string.
    """
    parts = tldextract.extract(url)
    return parts.fqdn or parts.domain


# Counts the digits that appear in the URL's domain.
# A digit-heavy, IP-like domain can be an indicator of malicious intent in a URL.

def count_nums_in_domain(url):
    """Return how many characters of ``get_full_domain(url)`` are digits."""
    return sum(1 for char in get_full_domain(url) if char.isdigit())


# Check if the URL's host is an IP address
def domain_is_ip(url):
    """Return 1 if the host of ``url`` is a literal IPv4/IPv6 address, else 0."""
    host = get_full_domain(url)
    if not host:
        # tldextract leaves fqdn empty when it finds no public suffix, which is
        # the case for IP hosts; the address is then held in the `domain` field.
        host = tldextract.extract(url).domain

    try:
        # IPv6 hosts can arrive wrapped in brackets, which ipaddress rejects.
        ipaddress.ip_address(host.strip("[]"))
    except ValueError:
        return 0
    return 1

def suspicious_suffix(url):
    """Return 1 when the URL's suffix is empty or on the watch list, else 0."""
    # Suffixes treated as suspicious; '' covers URLs with no recognised suffix.
    flagged_suffixes = ('', "info", "cn", "cc", "asia", "tk", "biz", "fm", "tv", "xyz", "ml")
    return 1 if get_suffix(url) in flagged_suffixes else 0


def count_suspicious(url):
    """Return how many distinct suspicious keywords occur in ``url``.

    Matching is a case-insensitive substring test, and every keyword
    contributes at most 1 to the result. Keywords may overlap: a URL
    containing "login2" also contains "login" and so scores 2 for them.
    """
    # Each keyword is listed once so that no single match is counted twice.
    keywords = (
        "ebayisapi", "webscr", "rfc", "webmail", "login", "re2", "servlet",
        "urgent", "confirm", "signin", "login2", "account", "validate",
        "activate", "secure", "blogs", "crypto", "pay", "fish",
    )
    lowered = url.lower()
    return sum(word in lowered for word in keywords)


#### Testing Functionality of Feature Methods

A few test URLs were plugged into the methods to show that the methods return the expected output

In [None]:

test_urls = [
    "https://www.google.com",
    "https://www.alsothecrumbsplease.com/authentic-black-forest-cake",
    "http://sample.info/?drain=mine&direction=lock#cable",
    "http://bank-confirm.com/login",
    "https://www.sample.edu/?query.8"
]

# (label, function) pairs, in the order their results are printed for each URL.
feature_checks = [
    ("URL Length:", get_url_length),
    ("Digit Count:", count_digits),
    ("Special char count:", count_other_special_chars),
    ("Equals sign count:", equals_count),
    ("Question mark count:", question_count),
    ("Hyphen count:", hyphen_count),
    ("Suspicious keyword count:", count_suspicious),
    ("get_domin:", get_full_domain),
    ("Domain is IP?:", domain_is_ip),
    ("is domain number:", count_nums_in_domain),
    ("Domain Suffix: ", get_suffix),
    ("Suspicious suffix :", suspicious_suffix),
]

# Run every feature function on every sample URL and print the result.
for url in test_urls:
    print(f"URL: {url}")
    for label, feature in feature_checks:
        print(label, feature(url))

    print("-" * 50)




### Splitting Data into Train and Test Set

The training set will have 80% of the data, and the test set will have 20%. With 92,000 records available, 20% should be sufficient for testing.
I used 'random_state = 42' to ensure that the same random split is applied each time the program is run, and stratify=y to keep class balance during split

In [None]:
# 80/20 split, stratified on the encoded labels, with a fixed seed so the
# same partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Verification of balanced class split

In [None]:
# Display names for the encoded class values.
label_mapping = {0: 'Benign', 1: 'Defacement', 2: 'Malware', 3: 'Phishing'}

# Print the per-class row counts of the train labels, then of the test labels.
for heading, labels in (
    ("\nTrain set class distribution:\n", y_train),
    ("\n\nTest set class distribution:\n", y_test),
):
    print(heading)
    class_counts = pd.Series(labels).value_counts().sort_index()
    print(class_counts.rename(index=label_mapping).to_string())

print("\nTotal rows in train set:", len(x_train))
print("Total rows in test set:", len(x_test))

The above output shows that class balance is maintained within the training and testing sets.



---




### Applying Features to Train and Test Set Dataframes

For this specific program, the feature extraction methods need to be applied to both the x_train and x_test sets because the model needs these numeric features to make predictions. This does not cause data leakage, because each feature is computed from a single URL on its own; no statistics are calculated across rows.

In [None]:
# Builds the feature data frames for x_train and x_test.
# Every column is produced by applying one extraction function to each URL.
# The train and test frames are built by the same helper from the same mapping,
# so their columns (and column order) cannot drift apart.
FEATURE_EXTRACTORS = {
    'url_length': get_url_length,
    'digit_counts': count_digits,
    'equals_count': equals_count,
    'question_count': question_count,
    'hyphen_count': hyphen_count,
    'count_special_chars': count_other_special_chars,
    'https_check': https_check,
    'count_nums_in_domain': count_nums_in_domain,
    'domain_is_ip': domain_is_ip,
    'suspicious_suffix': suspicious_suffix,
    'count_suspicious': count_suspicious,
}


def _build_feature_frame(urls):
    """Return a DataFrame with one column per entry of FEATURE_EXTRACTORS.

    ``urls`` is a pandas Series of URL strings; each extractor is applied to it
    element-wise, so the result keeps the index of ``urls``.
    """
    return pd.DataFrame({
        name: urls.apply(extract) for name, extract in FEATURE_EXTRACTORS.items()
    })


x_train_df = _build_feature_frame(x_train)
x_test_df = _build_feature_frame(x_test)

print("\nFirst 5 rows of the extracted features for x_train \n")
display(x_train_df.head(5))

print("\n\nFirst 5 rows of the extracted features for x_test \n")
display(x_test_df.head(5))


## Model Training, Test and Evaluation

Model training for this project is done on 3 models. These models are as follows:
1. Logistic Regression
2. Hist Gradient Boosting Classifier
3. Random Forest Classifier

The evaluation will look at accuracy and f1-scores as metrics to evaluate the performance of models. f1-score gives a single number by combining both precision and recall.


### Model 1 : Logistic Regression


#### Training Model

In [None]:
# Scale the features so logistic regression treats them equally.
# The scaler is fitted on the training features and then reused on the test features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_df)
x_test_scaled = scaler.transform(x_test_df)
print(type(x_train_scaled))

# Train the logistic regression model on the scaled training features.
log_model = LogisticRegression(random_state=42)
log_model.fit(x_train_scaled, y_train)

#### Generating Predictions

In [None]:
# Predict the class of every test URL from its scaled features.
y_pred_log = log_model.predict(x_test_scaled)

#### Evaluating Performance

In [None]:
# Overall accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred_log)
print(f"Model Accuracy: {accuracy:.2f}")

# Showing classification report
print("\nClassification Report:\n")
# Text version, with readable class names, for printing.
report_log_str = classification_report(y_test, y_pred_log, target_names=le.classes_)
# Dict version for the model-comparison heatmap. It is built without target_names
# on purpose: the heatmap cell looks classes up by the keys '0'..'3'.
report_log_dict = classification_report(y_test, y_pred_log, output_dict=True)

print(report_log_str)

# Plotting a row-normalised confusion matrix.
# Logistic Regression is Model 1 in this notebook, so the title says so.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_log, display_labels=le.classes_, cmap="Blues", normalize="true")
plt.title("Confusion Matrix for Model 1 - Logistic Regression")
plt.show()


**Evaluation Summary: Logistic Regression**

This model performed the weakest overall, with an overall accuracy of 66% and a low F1-score of 0.45 for phishing. This shows that logistic regression may not be the best choice for a dataset with complex features

-- Results may vary slightly during reruns due to model randomness.



---



### Model 2 : Hist Gradient Boosting Classifier

#### Training Model

In [None]:
# Train the histogram-based gradient boosting model on the unscaled features.
hist_gbc = HistGradientBoostingClassifier(random_state=42)
hist_gbc.fit(x_train_df, y_train)

#### Generating Predictions

In [None]:
# Predict the class of every test URL from its extracted features.
y_pred_hist = hist_gbc.predict(x_test_df)

#### Evaluating Performance

In [None]:
# Overall accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred_hist)
print(f"Model Accuracy: {accuracy:.2f}")

# The same report in two forms: text for printing, dict for the comparison heatmap.
report_hist_str = classification_report(y_test, y_pred_hist, target_names=le.classes_)
report_hist_dict = classification_report(y_test, y_pred_hist, output_dict=True)

print("\nClassification Report:\n")
print(report_hist_str)

# Row-normalised confusion matrix.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_hist,
    display_labels=le.classes_,
    cmap="Blues",
    normalize="true",
)
plt.title("Confusion Matrix for Model 2 - Hist Gradient Boosting Classifier")
plt.show()

**Evaluation Summary: Hist Gradient Boosting Classifier**

- The model achieved an accuracy of 86%
- Malware here shows the highest F1-score(0.91), showing strong detection ability.
- Phishing shows the lowest F1-score(0.82), which shows the model can, at times, struggle to identify phishing URLs. This is to be expected, as most phishing URLs present themselves as safe and rely on social engineering


-- Results may vary slightly during reruns due to model randomness.


---




### Model 3 - Random Forest Classifier

#### Training Model

In [None]:
# Train the random forest model on the unscaled features.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(x_train_df, y_train)

#### Generating Predictions

In [None]:
# Predict the class of every test URL from its extracted features.
y_pred_rf = random_forest.predict(x_test_df)

In [None]:
# Model accuracy measurement
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Model Accuracy: {accuracy:.2f}")

# Showing classification report
print("\nClassification Report:\n")
# Text version, with readable class names, for printing.
report_rf_str = classification_report(y_test, y_pred_rf, target_names=le.classes_)
# Dict version for the model-comparison heatmap. It is built without target_names
# on purpose: the heatmap cell looks classes up by the keys '0'..'3'.
report_rf_dict = classification_report(y_test, y_pred_rf, output_dict=True)

print(report_rf_str)

# Plotting a row-normalised confusion matrix.
# Random Forest is Model 3 in this notebook, so the title says so.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf, display_labels=le.classes_, cmap="Blues", normalize="true")
plt.title("Confusion Matrix for Model 3 - Random Forest Classifier")
plt.show()


#### Evaluating Performance

**Evaluation Summary: Random Forest Classifier**
- The model achieved an accuracy of 86% at the time of run
- Malware had the highest F1-score(0.91), showing the model can identify malware URLs better than other classes.
- Phishing again had the lowest F1-score(0.82), which was heavily influenced by its low recall. This indicates that the detection of phishing remains harder

-- Results may vary slightly during reruns due to model randomness.

---



## Model Comparison and Model Selection for User Interaction

I have trained, tested, and evaluated three models. The classification reports and confusion matrices show individual reports for each model. To visually compare the performance of the three models, I am creating a heatmap by using the F1-score of each class in each model.

In [None]:

# Model name -> classification report in dict form.
model_predictions = {
    "Random Forest": report_rf_dict,
    "Hist Gradient Boosting": report_hist_dict,
    "Logistic Regression": report_log_dict
}

class_labels = ["Benign", "Defacement", "Malware","Phishing"]
class_keys = ['0','1','2','3']  # keys of the per-class entries inside each report

# Model name -> list of per-class F1-scores, in class_keys order.
f1_data = {
    model_name: [report[class_key]['f1-score'] for class_key in class_keys]
    for model_name, report in model_predictions.items()
}

# One row per model, one column per class.
f1_dataframe = pd.DataFrame(f1_data, index=class_labels).T

# plotting heatmap
sns.heatmap(f1_dataframe, annot=True, fmt=".2f")
plt.title("F1 Score Comparision Across Models and Classes")
plt.xlabel("Classes")
plt.ylabel("Models")
plt.yticks(rotation=0)
plt.tight_layout()

plt.show()




## Model Selection

Based on the accuracy metric, classification reports, and heat map visualization, the final model is trained using the **Random Forest Classifier**. This model shows a higher F1-score compared to the other models. This indicates that precision and recall were fairly high. Phishing URL f1-score shows the lowest performance.

The model trained using Logistic Regression did not perform as well as the other two models.

## Generating Sample URLs
Ten URLs are sampled from the test set to provide the user with sample input. Since these are pulled from the test set, the model was not trained on them, so they can be used as samples.

In [None]:
# Ten test-set URLs, drawn with a fixed seed, offered to the user as examples.
sample_urls = x_test.sample(n=10, random_state=42)


## User Interface - URL Safety Checker

In [None]:
## URL Check: User Interaction


def run_url_checker():
    """Show the URL input form and classify whatever the user submits.

    The form consists of a dropdown of sample URLs, a text box and a submit
    button. On submit the URL's features are extracted, ``random_forest``
    predicts its class, the result is displayed, and the form is shown again.
    Submitting ``q`` ends the interaction.
    """
    from html import escape  # local import: only needed to render user input safely

    placeholder_option = "Select a sample URL...."

    # HTML shown for each encoded class value returned by the model.
    prediction_messages = {
        0: "<b> Prediction: <span style='color:green;'>Benign - Safe</span></b><br><br>",
        1: "<b> Prediction: <span style='color:red;'>Malicious - Possible Defacement</span></b><br><br>",
        2: "<b> Prediction: <span style='color:red;'>Malicious - Possible Malware</span></b><br><br>",
        3: "<b> Prediction: <span style='color:red;'>Malicious - Possible Phishing</span></b><br><br>",
    }

    print("\nEnter a URL from the sample list or input your own, then click submit\n \nOr --  Type 'q' and click submit to exit interface.\n")

    # Offer the URLs sampled from the test set as ready-made inputs.
    sample_options = [placeholder_option] + sample_urls.values.tolist()
    dropdown = widgets.Dropdown(options=sample_options, description="Samples:")

    text_box = widgets.Text(placeholder = "Enter url and press Enter on keyboard")
    submit_button = widgets.Button(description="Submit",button_style='Primary')

    def on_dropdown_change(change):
        """Copy the chosen sample URL into the text box."""
        if change.new != placeholder_option:
            text_box.value = change.new

    dropdown.observe(on_dropdown_change, names='value')
    display(dropdown)
    display(HTML("<br>"))
    display(text_box)
    display(HTML("<br>"))
    display(submit_button)
    display(HTML("<br>"))

    def submit(sender=None):
        """Classify the URL currently in the text box and display the verdict."""
        url = text_box.value.strip()
        clear_output(wait=True)

        if url == 'q':
            print("\n\nProgram Exited.")
            return
        if not url:
            print("Please enter a URL.")
            # The output (and with it the form) was cleared above, so show the
            # form again instead of leaving the user without an input box.
            run_url_checker()
            return

        text_box.value = ""  # prepare for next input

        # Same columns, in the same order, as the frames the model was trained on.
        extracted_features = pd.DataFrame([{
            'url_length': get_url_length(url),
            'digit_counts': count_digits(url),
            'equals_count': equals_count(url),
            'question_count': question_count(url),
            'hyphen_count': hyphen_count(url),
            'count_special_chars': count_other_special_chars(url),
            'https_check': https_check(url),
            'count_nums_in_domain': count_nums_in_domain(url),
            'domain_is_ip': domain_is_ip(url),
            'suspicious_suffix': suspicious_suffix(url),
            'count_suspicious': count_suspicious(url)
        }])

        prediction = random_forest.predict(extracted_features)[0]
        print("\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>URL Safety Prediction>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n\n")
        # The URL is untrusted user input: escape it so it is shown as text and
        # cannot inject markup or script into the notebook output.
        display(HTML(f"URL Entered: {escape(url)}\n"))
        print("\n")

        message = prediction_messages.get(int(prediction))
        if message is not None:
            display(HTML(message))

        run_url_checker()  # show input form for another entry

    # NOTE(review): Text.on_submit is reported as deprecated in recent ipywidgets
    # releases -- confirm against the installed version before relying on it.
    text_box.on_submit(submit)
    submit_button.on_click(submit)

run_url_checker()

