In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'openphish-malicious-urls:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1545876%2F2548771%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240404%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240404T104432Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3De5a5743b49514e18575f5d58bfcd42e5208d4c8af0352e0692a1748b403eb1899664947d107681f6a725eded71b15e5ff6ed7f1d31ac55a67db358760c4dfbb31f34d79d31c3e4de6c3b042b58b542b870ed619d65451edf4a30a5aead5e5e1b66ba929be6b5dca0ed31fac4baa5d1085c8ac702ab770c874e10e243cf7841e1d1281513c31d9e28c6e7f91b47cfba2fee43a61f097e9bb66d29864efd6bd3c49c344055c92276f714155d801e22a7e1aff517f989af973de5e9d283992772b7429aacfe235a353aa80962161517ce865b2bb2b8521b3455c98e61c8d599822ae0d26e757a3bdc1dadb609a1cceb08c509a1dd74f5cbba7e62e986f20d731074,malicious-and-benign-urls:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1545878%2F2548774%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240404%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240404T104432Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9877a3282a656fe4dfe39aaef1c2171bc1fd8f3d32075fd82836c44c7d24cf2a09b5acdf018d9cd9ca85d3e3392a62a6a11f23ab98690d66bfccbc7c2805acb2a1bbcef419467121f10aef8d6ccee89f680b63785840e4f120831736292ba158efde9aa67fdbed0864aef6e38b313dc02cc00301d360ecd75e3ea797cf8a1aea693c27cac297b2d29a9068285395330200c23e450062d3aadad705c8ac23b6726e53ab4c4766d4e26a3b447007af5a3e9a0f506279780ad2ac1d4b153135bfd659abd66c11e899930b1c20f6f8aeea9906aca608ce47ad2392ef5723dff168ed862113297400c0f298970455a9126312f5033ebcb92dd74d3df8438be7c34900'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that fails to download or unpack is
# reported and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split on the first ':' only.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be missing; the progress bar then stays empty
            # instead of int(None) aborting the whole cell with a TypeError.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: tarfile.open() below reopens the file
            # by name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is not
                # rebound to a TarFile instance.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Checked before OSError because HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
! pip install tldextract

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import os

# Visualisation
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Extracting URL information
from urllib.parse import urlparse
import tldextract

# Preprocessing
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import category_encoders as ce
from sklearn.metrics import confusion_matrix, classification_report

# ANN
from tensorflow.keras import Sequential, metrics
from tensorflow.keras.layers import Dense

In [None]:
# Data files
# Walk the input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Loading data
# Labelled URL dataset; later cells read its 'url' and 'label' columns.
dataset = pd.read_csv('../input/malicious-and-benign-urls/Malicious And Benign URLs.csv')

In [None]:
# Loading OpenPhish Testing Data
# This data is collected from https://openphish.com/
# which is different from training and testing data from kaggle repository.
# The dataset contains malicious sites only.
# names=['url'] supplies the column name, so the first line of the file is read as data.
openphish_sites = pd.read_csv('../input/openphish-malicious-urls/openphish.csv', names=['url'])

In [None]:
dataset.head()

# Data Analysis

In [None]:
# Take the slice labels from the value_counts() index so each count is paired
# with its own class. value_counts() orders by frequency, so a fixed
# ['Benign', 'Malicious'] list is wrong whenever malicious URLs outnumber benign ones.
label_counts = dataset['label'].value_counts()
fig = go.Figure([go.Pie(labels=label_counts.index.astype(str).str.capitalize(),
                        values=label_counts.values)])
fig.update_layout(title='Percentage of Class (Benign and Malicious)')
fig.show()

In [None]:
# Dataset information
# Column dtypes and non-null counts of the raw dataset.
dataset.info()

In [None]:
# Checking NULL values
# Number of missing values per column.
dataset.isnull().sum()

# Preprocessing
#### Features to be extracted
<ul>
    <li>SubDomain</li>
    <li>Domain</li>
    <li>Suffix</li>
    <li>Scheme length</li>
    <li>Network location (netloc) length</li>
    <li>Path length</li>
    <li>Parameter length</li>
    <li>Query length</li>
    <li>Fragment length</li>
    <li>Count of '-'</li>
    <li>Count of '@'</li>
    <li>Count of '?'</li>
    <li>Count of '%'</li>
    <li>Count of '.'</li>
    <li>Count of digits</li>
    <li>Count of alphabets</li>
</ul>

In [None]:
# Sample Website URL information
site ="https://www.google.co.in/?gws_rd=ssl"
# tldextract: how the host name splits into subdomain, domain and suffix
print(tldextract.extract(site))
# urlparse: scheme, netloc, path, params, query and fragment of the same URL
print(urlparse(site))

## Functions

In [None]:
# Feature Extraction Functions
def extract_domains(row):
    """Build the feature list for a single URL.

    Parameters
    ----------
    row : mapping with a 'url' entry, e.g. a DataFrame row passed in by
        ``DataFrame.apply(..., axis='columns')``.

    Returns
    -------
    list
        16 values, in this order:
        * subdomain, domain, suffix ('NA' replaces an empty component);
        * lengths of the six ``urlparse`` components: scheme, netloc, path,
          params, query, fragment (the netloc length is the value that
          ``extract_url`` stores under the column name 'url_len');
        * occurrences of '-', '@', '?', '%' and '.' in the URL;
        * number of digit characters, then number of alphabetic characters.
    """
    url = row['url']
    # Read the three components by attribute rather than iterating the result:
    # newer tldextract releases return an object that carries additional fields
    # and is not guaranteed to be iterable, which would change the list length
    # or raise.
    extracted = tldextract.extract(url)
    domain_info = [
        'NA' if part == '' else part
        for part in (extracted.subdomain, extracted.domain, extracted.suffix)
    ]
    len_info = [len(component) for component in urlparse(url)]
    symbols = ['-', '@', '?', '%', '.']
    count_info = [url.count(sym) for sym in symbols]
    count_info.append(sum(ch.isdigit() for ch in url))
    count_info.append(sum(ch.isalpha() for ch in url))
    return domain_info + len_info + count_info

def extract_url(data):
    """Add the URL-derived feature columns to ``data`` and return it.

    ``extract_domains`` is applied to every row and its 16 outputs are written
    onto ``data`` itself (the frame passed in is modified) under fixed column
    names. The same frame object is returned.
    """
    feature_names = [
        'subdomain', 'domain', 'suffix',
        'scheme_len', 'url_len', 'path_len', 'param_len', 'query_len', 'frag_len',
        'count-', 'count@', 'count?', 'count%', 'count.', 'count_digit', 'count_alpha',
    ]
    # result_type='expand' turns each returned list into one row of a DataFrame
    # (instead of a Series holding lists).
    features = data.apply(extract_domains, axis='columns', result_type='expand')
    features.columns = feature_names
    # Column-wise assignment instead of concat: calling this again on the same
    # frame overwrites the feature columns rather than duplicating them.
    for name in feature_names:
        data[name] = features[name]
    return data

In [None]:
# Graph Plotting Functions
def get_frequent_group(data, n_group):
    """Return the log-scaled value counts of the ``n_group`` highest-ranked groups.

    ``data`` is a grouped Series (e.g. ``df.groupby('label')['domain']``).
    The result has one row per (group key, value) pair with two extra columns:
    'values' (log10 of the count) and 'total_values' (sum of 'values' over all
    rows sharing the same second-column value). Only rows whose second-column
    value ranks among the top ``n_group`` by 'total_values' are kept, sorted by
    'total_values' in descending order.
    """
    # Count occurrences and put the counts on a log10 scale.
    counts = data.value_counts().reset_index(name='values')
    counts['values'] = np.log10(counts['values'])

    # The second column is the one being ranked (subdomain / domain / suffix).
    group_col = counts.columns[1]
    totals = counts.groupby(group_col)['values'].sum().to_dict()
    counts['total_values'] = counts[group_col].map(totals)

    # Pick the n_group entries with the highest totals, then keep their rows.
    ranked = counts.sort_values('total_values', ascending=False)
    top_groups = ranked.iloc[:, 1].unique()[:n_group]
    selected = counts[counts.iloc[:, 1].isin(top_groups)]
    return selected.sort_values('total_values', ascending=False)

def plot(data, n_group, title):
    """Show a bar chart of the ``n_group`` most frequent groups, coloured by label.

    ``data`` is forwarded to ``get_frequent_group``; the second column of its
    result is used as the x axis and the log-scaled 'values' as the bar height.
    """
    top_groups = get_frequent_group(data, n_group)
    x_column = top_groups.columns[1]
    chart = px.bar(top_groups, x=x_column, y='values', color='label')
    chart.update_layout(title=title)
    chart.show()

In [None]:
%%time
# Extracting information about URLs
# extract_url writes the new columns onto the frame it is given and returns that
# same frame, so `data` and `dataset` refer to one object after this cell.
data = extract_url(dataset)

In [None]:
# Extracted Data
data.head()

## Data Visualisation

In [None]:
# Number of unique Domains, Sub-Domains, Domain-suffix
unique_counts = {
    'Domain': data['domain'].nunique(),
    'SubDomain': data['subdomain'].nunique(),
    'Suffix': data['suffix'].nunique(),
}
fig = go.Figure(
    data=[go.Bar(x=list(unique_counts.keys()), y=list(unique_counts.values()))],
    layout=go.Layout(title="Number of unique Domains, Sub-Domains, Domain-suffix"),
)
fig.show()

In [None]:
# Most frequent domains; bars are coloured by label and heights are log10 counts.
plot(
    data=data.groupby('label')['domain'],
    n_group=20,
    title='Top 20 Domains Grouped By Labels (Logarithmic Scale)'
)

In [None]:
# Same chart for subdomains.
plot(
    data=data.groupby('label')['subdomain'],
    n_group=20,
    title='Top 20 Sub Domains Grouped By Labels (Logarithmic Scale)'
)

In [None]:
# Same chart for domain suffixes.
plot(
    data=data.groupby('label')['suffix'],
    n_group=20,
    title='Top 20 Domains Suffix Grouped By Labels (Logarithmic Scale)'
)

In [None]:
# Separating dependent and independent variable
# X keeps every column except 'label', so the raw 'url' column is still part of it here.
X = data.drop(columns=['label'], inplace=False)
y = data.loc[:, 'label']

In [None]:
# Encoding labels
# Malicious: 1 and Benign: 0
# (any label other than the exact string 'malicious' becomes 0)
y = (y == 'malicious').astype('int')

In [None]:
# Splitting data into training and test data
# test_size=0.4 holds out 40% of the rows; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [None]:
X_train

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Handling categorical features
# Columns 1:4 are selected by position.
# NOTE(review): this assumes the frame is laid out as url, subdomain, domain,
# suffix, ... (i.e. the CSV contributes only 'url' besides 'label') — confirm.
# The encoder is fitted on the training split only and re-used for the test split.
# NOTE(review): handle_unknown=0 presumably encodes categories unseen during fit
# as 0 — confirm against the category_encoders documentation.
count_encoder = ce.CountEncoder(handle_unknown=0)
X_train.iloc[:, 1:4] = count_encoder.fit_transform(X_train.iloc[:, 1:4])
X_test.iloc[:, 1:4] = count_encoder.transform(X_test.iloc[:, 1:4])

In [None]:
# Scaling of data
# iloc[:, 1:] drops the first column before scaling; the scaler is fitted on the
# training split only.
# The frames are rebuilt from the scaled arrays without an index argument, so
# they get a fresh 0..n-1 index while y_train / y_test keep the index from the split.
# X_train / X_test are overwritten, so running this cell a second time would
# drop one more column.
cols = X_train.columns
scaler_x = StandardScaler()
X_train = pd.DataFrame(scaler_x.fit_transform(X_train.iloc[:, 1:]), columns=cols[1:])
X_test = pd.DataFrame(scaler_x.transform(X_test.iloc[:, 1:]), columns=cols[1:])
del cols

In [None]:
# Correlation between independent variables (features)
# X_train holds only the scaled features; the dependent variable (the label) is
# kept separately in y_train, so the chart title must not say "dependent".
fig = go.Figure(go.Heatmap(x = X_train.columns,
                 y = X_train.columns,
                 z = X_train.corr()))
fig.update_layout(title="Correlation between independent variables")
fig.show()

In [None]:
# Preprocessing OpenPhish Data (unseen data)
# Same pipeline as the training data: extract features, count-encode the three
# categorical columns with the fitted encoder, then scale with the fitted scaler.
openphish_data = extract_url(openphish_sites)
display(openphish_data.head())
encoded_categoricals = count_encoder.transform(openphish_data.iloc[:, 1:4])
openphish_data.iloc[:, 1:4] = encoded_categoricals
feature_names = openphish_data.columns[1:]
scaled_features = scaler_x.transform(openphish_data.iloc[:, 1:])
openphish_data = pd.DataFrame(scaled_features, columns=feature_names)
# Every OpenPhish URL is malicious, so the ground-truth label is 1 for all rows.
openphish_data_label = np.ones(shape=(len(openphish_data), 1), dtype='int')

# Model Training

In [None]:
%%time
# Logistic Regression
# Default hyper-parameters; fitted on the scaled training features.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
%%time
# Decision Tree
# random_state is fixed so the fitted tree is repeatable.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [None]:
%%time
# Random Forest
# 250 trees with a fixed seed; n_jobs=-1 requests all available cores and
# verbose=1 prints progress while fitting.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=250, verbose=1, n_jobs=-1, random_state=14)
rf.fit(X_train, y_train)

In [None]:
%%time
# Naive Bayes
# Gaussian variant with default settings.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
%%time
# K Nearest Neighbours
# 5 neighbours; n_jobs=-1 requests all available cores for the neighbour search.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)

In [None]:
%%time
# Support Vector Machine
# Default SVC settings.
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
%%time
# Artificial Neural Network
# Two hidden layers of 8 ReLU units followed by a single sigmoid output unit.
ann = Sequential()
ann.add(Dense(8, input_dim=X_train.shape[1], activation='relu', kernel_initializer='uniform'))
ann.add(Dense(8, activation='relu', kernel_initializer='uniform'))
ann.add(Dense(1, kernel_initializer='uniform', activation = 'sigmoid'))
# Name the metrics explicitly. Without a name Keras auto-numbers metric
# instances created later in the same session ('precision_1', 'recall_1', ...),
# so re-running this cell would break the history lookups by 'precision' /
# 'recall' (and their 'val_' variants) in the plotting cell.
ann.compile(optimizer = 'Adam', loss = 'binary_crossentropy',
            metrics = [metrics.Precision(name='precision'), metrics.Recall(name='recall')])

# 20% of the training rows are held out for validation during fitting.
history = ann.fit(X_train, y_train, batch_size=64, epochs=20,
                  verbose=1, validation_split=0.2, shuffle=True)

In [None]:
# One subplot row per metric, each showing the training and the validation curve.
fig = make_subplots(3, 1, subplot_titles=('Loss', 'Precision', 'Recall'))

for row, key in enumerate(['loss', 'precision', 'recall'], start=1):
    # (history key, legend name): train score first, then validation score
    for history_key, trace_name in ((key, key), (f'val_{key}', f'val {key}')):
        scores = history.history[history_key]
        fig.add_trace(go.Scatter(
            x=list(range(len(scores))),
            y=scores,
            mode='lines+markers',
            name=trace_name
        ), row, 1)

fig.update_layout(title='Artificial Neural Network metrics')
fig.show()

## Model Testing

In [None]:
%%time
# Logistic Regression model testing
# Pattern shared by all testing cells below: predict on the held-out test split,
# print the confusion matrix, and keep the classification report as a dict
# (output_dict=True) for the comparison tables later on.
lr_y_pred = lr.predict(X_test)
print("--Logistic Regression--")
print("Confusion Matrix: \n", confusion_matrix(y_test,lr_y_pred))
lr_class_report = classification_report(y_test, lr_y_pred, output_dict=True)

# Classification report on Openphish Data
# The OpenPhish labels are all 1, so class 0 has no true samples;
# zero_division=0 sets the undefined scores to 0.
openphish_lr_class_report = classification_report(openphish_data_label,
                                                  lr.predict(openphish_data),
                                                  output_dict=True, zero_division=0)

In [None]:
%%time
# Decision Tree model testing
dt_y_pred = dt.predict(X_test)
print("--Decision Tree--")
print("Confusion Matrix: \n", confusion_matrix(y_test, dt_y_pred))
dt_class_report = classification_report(y_test, dt_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_dt_class_report = classification_report(openphish_data_label,
                                                  dt.predict(openphish_data),
                                                  output_dict=True, zero_division=0)

In [None]:
%%time
# Random Forest model testing
rf_y_pred = rf.predict(X_test)
print("--Random Forest--")
print("Confusion Matrix: \n", confusion_matrix(y_test, rf_y_pred))
rf_class_report = classification_report(y_test, rf_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_rf_class_report = classification_report(openphish_data_label,
                                                  rf.predict(openphish_data),
                                                  output_dict=True, zero_division=0)

In [None]:
%%time
# Naive Bayes model testing
nb_y_pred = nb.predict(X_test)
print("--Naive Bayes--")
print("Confusion Matrix: \n", confusion_matrix(y_test, nb_y_pred))
nb_class_report = classification_report(y_test, nb_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_nb_class_report = classification_report(openphish_data_label,
                                                  nb.predict(openphish_data),
                                                  output_dict=True, zero_division=0)

In [None]:
%%time
# K Nearest Neighbours model testing
knn_y_pred = knn.predict(X_test)
print("--K Nearest Neighbours--")
print("Confusion Matrix: \n", confusion_matrix(y_test, knn_y_pred))
knn_class_report = classification_report(y_test, knn_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_knn_class_report = classification_report(openphish_data_label,
                                                   knn.predict(openphish_data),
                                                   output_dict=True, zero_division=0)

In [None]:
%%time
# Support Vector Machine model testing
svc_y_pred = svc.predict(X_test)
print("--Support Vector Machine--")
print("Confusion Matrix: \n", confusion_matrix(y_test, svc_y_pred))
svc_class_report = classification_report(y_test, svc_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_svc_class_report = classification_report(openphish_data_label,
                                                   svc.predict(openphish_data),
                                                   output_dict=True, zero_division=0)

In [None]:
%%time
# ANN model testing
# The network outputs sigmoid scores; scores >= 0.5 are treated as class 1.
# NOTE(review): the thresholded predictions are booleans while the labels are
# ints; the comparison cells look the report up under the key '1' — confirm the
# report keys stay '0'/'1' with the installed scikit-learn version.
ann_y_pred = ann.predict(X_test)
ann_y_pred = (ann_y_pred >= 0.5)
print("--Artificial Neural Network--")
print("Confusion Matrix: \n", confusion_matrix(y_test, ann_y_pred))
ann_class_report = classification_report(y_test, ann_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_ann_class_report = classification_report(openphish_data_label,
                                                   ann.predict(openphish_data)>=0.5,
                                                   output_dict=True, zero_division=0)

## Model Comparison

In [None]:
# Comparing model performance on testing data
models = ['Logistic Regresiion', 'Decision Tree', 'Random Forest', 'Naive Bayes', 'K Nearest Neighbours', 'Support Vector Machine', 'ANN']
class_reports = [lr_class_report, dt_class_report, rf_class_report, nb_class_report, knn_class_report, svc_class_report, ann_class_report]
# One row per model: name, overall accuracy, then the scores reported for
# class '1' (malicious) with the last entry of that report dropped.
comparison_rows = [
    [model_name, report['accuracy'], *list(report['1'].values())[:-1]]
    for model_name, report in zip(models, class_reports)
]
# Converting list into dataframe
model_compare = pd.DataFrame(
    comparison_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F-Score']
).set_index('Model')
print("Classification Model comparison on Kaggle Dataset")
display(model_compare)

In [None]:
# Comparing model performance on openphish data
models = ['Logistic Regresiion', 'Decision Tree', 'Random Forest', 'Naive Bayes', 'K Nearest Neighbours', 'Support Vector Machine', 'ANN']
openphish_class_reports = [openphish_lr_class_report, openphish_dt_class_report, openphish_rf_class_report, openphish_nb_class_report,
                 openphish_knn_class_report, openphish_svc_class_report, openphish_ann_class_report]
# One row per model: name, overall accuracy, then the scores reported for
# class '1' (malicious) with the last entry of that report dropped.
openphish_rows = [
    [model_name, report['accuracy'], *list(report['1'].values())[:-1]]
    for model_name, report in zip(models, openphish_class_reports)
]
# Converting list into dataframe
openphish_model_compare = pd.DataFrame(
    openphish_rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F-Score']
).set_index('Model')
print("Classification Model comparison on OpenPhish Dataset")
display(openphish_model_compare)

In [None]:
# Grouped bars: one trace per score column, one bar group per model.
fig = go.Figure(data=[
    go.Bar(x=model_compare.index, y=model_compare[score_name], name=score_name)
    for score_name in model_compare.columns
])
fig.update_layout(title='Classification Model Comparision on Kaggle Testing Data',
                  xaxis={'title': 'Models'}, yaxis={'title': 'Score'})
# The y axis is clipped to [0.93, 1]; lower scores fall outside the visible range.
fig.update_yaxes(range=[0.93, 1])
fig.show()

In [None]:
# Grouped bars: one trace per score column, one bar group per model.
fig = go.Figure(data=[
    go.Bar(x=openphish_model_compare.index, y=openphish_model_compare[score_name], name=score_name)
    for score_name in openphish_model_compare.columns
])
fig.update_layout(title='Classification Model Comparision on Openphish Data',
                  xaxis={'title': 'Models'}, yaxis={'title': 'Score'})
# The y axis is clipped to [0.4, 1]; lower scores fall outside the visible range.
fig.update_yaxes(range=[0.4, 1])
fig.show()

<p>Precision denotes how many of the websites the model classified as malicious are actually malicious. Since the OpenPhish data contains only malicious websites, the precision of every model is 1.</p>
<p>The most important metric in this scenario is Recall: out of all the websites that are malicious, it measures how many were predicted as malicious, and our main task is to identify malicious websites.</p>
<p>As evident from the above visualisation, the decision tree classifier performs best on both the Kaggle and OpenPhish datasets.</p>