In [None]:
# Fork from https://www.kaggle.com/code/habibmrad1983/habib-mrad-detection-malicious-url-using-ml-models/notebook
# Decision Tree Example: https://machinelearninggeek.com/decision-tree-classification-in-python/
# ML for cybersecurity: https://github.com/jivoi/awesome-ml-for-cybersecurity?tab=readme-ov-file#-datasets
# https://www.datacamp.com/tutorial/decision-tree-classification-python 

## 1. Setup

### 1.1 Initial Setup and Libraries/Dependencies

In [None]:
!pip install -r ./phishing_url_detector_requirements.txt
#pandas
#numpy
#matplotlib
#scikit-learn
#tld
#re101 - 
#seaborn - for data visualization
#colorama - coloring 
#urllib3

### 1.2 Import Libraries

In [None]:
# ML Data Processing libraries/modules
import numpy as np
import pandas as pd

# ML Model libraries/modules
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier #Stochastic Gradient Descent for Linear models (LogisticRegression, SVM)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay

# URL or Text processing libraries
import re # For regular expressions

# Data Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
from colorama import Fore #To color python outputs 

# URL libraries
from urllib.parse import urlparse
from tld import get_tld, is_tld

## 2. Data Load and Pre-processing

### 2.1 Load Data

In [None]:
# Read the labelled URL dataset from disk into the dataframe `url_df`,
# then preview its first ten rows.
url_df = pd.read_csv(filepath_or_buffer='./phishing_url_data.csv')
url_df.head(n=10)

### Types of Cyber attacks

#### Phishing
Attempts to gather information and steal sensitive data such as usernames and passwords, bank account information, and credit card numbers. It is a social engineering attack. 

#### Defacement
Attempts to cause a denial of service through unauthorized alteration of a website's pages. This can lead to evil twin websites that can be set up to perform additional pivoting attacks. 

#### Malware
Attempts to install and infect MALicious softWARE (and hence the name Malware) that can lead to serious compromises. 



### 2.2 Meta Data information

In [None]:
# Retrieve metadata about the dataframe.
# This reports the columns, the type of data in each, the number of data entries,
# and the memory that is used.
url_df.info()

## 3. Data Pre-processing

### 3.1 Checking for Null or Not a Number (NaN) values

In [None]:
# Per-column count of missing (null / NaN) entries
url_df.isna().sum()

### 3.2 URLs by type of cyber attack

In [None]:
# Tally how many URLs carry each label found in the 'type' column
cyber_attack_count = url_df['type'].value_counts()
cyber_attack_count

In [None]:
# The index of the counts Series holds the distinct 'type' labels
cyber_attack_count_index = cyber_attack_count.index
cyber_attack_count_index

### 3.3 Plotting cyber attack types

In [None]:
# Build a tidy two-column frame (type, count) and plot straight from it, so this
# cell does not depend on variables created in other cells.
counts = url_df['type'].value_counts().rename_axis('type').reset_index(name='count')
snsplt = sns.barplot(x='type', y='count', data=counts)
# Label the figure so it can be read on its own when the notebook is skimmed
snsplt.set(xlabel='Cyber Attack Types',
           ylabel='Count',
           title='Types of Cyber Attacks')
# Print the exact count on top of each bar
snsplt.bar_label(snsplt.containers[0])


In [None]:
#sns.barplot(x=cyber_attack_count.index, y=cyber_attack_count)
#snsplt.set(xlabel = 'Cyber Attack Types', 
#           ylabel = 'Count',
#           title = 'Types of Cyber Attacks')
#snsplt.bar_label(snsplt.containers[0])

### 3.4 URL data processing

### 3.4.1 Removing www. from the URL

In [None]:
# Remove the literal "www." prefix text from every URL.
# The dot is escaped: with regex=True an unescaped '.' matches ANY character, so the
# previous pattern 'www.' also deleted text such as "wwwa" from "awwwards.com".
url_df['url'] = url_df['url'].replace(r'www\.', '', regex=True)
# Preview only; avoid dumping the whole frame into the notebook output
url_df.head(10)

### 3.4.2 Adding Category to represent type with Numbers (Category Numbers)

In [None]:
# Adding Category to represent type with Numbers (Category Numbers)
# benign:0, defacement:1, phishing:2, malware:3
category_codes = {"benign": 0, "defacement": 1, "phishing": 2, "malware": 3}
# Kept for backward compatibility with any cell that still reads this name
replacement_data = {"Category": category_codes}

# Map the labels explicitly instead of running DataFrame.replace over the whole
# frame: replace() depends on pandas silently downcasting the object column to
# integers, which is deprecated and can leave an object-dtype target column.
url_df['Category'] = url_df['type'].map(category_codes)

# Fail loudly on labels we have no code for rather than training on bad targets
unmapped_types = url_df.loc[url_df['Category'].isna(), 'type'].unique()
if len(unmapped_types) > 0:
    raise ValueError('Unmapped values in "type" column: {}'.format(list(unmapped_types)))

url_df['Category'] = url_df['Category'].astype('int64')
url_df.head(10)

## 4. Feature Extraction - Data Preparation

### 4.1 Adding a url length column

In [None]:
# 4.1 New column 'url_len': number of characters in the string form of each URL
url_df['url_len'] = [len(str(address)) for address in url_df['url']]
url_df.head(n=10)

### 4.2 Extract TLD from whole URL

In [None]:
# Function to extract and get top level domain 
# netloc - returns the network location, which will be the domain (and subdomain if present),
# the port number, and any credentials supplied (if present in the URL)
# Resulting Form: username:password@domain.com:443

def getTLD(url):
    """Return the network location (netloc) of ``url`` as parsed by ``tld.get_tld``.

    Despite its name, the function returns ``parsed_url.netloc`` of the lookup
    result (domain plus any subdomain, port and credentials), not just the TLD.

    Parameters
    ----------
    url : str
        URL to inspect. ``fix_protocol=True`` is passed to ``get_tld`` so URLs
        without a scheme are accepted.

    Returns
    -------
    str or None
        The netloc, or ``None`` when ``get_tld`` raises for this URL.
    """
    try:
        # Extract tld information from the url supplied
        tldomain = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        primary_domain = tldomain.parsed_url.netloc
    except Exception:
        # Best-effort feature: any lookup/parsing failure maps to None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running `.apply` can still be interrupted.
        primary_domain = None
    return primary_domain

In [None]:
# 4.2 New column 'domain': result of getTLD for every URL (None when lookup fails)
url_df['domain'] = url_df['url'].apply(getTLD)
url_df.head(n=10)

### 4.3 Count special characters in URL

In [None]:
# One new column per token, named after the token itself, holding how many
# (non-overlapping) times that token occurs in each URL.
spl_characters = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
for token in spl_characters:
    url_df[token] = [address.count(token) for address in url_df['url']]

In [None]:
# Preview the frame with the new special-character count columns
url_df.head(n=10)

### 4.4 Identify whether a URL is non-conformant (abnormal)

In [None]:
# Function to identify the number of non-conformant or abnormal URLs 
# urlparse() parses the URL into six components and returns a 6-tuple
# Tuples are immutable data types that can store multiple values of different data types, kind of like a list
# Return 1 if conformant or else it returns 0 for nonconformant urls

def isNonConformantUrl(url):
    """Flag whether ``url`` contains the hostname that ``urlparse`` extracts from it.

    Return convention (kept from the original; the name is historical):
    ``1`` means a hostname was parsed and is present in the URL (conformant),
    ``0`` means no usable hostname was found (non-conformant).

    Parameters
    ----------
    url : str
        The URL text to inspect.

    Returns
    -------
    int
        1 if conformant, otherwise 0.
    """
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket); such a URL is by definition non-conformant.
        return 0

    # No hostname at all (e.g. a URL without a scheme). Previously None was turned
    # into the text "None" and could spuriously match URLs containing that word.
    if not hostname:
        return 0

    # Plain substring test instead of re.search(hostname, url): a hostname is not a
    # regex, so characters such as '+' or '(' used to raise re.error and '.' acted
    # as a wildcard. urlparse lower-cases the hostname, so compare case-insensitively.
    if hostname.lower() in url.lower():
        return 1
    return 0

In [None]:
# New column 'nonconformant_url': the 0/1 flag produced by isNonConformantUrl per URL
url_df['nonconformant_url'] = url_df['url'].apply(isNonConformantUrl)
url_df.head(n=10)

#### Plot Conformant and Non-conformant URLs

In [None]:
# Bar chart of how many URLs fall into each value of the conformance flag
snsplt = sns.countplot(data=url_df, x='nonconformant_url')
snsplt.set_xlabel('URL Conformance')
snsplt.set_ylabel('Count')
snsplt.set_title('URLs (non-conformant:0, conformant:1)')
# Write the count above each bar
snsplt.bar_label(snsplt.containers[0])

### 4.5 Identify whether a URL is secure

In [None]:
# Function to determine if a URL is secure or not 
# urlparse() parses the URL into six components and returns a 6-tuple
# Tuples are immutable data types that can store multiple values of different data types, kind of like a list
# Return 1 if secure or else it returns 0 for nonsecure urls
# urllib supports the following URL schemes - http, https, ftp, gopher, file, ...

def isSecureUrl(url):
    """Return 1 when the scheme parsed from ``url`` is exactly 'https', else 0.

    The scheme is taken from ``urlparse(url).scheme`` and compared as text, so a
    URL without a scheme, or with any other scheme (http, ftp, ...), yields 0.
    """
    scheme_text = str(urlparse(url).scheme)
    return 1 if scheme_text == 'https' else 0

In [None]:
# New column 'secure_url': the 0/1 flag produced by isSecureUrl per URL
url_df['secure_url'] = url_df['url'].apply(isSecureUrl)
url_df.head(n=10)

#### Plot Is Secure URLs 

In [None]:
# Bar chart of how many URLs fall into each value of the secure_url flag
snsplt = sns.countplot(data=url_df, x='secure_url')
snsplt.set_xlabel('URL Security')
snsplt.set_ylabel('Count')
snsplt.set_title('URLs (http:0, https:1)')
# Write the count above each bar
snsplt.bar_label(snsplt.containers[0])

### 4.6 Determine the number of digits in the URL

In [None]:
# Function to determine the number of digits in the URL using the isnumeric function 

def getDigitCount(url):
    """Count the characters of ``url`` for which ``str.isnumeric()`` is true."""
    return sum(1 for character in url if character.isnumeric())

In [None]:
# New column 'num_of_digits': getDigitCount applied to every URL
url_df['num_of_digits'] = url_df['url'].apply(getDigitCount)
url_df.head(n=10)

### 4.7 Determine the number of alpha characters (letters) in the URL

In [None]:
# Function to determine the number of letters  in the URL using the isalpha function 

def getLetterCount(url):
    """Count the characters of ``url`` for which ``str.isalpha()`` is true."""
    alphabetic = [character for character in url if character.isalpha()]
    return len(alphabetic)

In [None]:
# New column 'num_of_letters': getLetterCount applied to every URL
url_df['num_of_letters'] = url_df['url'].apply(getLetterCount)
url_df.head(n=10)

## 5. Model Training

### 5.1 Target Data (Data Cleanup)

In [None]:
# Feature matrix X: everything except the raw text columns ('url', 'type', 'domain')
# and the label column ('Category').
X = url_df.drop(columns=['url', 'type', 'Category', 'domain'])
# Target vector y: the numeric Category column
y = url_df['Category']

In [None]:
# Checking the data in the variable X (the bare expression is rendered as the cell output)
X

In [None]:
# Checking the data in the variable y (the bare expression is rendered as the cell output)
y

### 5.2 Plot Heat Map of Target Data

In [None]:
# Heat map of the pairwise correlations between the columns of X,
# drawn on a 15x15 inch figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(X.corr(), linewidths=.5, ax=ax)

### 5.3 Train Test Splitting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1337,
)

In [None]:
#X_train

In [None]:
#X_test

In [None]:
#y_train

In [None]:
#y_test

### 5.4 Training the Models

In [None]:
# Full list of candidate estimator classes (currently disabled):
#all_models = [DecisionTreeClassifier,
#          LogisticRegression,
#          RandomForestClassifier,
#         AdaBoostClassifier,
#         #KNeighborsClassifier,
#         SGDClassifier,
#         ExtraTreesClassifier,
#         GaussianNB]

# Estimator classes (not instances) that the training loops iterate over
models = [RandomForestClassifier]

In [None]:
# Collects one test-set accuracy value per trained model
accuracy_test = []

In [None]:
# Train every estimator class in `models`, then report its accuracy, classification
# report and normalized confusion matrix on the held-out test set.

# Reset the accumulator inside this cell so that re-running it does not append
# duplicate scores (which would break the Model/Accuracy DataFrame built later).
accuracy_test = []

for m in models:
    print('\033[32m----------------------Begin-----------------------\033[0m')
    print('######-Model =>\033[07m {} \033[0m'.format(m))

    # NOTE: if LogisticRegression is enabled and fails to converge, instantiate it
    # with max_iter=1000; detect it with `m.__name__ == "LogisticRegression"`
    # (`type(m).__name__` would give the metaclass name, since `m` is a class).
    model_ = m()

    # Seed stochastic estimators (same seed as the train/test split) so the reported
    # numbers are reproducible; estimators without `random_state` are left untouched.
    if 'random_state' in model_.get_params():
        model_.set_params(random_state=1337)

    model_.fit(X_train, y_train)
    pred = model_.predict(X_test)
    # scikit-learn metric convention is (y_true, y_pred)
    acc = accuracy_score(y_test, pred)
    accuracy_test.append(acc)
    print('Test Accuracy :\033[32m \033[01m {:.2f}% \033[30m \033[0m'.format(acc*100))
    print('\033[01m                      Classification Report  \033[0m')
    print(classification_report(y_test, pred))
    print('\033[01m                      Confusion Matrix  \033[0m')
    cf_matrix = confusion_matrix(y_test, pred)
    # Each cell is shown as a share of all test samples
    plot_ = sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True, fmt='0.2%')
    plt.show()
    print('\033[31m----------------------End-------------------------\033[0m')

In [None]:
# Accuracy summary for the full candidate list (enable together with `all_models`):
#all_ml_output = pd.DataFrame({"Model":['Decision Tree Classifier',
#                                   'Logistic Regression',
#                                  'Random Forest Classifier', 
#                                  'AdaBoost Classifier',
#                                  #'KNeighbors Classifier',
#                                  'SGD Classifier',
#                                  'Extra Trees Classifier',
#                                  'Gaussian Naive Bayes'],
#                         "Accuracy": accuracy_test})

# One row per trained model. The label now names the estimator actually trained
# (RandomForestClassifier); it previously read 'Random Tree Classifier'.
ml_output = pd.DataFrame({"Model":['Random Forest Classifier'],
                         "Accuracy": accuracy_test})

In [None]:
# Display the held-out feature rows
X_test

In [None]:
# Bar chart of accuracy per model, with each bar annotated by its value (2 decimals)
fig, ax = plt.subplots(figsize=(10, 5))
plots = sns.barplot(data=ml_output, x='Model', y='Accuracy', ax=ax)
for bar in plots.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    # Place the text 8 points above the top-centre of the bar
    plots.annotate(f'{bar_height:.2f}',
                   (bar_center, bar_height),
                   ha='center',
                   va='center',
                   size=15,
                   xytext=(0, 8),
                   textcoords='offset points')
ax.set_xlabel("Models", size=14)
plt.xticks(rotation=20)
ax.set_ylabel("Accuracy", size=14)
plt.show()

## 6. Model Evaluation (Testing)

### 6.1 URL Processing Function

In [None]:
# Function to convert and process urls supplied to determine of the URL is malicious or not

def URLProcessor(urls):
    """Build the model's feature matrix for one URL or a collection of URLs.

    The same features as the training pipeline are produced, in the same column
    order: ``url_len``, one count column per special token, ``nonconformant_url``,
    ``secure_url``, ``num_of_digits`` and ``num_of_letters``.

    Parameters
    ----------
    urls : str or iterable of str
        A single URL or several URLs to featurize.

    Returns
    -------
    pandas.DataFrame
        One row per URL, containing only the numeric feature columns.
    """
    # Accept a single URL explicitly instead of relying on pandas' scalar handling
    if isinstance(urls, str):
        urls = [urls]

    _df = pd.DataFrame()
    _df['url'] = pd.Series(urls)

    # The training data has "www." removed before any feature is computed; do the
    # same here, otherwise length/dot/letter counts differ between training and
    # prediction for any URL that contains "www.".
    _df['url'] = _df['url'].apply(lambda x: x.replace('www.', ''))

    _df['url_len'] = _df['url'].apply(lambda x: len(str(x)))
    # Must stay identical (values and order) to the token list used for training
    feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
    for f in feature:
        _df[f] = _df['url'].apply(lambda x: x.count(f))
    _df['nonconformant_url'] = _df['url'].apply(lambda x: isNonConformantUrl(x))
    _df['secure_url'] = _df['url'].apply(lambda x: isSecureUrl(x))
    _df['num_of_digits'] = _df['url'].apply(lambda x: getDigitCount(x))
    _df['num_of_letters'] = _df['url'].apply(lambda x: getLetterCount(x))
    #_df['is_shortened_url'] = _df['url'].apply(lambda x: isShortenedURL(x))
    #_df['is_IP_in_url'] = _df['url'].apply(lambda x: isIPinURL(x))

    # The 'domain' column is no longer computed: it was dropped before returning,
    # so the TLD lookup was wasted work. The debug print of the column list was
    # removed as well. The raw text column is not a model feature.
    X = _df.drop(['url'], axis=1)

    return X

### 6.2 Input - Test URLs

In [None]:
# URLs to check if they are malicious or not.
# NOTE: `urls` is reassigned to a shorter list by the next cell before it is used.
urls= ['diaryofagameaddict.com',
       'crackspider.us/toolbar/install.php?pack=exe',
       'pashminaonline.com/pure-pashminas',
       'espdesign.com.au',
       'iamagameaddict.com',
       'kalantzis.net',
       'slightlyoffcenter.net',
       'toddscarwash.com',
       'tubemoviez.com',
       'ipl.hk',
       'pos-kupang.com/',
       'rupor.info',
       'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
       'officeon.ch.ma/office.js?google_ad_format=728x90_as',
       'sn-gzzx.com',
       'sunlux.net/company/about.html',
       'outporn.com',
       'timothycopus.aimoo.com',
       'xindalawyer.com',
       'freeserials.spb.ru/key/68703.htm',
       'deletespyware-adware.com',
       'orbowlada.strefa.pl/text396.htm',
       'ruiyangcn.com',
       'zkic.com',
       'adserving.favorit-network.com/eas?camp=19320;cre=mu&grpid=1738&tag_id=618&nums=FGApbjFAAA',
       'cracks.vg/d1.php',
       'nuptialimages.com',
       'andysgame.com',
       'bezproudoff.cz',
       'ceskarepublika.net',
       'hotspot.cz',
       'gmcjjh.org/DHL',
       'nerez-schodiste-zabradli.com',
       'nordiccountry.cz',
       'nowina.info',
       'obada-konstruktiwa.org',
       'otylkaaotesanek.cz',
       'pb-webdesign.net',
       'pension-helene.cz',
       'podzemi.myotis.info',
       'smrcek.com',
       'spekband.com',
       'm2132.ehgaugysd.net/zyso.cgi?18',
       'webcom-software.ws/links/?153646e8b0a88',
       'worldgymperu.com',
       'zgsysz.com',
       'oknarai.ru',
       'realinnovation.com/css/menu.js']

In [None]:
# URLs to check if they are malicious or not.
# This short list replaces any value previously assigned to `urls`.
urls= ['crackspider.us/toolbar/install.php?pack=exe',
       'deletespyware-adware.com',
       'realinnovation.com/css/menu.js']

### 6.3 Assigning Test URLs into a variable (test_data)

In [None]:
# Convert the URLs in `urls` into the feature frame returned by URLProcessor
test_data = URLProcessor(urls)
# Alternative: featurize a single URL passed as a plain string
#test_data = URLProcessor('http://ak.imgfarm.com/images/nocache/vicinio/installers/205320000.S10570.1/507981-150710122501-S10570.1/VideoDownloadConvertAuto.exe_0')

In [None]:
# Show which estimator classes are currently selected
models

### 6.4 Using models to predict Test URL

In [None]:
import joblib

# Fit each selected estimator on the training split and persist it to disk as
# 'HackerURLDetectorModel_<EstimatorClassName>.joblib' in the working directory.
# benign:0, defacement:1, phishing:2, malware:3
for m in models:
    print('---------------------------------------')
    print('######-Model =>\033[07m {} \033[0m'.format(m))
    model_ = m()
    # Seed stochastic estimators (same seed as the train/test split) so the saved
    # artifact is reproducible instead of a different random model on every run;
    # estimators without `random_state` are left untouched.
    if 'random_state' in model_.get_params():
        model_.set_params(random_state=1337)
    model_.fit(X_train, y_train)
    #Save the model (model_) 
    model_name = type(model_).__name__
    model_name_to_save ='HackerURLDetectorModel_' + model_name + '.joblib'
    joblib.dump(model_, model_name_to_save)
    print(model_name_to_save + ' was saved')
    #pred = model_.predict(test_data)
    #print(pred)

### 6.5 Model Evaluation (Accuracy) Report

In [None]:
# Rebuild the accuracy summary table (one row per trained model) and print it as text
accuracy_table = {
    "Model": ['Random Forest Classifier'],
    "Accuracy": accuracy_test,
}
ml_output = pd.DataFrame(data=accuracy_table)
print(ml_output)

In [None]:
# Load the saved joblib model from the working directory to make predictions.
# The file name matches the one written by the save loop for RandomForestClassifier.
# NOTE(review): joblib files are presumably pickle-based — only load artifacts you
# produced yourself or otherwise trust; confirm against the joblib documentation.
malurl_model = joblib.load('HackerURLDetectorModel_RandomForestClassifier.joblib')
#predictions = malurl_model.predict(x_test)
#predictions

In [None]:
# Pick ONE sample URL to classify (uncomment an alternative to try another class).
# malware
url = "http://ak.imgfarm.com/images/nocache/vicinio/installers/205320000.S10570.1/507981-150710122501-S10570.1/VideoDownloadConvertAuto.exe_0"

# defacement
#url = "http://www.raci.it/component/user/reset.html"

# phishing
# (no sample provided)

# benign
#url = "https://hallmark.businessgreetings.com"
#url = "https://parsippanysoccerclub.org/"

# Turn the URL into the feature row expected by the model
test_data = URLProcessor(url)

# Numeric class code -> readable label (benign:0, defacement:1, phishing:2, malware:3)
class_mapping = dict(enumerate(['benign', 'defacement', 'phishing', 'malware']))

# Predict with the model loaded from disk; only one URL was supplied, so take
# the first (and only) prediction.
predicted_classes = malurl_model.predict(test_data)
prediction_int = predicted_classes[0]
prediction_label = class_mapping.get(prediction_int, 'Unknown')
print(url, prediction_int, prediction_label)