<a href="https://colab.research.google.com/github/Waldea/Phishing-URL-Detection/blob/main/Omara.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **2. DATA COLLECTION**

The data contains both phishing/malign URL and clean/benign URL.
The phishing URLs are crawled from phishtank.org while the clean data comes from commoncrawl.org.
The data is split into 2 set: Training set and Test set.
The Training set has 2 million URLs for each phishing and clean data. The Test set has 1 Million phish URLs and 1 million clean URLs.

In [8]:
#importing basic packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
#Loading the data
# We then specify the column names using 'names=['url']'.
benign_test_file = pd.read_table('benign_Test.txt', header=None, names=['url'])
malign_test_file = pd.read_table('malign_Test.txt', header=None, names=['url'])
benign_train_file = pd.read_table('benign_Train.txt', header=None, names=['url'])
malign_train_file = pd.read_table('malign_Train.txt', header=None, names=['url'])

# Now you can access the 'url' column:
benign_test_file.head()
malign_test_file.head()
benign_train_file.head()
malign_train_file.head()

Unnamed: 0,url
0,bosquedarobina.blogspot.com.au
1,get.aknigi.club
2,amazon.com/experiment-adrien-brody/dp/b003vel9ei
3,sexymaryjane.blogspot.be
4,linkfreak.blogspot.de


In [18]:
# Label the data (1 for phishing, 0 for clean)
benign_train_file["label"] = "benign_train" # Assign "benign_train" to the "label" column
malign_train_file["label"] = "malign_train" # Assign "malign_train" to the "label" column
benign_test_file["label"] = "benign_test"   # Assign "benign_test" to the "label" column
malign_test_file["label"] = "malign_test"   # Assign "malign_test" to the "label" column


benign_test_file.head()
malign_test_file.head()
benign_train_file.head()
malign_train_file.head()

Unnamed: 0,url,label
0,bosquedarobina.blogspot.com.au,malign_train
1,get.aknigi.club,malign_train
2,amazon.com/experiment-adrien-brody/dp/b003vel9ei,malign_train
3,sexymaryjane.blogspot.be,malign_train
4,linkfreak.blogspot.de,malign_train


In [19]:
# Convert to csv
benign_train_file.to_csv("benign_Train.txt", sep="\t", index=False, header=True)
malign_train_file.to_csv("malign_Train.txt", sep="\t", index=False, header=True) # Add to_csv
benign_test_file.to_csv("benign_Test.txt", sep="\t", index=False, header=True) # Add to_csv
malign_test_file.to_csv("malign_Test.txt", sep="\t", index=False, header=True) # Add to_csv

In [21]:
# Combine the training and test datasets
train_data = pd.concat([benign_train_file, malign_train_file], ignore_index=True)
test_data = pd.concat([benign_test_file, malign_test_file], ignore_index=True)

In [22]:
combined_data = pd.concat([train_data, test_data], ignore_index=True)
combined_data.head()

Unnamed: 0,url,label
0,karens-krazy-korner.blogspot.com/2010/,benign_train
1,www.tmentertainment.com/ccount/click.php?id=173,benign_train
2,asian.videos.xblog.in/thai-wife-get-fuck-by-hu...,benign_train
3,kpn.se/resurser/bilder-att-farglagga,benign_train
4,www.indokitay.ru/landing/toniki-aasha-herbals/,benign_train


In [23]:
combined_data.shape

(6568184, 2)

## **3. FEATURE EXTRACTION:**

In this step, features are extracted from the URLs dataset.

The extracted features are categorized into


1.   Address Bar based Features
2.   Domain based Features
3.   HTML & Javascript based Features
4.   Special Character based Features




**# 3.1 Address Bar based features**

In [24]:
# importing required packages for this section
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [25]:
# 1.Domain of the URL (Domain)
def getDomain(url):
  domain = urlparse(url).netloc
  if re.match(r"^www.",domain):
	       domain = domain.replace("www.","")
  return domain

In [26]:
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip


In [27]:
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  if "@" in url:
    at = 1
  else:
    at = 0
  return at

In [28]:
# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  if len(url) < 54:
    length = 0
  else:
    length = 1
  return length

In [29]:
# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

In [30]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

In [31]:
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  domain = urlparse(url).netloc
  if 'https' in domain:
    return 1
  else:
    return 0

In [32]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [33]:
# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

In [34]:
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

 **3.2 Domain Based Features**

In [35]:
!pip install python-whois

Collecting python-whois
  Downloading python_whois-0.9.4-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.4-py3-none-any.whl (103 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.4


In [36]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [37]:
# 11.DNS Record availability (DNS_Record)
# obtained in the featureExtraction function itself

In [43]:
# 12.Web traffic (Web_Traffic)
def web_traffic(url):
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except TypeError:
        return 1
  if rank <100000:
    return 1
  else:
    return 0

    print(rank)

In [44]:
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 1
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

  print(ageofdomain)

In [40]:
# 14.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date,str):
    try:
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if (expiration_date is None):
      return 1
  elif (type(expiration_date) is list):
      return 1
  else:
    today = datetime.now()
    end = abs((expiration_date - today).days)
    if ((end/30) < 6):
      end = 0
    else:
      end = 1
  return end

# **4.Extract JS features**

In [41]:
import requests
from bs4 import BeautifulSoup
import re

def extract_js_features(url):
    try:
        # Fetch HTML content of the website
        response = requests.get(url)
        html_content = response.text

        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract all <script> tags
        script_tags = soup.find_all('script')

        # Count number of script tags
        js_count = len(script_tags)

        # Check for inline JavaScript (JavaScript embedded within HTML)
        inline_js = sum(1 for script in script_tags if not script.get('src'))

        # Extract potential JavaScript-related attributes (onload, onclick, etc.)
        js_events = re.findall(r'on\w+=["\'].*?["\']', html_content)
        js_event_count = len(js_events)

        return js_count, inline_js, js_event_count

    except Exception as e:
        return None, None, None

# Example usage:
url = 'http://example.com'
js_count, inline_js, js_event_count = extract_js_features(url)
print(f'JS Scripts: {js_count}, Inline JS: {inline_js}, JS Events: {js_event_count}')


JS Scripts: 0, Inline JS: 0, JS Events: 2


# **5. Extract SpecialChar features**

In [42]:
import re
from urllib.parse import urlparse, unquote

def extract_special_characters_from_url(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Check for encoded characters (e.g., %20 for space)
    encoded_characters = re.findall(r'%[0-9A-Fa-f]{2}', url)

    # Check for special characters like &, ?, #, @, !, etc.
    special_characters = re.findall(r'[\&\?\#\!\@\$\%\*\+\=]', url)

    # Check for unusual domain names (e.g., punycode or suspicious subdomains)
    domain = parsed_url.netloc
    special_in_domain = re.findall(r'[^\x00-\x7F]', domain)  # Non-ASCII characters

    # Decode URL and check for hidden or special characters after decoding
    decoded_url = unquote(url)
    decoded_special_characters = re.findall(r'[^\x00-\x7F]', decoded_url)

    return {
        "encoded_characters": encoded_characters,
        "special_characters": special_characters,
        "special_in_domain": special_in_domain,
        "decoded_special_characters": decoded_special_characters
    }

# Example usage:
url = 'http://example.com/?search=%3Cscript%3Ealert%281%29%3C/script%3E'
special_features = extract_special_characters_from_url(url)
print(special_features)


{'encoded_characters': ['%3C', '%3E', '%28', '%29', '%3C', '%3E'], 'special_characters': ['?', '=', '%', '%', '%', '%', '%', '%'], 'special_in_domain': [], 'decoded_special_characters': []}
