In [None]:
#importing required packages for this module
import pandas as pd

In [None]:
from urllib.parse import urlparse
import re

def extract_domain(url):
    """Return the network location of ``url`` without a leading ``www.``.

    Args:
        url: URL string. It must include a scheme (or start with ``//``),
            otherwise ``urlparse`` reports an empty network location.

    Returns:
        The ``netloc`` component (host, plus port/userinfo when present) with
        a single leading ``"www."`` removed; ``""`` when no netloc is found.
    """
    domain = urlparse(url).netloc
    # Strip only the prefix: str.replace would also delete any later "www."
    # label (e.g. "www.shop.www.example.com" -> "shop.example.com").
    if domain.startswith("www."):
        return domain[len("www."):]
    return domain


In [None]:
import ipaddress

def contains_ip(url):
    """Report whether ``url`` uses a literal IP address as its host.

    The raw argument is tried first, so a bare address such as ``"10.0.0.1"``
    is still recognised. If that fails, the host is extracted from the URL and
    tested, because ``ipaddress.ip_address`` rejects anything carrying a
    scheme, port or path (``"http://10.0.0.1/login"``).

    Args:
        url: URL string or bare IPv4/IPv6 address.

    Returns:
        1 if the argument or its host component is an IP address, else 0.
    """
    try:
        ipaddress.ip_address(url)
        return 1  # The argument itself is a bare IP address
    except ValueError:
        pass

    try:
        # Prefix scheme-less input with "//" so urlparse treats the leading
        # part as the network location rather than as a path.
        host = urlparse(url if "://" in url else "//" + url).hostname
    except ValueError:
        return 0  # Malformed URL (e.g. unbalanced IPv6 brackets)

    if not host:
        return 0

    try:
        ipaddress.ip_address(host)
        return 1  # Host component is an IP address
    except ValueError:
        return 0  # Host is a regular domain name


In [None]:
def contains_at_symbol(url):
    """Return 1 when an '@' character occurs anywhere in ``url``, otherwise 0."""
    has_at = "@" in url
    return int(has_at)


In [None]:
def categorize_url_length(url):
    """Flag long URLs: returns 1 for 54 or more characters, 0 for fewer."""
    if len(url) < 54:
        return 0
    return 1


In [None]:
from urllib.parse import urlparse

def get_url_depth(url):
    """Return how many non-empty segments the path of ``url`` contains.

    The path is split on '/'; empty pieces (leading, trailing or doubled
    slashes) are not counted.
    """
    depth = 0
    for part in urlparse(url).path.split('/'):
        if part:
            depth += 1
    return depth


In [None]:
def has_redirection(url):
    """Return 1 if the last '//' in ``url`` starts after index 7, else 0.

    A '//' that far into the string lies beyond the 'http://' / 'https://'
    prefix, which this feature treats as a sign of redirection.
    """
    last_double_slash = url.rfind('//')
    if last_double_slash > 7:
        return 1
    return 0


In [None]:
from urllib.parse import urlparse

def has_https_in_domain(url):
    """Return 1 when the text 'https' occurs in the netloc of ``url``, else 0.

    Only the network-location component is inspected; the scheme and the path
    do not count.
    """
    netloc = urlparse(url).netloc
    if "https" in netloc:
        return 1
    return 0


In [None]:
# Regex alternation of known URL-shortening service domains.
# Adjacent raw-string literals inside the parentheses are concatenated into a
# single pattern; some services appear more than once, which is harmless.
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

In [None]:
import re

def is_tiny_url(url, shortening_services):
    """Report whether the host of ``url`` is a known URL-shortening service.

    The pattern is matched against the hostname only, anchored so that it must
    cover the whole host or a whole trailing run of labels. Searching the full
    URL unanchored yields false positives: ``t\\.co`` matches inside
    ``microsoft.com`` and ``x\\.co`` inside ``dropbox.com``.

    Args:
        url: URL string, with or without a scheme.
        shortening_services: Regex alternation of shortener domains
            (e.g. ``r"bit\\.ly|t\\.co"``).

    Returns:
        1 if the host equals, or is a subdomain of, one of the listed
        services; 0 otherwise (including when no host can be parsed).
    """
    try:
        # Prefix scheme-less input with "//" so urlparse still yields a host.
        host = urlparse(url if "://" in url else "//" + url).hostname
    except ValueError:
        host = None  # Malformed URL: no host to test
    if not host:
        return 0

    # (?:^|\.) ... $ restricts matches to complete trailing domain labels.
    anchored = r"(?:^|\.)(?:" + shortening_services + r")$"
    # hostname is lower-cased by urlparse, while the list has mixed-case
    # entries such as "BudURL\.com", hence IGNORECASE.
    return 1 if re.search(anchored, host, re.IGNORECASE) else 0


In [None]:
from urllib.parse import urlparse

def has_prefix_suffix(url):
    """Return 1 when the netloc of ``url`` contains a hyphen, else 0.

    A '-' in the domain is used here as a possible phishing indicator;
    hyphens in the path or query are ignored.
    """
    netloc = urlparse(url).netloc
    return int("-" in netloc)


In [None]:
!pip install python-whois



In [None]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [None]:
# 11.DNS Record availability (DNS_Record)
# obtained in the featureExtraction function itself

In [None]:
from datetime import datetime

def get_domain_age(domain_name):
    """Flag domains whose registration span is shorter than six months.

    The "age" computed here is ``expiration_date - creation_date`` (the
    registered lifetime), not the time elapsed since creation.

    Args:
        domain_name: Object exposing ``creation_date`` and ``expiration_date``
            attributes, each a ``datetime``, a ``"%Y-%m-%d"`` string, a list
            or ``None``.

    Returns:
        1 (phishing indicator) if the span is under 6 months of 30 days, or if
        either date is missing, a list, unparsable or otherwise unusable;
        0 otherwise.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    # Parse each date independently: calling strptime on a value that is not
    # a string (datetime, None) raises TypeError, so only strings are parsed.
    try:
        if isinstance(creation_date, str):
            creation_date = datetime.strptime(creation_date, "%Y-%m-%d")
        if isinstance(expiration_date, str):
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except ValueError:
        return 1  # Invalid date format

    # Handle missing or list-type dates
    if (not creation_date or not expiration_date
            or isinstance(creation_date, list) or isinstance(expiration_date, list)):
        return 1  # Considered phishing if domain details are unclear

    try:
        lifetime = expiration_date - creation_date
    except TypeError:
        # Mixing timezone-aware and naive datetimes is not allowed; compare
        # the wall-clock values instead, which is precise enough for a
        # month-level threshold.
        try:
            lifetime = (expiration_date.replace(tzinfo=None)
                        - creation_date.replace(tzinfo=None))
        except (TypeError, AttributeError):
            return 1  # Values are not comparable dates

    # Calculate the registration span in months
    domain_age_in_months = lifetime.days / 30

    return 1 if domain_age_in_months < 6 else 0  # Phishing if span < 6 months


In [None]:
from datetime import datetime

def get_domain_end_time(domain_name):
    """Encode the time remaining until the domain registration expires.

    Args:
        domain_name: Object exposing an ``expiration_date`` attribute that is
            a ``datetime``, a ``"%Y-%m-%d"`` string, a list or ``None``.

    Returns:
        0 if fewer than 6 months (of 30 days) remain, including domains that
        have already expired; 1 if 6 months or more remain, or if the
        expiration date is missing, a list, unparsable or not a datetime.

    NOTE(review): the original inline comment read "Phishing if domain expires
    in <6 months", which is the opposite of what the code returns. The return
    values are kept unchanged here because downstream consumers may depend on
    this encoding -- confirm which convention is intended.
    """
    expiration_date = domain_name.expiration_date

    # Handle cases where the expiration date is a string
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1  # Date format is invalid

    # Handle missing or list-type expiration dates
    if not expiration_date or isinstance(expiration_date, list):
        return 1  # Expiration details are unclear

    try:
        # Take "now" in the same timezone as the expiry date (None gives a
        # naive local time), since subtracting a naive datetime from an aware
        # one raises TypeError.
        now = datetime.now(expiration_date.tzinfo)
        remaining = expiration_date - now
    except (AttributeError, TypeError):
        return 1  # Value is not a usable datetime

    # Calculate time remaining until expiration in months
    remaining_months = remaining.days / 30

    return 0 if remaining_months < 6 else 1


In [None]:

import requests

In [None]:
import re

def has_iframe_redirection(response):
    """Encode whether the response body mentions an iframe.

    Returns 0 when the text contains '<iframe' or 'frameBorder' (matched
    case-insensitively) and 1 when it contains neither. A falsy ``response``
    also yields 1.
    """
    if response:
        found = re.search(r"<iframe|frameBorder", response.text, re.IGNORECASE)
        return 1 if found is None else 0

    # No usable response: treated as a phishing indicator
    return 1


In [None]:
import re

def has_mouse_over_event(response):
    """Return 1 if 'onmouseover' appears between <script> and </script> tags.

    The tags and the keyword must all sit on one line of the response text;
    matching ignores case. A falsy ``response`` is treated as a phishing
    indicator and also returns 1. Otherwise the result is 0.
    """
    if not response:
        return 1

    match = re.search(r"<script>.*onmouseover.*</script>", response.text, re.IGNORECASE)
    return int(match is not None)


In [None]:
import re

def has_right_click_disabled(response):
    """Encode whether the page script tests for the right mouse button.

    Returns 0 when the response text contains ``event.button == 2`` (optional
    whitespace around ``==``) and 1 when it does not. A falsy ``response``
    also returns 1.
    """
    if response:
        hit = re.search(r"event\.button\s*==\s*2", response.text)
        return 1 if hit is None else 0

    # No usable response: treated as a phishing indicator
    return 1


In [None]:
def has_multiple_forwardings(response):
    """Return 1 when ``response.history`` holds more than two entries, else 0.

    A falsy ``response`` is treated as a phishing indicator and returns 1.
    """
    if response:
        hops = len(response.history)
        return int(hops > 2)

    # No usable response
    return 1


In [None]:
#Function to extract features
def featureExtraction(url):
    """Build the feature vector for a single URL.

    Calls the feature helpers defined earlier in this notebook. The previous
    version referenced names such as ``getDomain``, ``havingIP`` and
    ``tinyURL`` that are not defined in the notebook and raised ``NameError``
    on a fresh kernel.

    Args:
        url: URL string to analyse.

    Returns:
        A list of 16 values: the domain string, followed by 8 address-bar
        flags, 3 domain (WHOIS) flags and 4 HTML/JavaScript flags.
    """
    features = []

    # Address bar based features (domain string + 8 flags)
    features.append(extract_domain(url))
    features.append(contains_ip(url))
    features.append(contains_at_symbol(url))
    features.append(categorize_url_length(url))
    features.append(get_url_depth(url))
    features.append(has_redirection(url))
    features.append(has_https_in_domain(url))
    features.append(is_tiny_url(url, shortening_services))
    features.append(has_prefix_suffix(url))

    # Domain based features (3): DNS record flag, registration span, time to expiry
    dns = 0
    domain_name = None
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        # Best effort: any WHOIS failure is recorded as "no DNS record".
        # Exception (rather than a bare except) lets KeyboardInterrupt through.
        dns = 1

    features.append(dns)
    features.append(1 if dns == 1 else get_domain_age(domain_name))
    features.append(1 if dns == 1 else get_domain_end_time(domain_name))

    # HTML & Javascript based features (4)
    try:
        # A timeout keeps an unresponsive host from hanging the notebook.
        response = requests.get(url, timeout=10)
    except Exception:
        # The page helpers treat a falsy response as "page unavailable".
        response = ""
    features.append(has_iframe_redirection(response))
    features.append(has_mouse_over_event(response))
    features.append(has_right_click_disabled(response))
    features.append(has_multiple_forwardings(response))

    return features

In [None]:
# Extract the features for one URL and store them in a list
label = 0
url = ""

legi_features = []
legi_features = featureExtraction(url)

# Drop the first entry (the domain string) so only the feature flags remain
legi_features1 = legi_features[1:]

print("Extracted Features:", legi_features1)



Extracted Features: [0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0]


In [None]:
import pandas as pd

# Column names for the extracted features, in the order featureExtraction
# appends them (the leading domain string is excluded).
feature_names = ['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record',
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards']

# Drop the domain string so the values line up with feature_names
legi_features1 = legi_features[1:]

# Single-row frame: one URL, one column per feature
feat = pd.DataFrame([legi_features1], columns=feature_names)

# The bare `feat.head()` / `feat.columns` expressions were removed: only the
# last expression of a cell is displayed, so mid-cell they had no effect.
print(feat)


   Have_IP  Have_At  URL_Length  URL_Depth  Redirection  https_Domain  \
0        0        0           1          2            0             0   

   TinyURL  Prefix/Suffix  DNS_Record  Domain_Age  Domain_End  iFrame  \
0        0              0           0           1           1       0   

   Mouse_Over  Right_Click  Web_Forwards  
0           0            1             0  
