In [2]:
import math
from collections import Counter
import re
import pandas as pd
import base64

In [1]:
def is_https(url):
    """Report whether ``url`` uses the HTTPS scheme.

    URI schemes are case-insensitive, and the other helpers in this notebook
    trim and lower-case the URL before inspecting it, so the same
    normalisation is applied here.

    Args:
        url: The URL string to inspect.

    Returns:
        True if the trimmed URL starts with ``https://`` in any letter case,
        otherwise False.
    """
    return url.strip().lower().startswith("https://")

In [1]:
def url_length(url):
    """Return the character count of ``url`` after trimming surrounding whitespace."""
    trimmed = url.strip()
    return len(trimmed)

In [1]:
def domain_length(url):
    """Extract the host part of ``url`` and measure it.

    The URL is trimmed and lower-cased, a leading ``http://`` / ``https://``
    is removed, and the host is taken as everything up to the first ``/``,
    ``?`` or ``#``, minus any ``user[:password]@`` prefix and ``:port`` suffix.

    Args:
        url: The URL string to inspect.

    Returns:
        A ``(domain, length)`` tuple where ``length == len(domain)``.
    """
    url = url.strip().lower()

    if url.startswith("https://"):
        url = url[8:]
    elif url.startswith("http://"):
        url = url[7:]

    # The authority ends at whichever of '/', '?' or '#' comes first; cutting
    # only at '/' would leak the query of "example.com?x=1" into the domain.
    authority = url
    for delimiter in "/?#":
        authority = authority.split(delimiter, 1)[0]

    # Drop userinfo so "paypal.com@evil.com" resolves to the real host.
    host = authority.rsplit("@", 1)[-1]

    domain = host.split(":")[0]
    return domain, len(domain)

14


In [2]:
def path_length(url):
    """Extract the path component of ``url`` and measure it.

    The URL is trimmed and lower-cased and a leading ``http://`` /
    ``https://`` is removed. The query and fragment are cut off *before*
    looking for the first ``/`` so that a slash inside them (for example
    ``example.com?next=a/b``) is not mistaken for the start of the path.

    Args:
        url: The URL string to inspect.

    Returns:
        ``("Path absent", 0)`` when there is no ``/`` after the host,
        ``("", 0)`` when the ``/`` is followed by nothing, otherwise
        ``(path, len(path))`` with ``path`` starting with ``/``.
    """
    url = url.strip().lower()

    # Remove scheme
    if url.startswith("https://"):
        url = url[8:]
    elif url.startswith("http://"):
        url = url[7:]

    # Keep only host + path: everything before the first '#' or '?'.
    resource = url.split('#', 1)[0].split('?', 1)[0]

    parts = resource.split('/', 1)
    if len(parts) == 1:
        return "Path absent", 0

    path = parts[1]
    clean_path = '/' + path if path else ''
    return clean_path, len(clean_path)

In [3]:
def query_length_and_count(url):
    """Extract the query string of ``url`` with its length and parameter count.

    The URL is trimmed and lower-cased. The fragment is removed first, so a
    ``?`` that appears only after ``#`` is not treated as a query.

    Args:
        url: The URL string to inspect.

    Returns:
        ``("Query absent", 0, 0)`` when there is no ``?`` before the fragment,
        otherwise ``(query, len(query), count)`` where ``count`` is the number
        of ``&``-separated pieces (0 for an empty query).
    """
    url = url.strip().lower()

    # Everything after the first '#' is fragment, including any '?' in it.
    without_fragment = url.split('#', 1)[0]

    if '?' not in without_fragment:
        return "Query absent", 0, 0  # No query present

    query = without_fragment.split('?', 1)[1]

    query_length = len(query)
    query_count = len(query.split('&')) if query else 0

    return query, query_length, query_count

In [7]:
def fragment_length(url):
    """Return the fragment of ``url`` (text after the first ``#``) and its length.

    The URL is trimmed and lower-cased before the fragment is taken. When no
    ``#`` is present the result is ``("Fragment absent", 0)``.
    """
    _, marker, fragment = url.strip().lower().partition('#')

    if not marker:
        return "Fragment absent", 0

    return fragment, len(fragment)

In [3]:
def shannon_entropy(url):
    """Compute the Shannon entropy (in bits) of the characters in ``url``.

    Returns 0.0 for an empty or falsy input. Otherwise each distinct
    character contributes ``-p * log2(p)``, where ``p`` is its relative
    frequency in the string.
    """
    if not url:
        return 0.0

    total_chars = len(url)
    probabilities = [
        occurrences / total_chars for occurrences in Counter(url).values()
    ]

    bits = 0.0
    for probability in probabilities:
        bits -= probability * math.log2(probability)

    return bits

print(shannon_entropy("211.98.21.34"))

2.625814583693911


In [1]:
import re

def count_dots(url):
    """Return how many ``.`` characters occur in ``url``."""
    return sum(1 for ch in url if ch == '.')

In [2]:
def count_at_symbols(url):
    """Return how many ``@`` characters occur in ``url``."""
    # Splitting on a separator yields one more piece than there are separators.
    pieces = url.split('@')
    return len(pieces) - 1

In [3]:
def count_equals(url):
    """Return how many ``=`` characters occur in ``url``."""
    equals_signs = [ch for ch in url if ch == '=']
    return len(equals_signs)

In [4]:
def count_special_chars(url):
    """Count characters in ``url`` other than ASCII letters, digits, ``.`` and ``/``."""
    special_matches = re.finditer(r'[^a-zA-Z0-9./]', url)
    return sum(1 for _ in special_matches)

In [5]:
def count_slashes(url):
    """Return how many ``/`` characters occur in ``url``."""
    total = 0
    for ch in url:
        if ch == '/':
            total += 1
    return total

In [6]:
def count_hyphens(url):
    """Return how many ``-`` characters occur in ``url``."""
    return sum(1 for ch in url if ch == '-')

In [19]:
def is_domain_ip(url):
    """Report whether the host of ``url`` is a dotted-quad IPv4 literal.

    A leading ``http://`` / ``https://`` (any letter case) is removed, then
    the host is taken as the text before the first ``/`` and before the first
    ``:``. The host qualifies when it consists of exactly four
    ``.``-separated parts, each made only of ASCII digits with a value of at
    most 255.

    Args:
        url: A URL or bare host string.

    Returns:
        True if the host is an IPv4 literal, otherwise False.
    """
    url = url.strip()

    # Remove protocol if present (URI schemes are case-insensitive)
    lowered = url.lower()
    if lowered.startswith("http://"):
        url = url[7:]
    elif lowered.startswith("https://"):
        url = url[8:]

    # Remove path, query, fragment — keep only domain
    domain = url.split('/')[0].split(':')[0]

    parts = domain.split('.')
    if len(parts) != 4:
        return False

    # str.isdigit() alone also accepts characters such as '²', which int()
    # then rejects with ValueError; requiring ASCII avoids that crash.
    return all(
        part.isascii() and part.isdigit() and int(part) <= 255
        for part in parts
    )

In [None]:
def count_subdomains(url):
    """Count the subdomain labels in the host of ``url``.

    A leading ``http://`` / ``https://`` is removed and the host is taken as
    the text before the first ``/`` and before the first ``:``. The host is
    assumed to end in ``<domain>.<tld>``, so the count is the number of
    ``.``-separated labels minus two. Multi-label public suffixes such as
    ``co.uk`` are not recognised and inflate the count by one.

    A dotted-quad IPv4 host has no subdomains and yields 0.

    Args:
        url: A URL or bare host string.

    Returns:
        The number of subdomain labels (0 when there are none).
    """
    # Remove protocol if present
    if url.startswith("http://"):
        url = url[7:]
    elif url.startswith("https://"):
        url = url[8:]

    # Remove port, path, query, and fragment
    domain = url.split('/')[0].split(':')[0]

    # Split domain by dots
    parts = domain.split('.')

    # An IPv4 literal such as 192.168.1.1 is four numeric labels, not
    # "two subdomains + domain + TLD".
    if len(parts) == 4 and all(
        part.isascii() and part.isdigit() and int(part) <= 255
        for part in parts
    ):
        return 0

    # Subdomains = total parts - domain - TLD (never negative)
    return max(len(parts) - 2, 0)

In [9]:
def extract_tld(url):
    """Return the lower-cased last label of the host in ``url``.

    A leading ``http://`` / ``https://`` is removed and the host is taken as
    the text before the first ``/`` and before the first ``:``.

    Args:
        url: A URL or bare host string.

    Returns:
        The final ``.``-separated label in lower case, or ``""`` when the
        host has no dot or is a dotted-quad IPv4 literal (whose last octet
        is not a TLD).
    """
    # Remove scheme
    if url.startswith("http://"):
        url = url[7:]
    elif url.startswith("https://"):
        url = url[8:]

    # Remove port and path
    domain = url.split('/')[0].split(':')[0]

    parts = domain.split('.')
    if len(parts) < 2:
        return ""

    # An IPv4 host has no TLD; without this guard "10.0.0.1" would yield "1".
    if len(parts) == 4 and all(
        part.isascii() and part.isdigit() and int(part) <= 255
        for part in parts
    ):
        return ""

    return parts[-1].lower()

In [10]:
import pandas as pd

def load_csv(csv_path):
    """Read ``csv_path`` into a DataFrame, falling back to an empty one.

    Any exception raised by ``pd.read_csv`` is reported on stdout and an
    empty ``DataFrame`` is returned instead, so callers always get a
    DataFrame back.
    """
    try:
        frame = pd.read_csv(csv_path)
    except Exception as e:
        print(f"[!] Error loading CSV: {e}")
        frame = pd.DataFrame()
    return frame

def is_tld_iana_registered_from_csv(url, tld_set):
    """Report whether the TLD that ``extract_tld`` finds in ``url`` is in ``tld_set``.

    ``extract_tld`` returns its result in lower case, so ``tld_set`` must
    hold lower-case entries for a match to succeed.
    NOTE(review): presumably ``tld_set`` is built from the CSV read by
    ``load_csv`` — confirm where it is constructed.
    """
    return extract_tld(url) in tld_set

In [11]:
# ip condition

def has_sld(url, tld_set):
    """Report whether the second-to-last host label of ``url`` is in ``tld_set``.

    A leading ``http://`` / ``https://`` is removed, the host is taken as the
    text before the first ``/`` and before the first ``:``, and it is
    lower-cased. Hosts with fewer than two labels always yield False.
    """
    # Strip at most one recognised scheme prefix.
    for scheme in ("http://", "https://"):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break

    host = url.split('/')[0].split(':')[0].lower()
    labels = host.split('.')

    # e.g. "example.co.uk": the label before the final one ("co") is tested.
    return len(labels) >= 2 and labels[-2] in tld_set


In [12]:
def check_hex_encoding(url):
    """Detect percent-encoded bytes (``%XX`` with two hex digits) in ``url``.

    Returns a ``(has_encoding, count)`` tuple, where ``count`` is the number
    of non-overlapping ``%XX`` sequences and ``has_encoding`` is ``count > 0``.
    """
    encoded_total = sum(1 for _ in re.finditer(r'%[0-9a-fA-F]{2}', url))
    return encoded_total > 0, encoded_total

In [13]:
import base64

def contains_base64(url):
    """Look for a Base64 token in ``url`` that decodes to printable ASCII.

    Candidate runs are at least 8 characters from the standard Base64
    alphabet, optionally followed by ``=`` padding. Because ``/`` belongs to
    that alphabet, a run can swallow URL path separators (for example
    ``com/dGVzdGluZw==``); when a whole run does not qualify, each of its
    ``/``-delimited pieces with at least 8 non-padding characters is tried
    as well.

    Args:
        url: The URL string to inspect.

    Returns:
        ``(True, token)`` for the first token that strictly decodes to a
        non-empty, printable-ASCII byte string, otherwise
        ``(False, "-------------")``.
    """
    candidates = re.findall(r'([A-Za-z0-9+/]{8,}={0,2})', url)

    for candidate in candidates:
        # Whole run first (keeps earlier matches stable), then its pieces.
        tokens = [candidate]
        if '/' in candidate:
            tokens.extend(
                piece for piece in candidate.split('/')
                if len(piece.rstrip('=')) >= 8
            )

        for token in tokens:
            try:
                decoded = base64.b64decode(token, validate=True)
            except ValueError:
                # binascii.Error (bad length/padding/characters) subclasses
                # ValueError, so this covers every strict-decoding failure.
                continue
            # Filter false positives: require readable (printable ASCII) output
            if decoded and all(32 <= b <= 126 for b in decoded):
                return True, token

    return False, "-------------"

In [14]:
def path_depth(url):
    """Count the non-empty ``/``-separated segments in the path of ``url``.

    A leading ``http://`` / ``https://`` is removed. The query and fragment
    are cut off *before* locating the path so that slashes inside them (for
    example ``example.com?next=/a/b``) do not count towards the depth.

    Args:
        url: The URL string to inspect.

    Returns:
        The number of non-empty path segments (0 when there is no path).
    """
    # Remove scheme
    if url.startswith("http://"):
        url = url[7:]
    elif url.startswith("https://"):
        url = url[8:]

    # Remove query and fragment: keep what precedes the first '#' or '?'
    resource = url.split('#', 1)[0].split('?', 1)[0]

    # Remove domain and port
    path_part = resource.split('/', 1)[1] if '/' in resource else ''

    # Empty segments (from "//" or a trailing "/") are not counted
    return sum(1 for segment in path_part.split('/') if segment)


In [15]:
def digit_count(url):
    """Return the number of characters in ``url`` for which ``str.isdigit()`` is true."""
    digits = [ch for ch in url if ch.isdigit()]
    return len(digits)


In [16]:
def count_colons(url):
    """Return how many ``:`` characters occur in ``url``."""
    # Splitting on a separator yields one more piece than there are separators.
    return len(url.split(':')) - 1


In [17]:
def is_punycode(url):
    """Report whether the host of ``url`` contains a Punycode (``xn--``) label.

    The URL is trimmed and lower-cased (the ``xn--`` prefix is
    case-insensitive). Only the first ``//`` is treated as the scheme
    separator, and only when it directly follows ``scheme:`` or starts the
    string; a ``//`` later in the path or inside a query is ignored. The host
    is the text up to the first ``/``, ``?`` or ``#``, minus any ``user@``
    prefix.

    Args:
        url: The URL string to inspect.

    Returns:
        True if ``xn--`` occurs in the host, otherwise False.
    """
    url = url.strip().lower()

    prefix, separator, remainder = url.partition('//')
    # Taking the *last* '//' piece would pick "login" from "host//login" or an
    # embedded URL from "?u=http://..."; accept '//' only as scheme separator.
    prefix_is_scheme = prefix.endswith(':') and not any(
        ch in prefix for ch in '/?#'
    )
    if separator and (prefix == '' or prefix_is_scheme):
        url = remainder

    for delimiter in '/?#':
        url = url.split(delimiter, 1)[0]

    host = url.rsplit('@', 1)[-1]
    return 'xn--' in host

In [18]:
def character_transition_entropy(text):
    """Compute the Shannon entropy (in bits) of adjacent character pairs in ``text``.

    Every overlapping pair of consecutive characters is counted; the entropy
    is taken over the relative frequencies of those pairs. Inputs shorter
    than two characters yield 0.0.
    """
    pair_total = len(text) - 1
    if pair_total < 1:
        return 0.0

    # zip(text, text[1:]) walks the same overlapping pairs as slicing would.
    pair_counts = Counter(zip(text, text[1:]))

    bits = 0.0
    for occurrences in pair_counts.values():
        share = occurrences / pair_total
        bits -= share * math.log2(share)

    return bits

In [5]:
def extract_features(url):
    """Compute the URL features defined in this notebook for a single URL.

    Calls the helper functions defined in earlier cells (``is_https``,
    ``url_length``, ``domain_length``, ``is_domain_ip``, ``path_length``,
    ``query_length_and_count``, ``fragment_length``, ``shannon_entropy``).

    Args:
        url: The URL string to analyse.

    Returns:
        A dict mapping feature names to the values computed below.
    """
    ishttps = 1 if is_https(url) else 0

    url_len = url_length(url)

    domain, domain_len = domain_length(url)

    # Stored under a different name than the helper: assigning to a local
    # called `is_domain_ip` would shadow the function and break this call.
    domain_is_ip = 1 if is_domain_ip(domain) else 0

    path, path_len = path_length(url)
    query, query_len, no_of_query = query_length_and_count(url)
    fragment, fragment_len = fragment_length(url)

    se_url = shannon_entropy(url)

    # NOTE(review): 10 looks like a fixed sentinel used when the host is an
    # IP address (entropy is not computed for it) — confirm the intent.
    se_domain = 10
    se_path = 0
    se_query = 0
    se_fragment = 0
    if domain_is_ip == 0:
        se_domain = shannon_entropy(domain)
    # The length guards keep the "... absent" placeholder strings returned
    # by the helpers from being scored.
    if path_len > 0:
        se_path = shannon_entropy(path)
    if query_len > 0:
        se_query = shannon_entropy(query)
    if fragment_len > 0:
        se_fragment = shannon_entropy(fragment)

    # TODO: still a placeholder, exactly as in the original cell.
    cte_domain = 0

    return {
        "is_https": ishttps,
        "url_length": url_len,
        "domain_length": domain_len,
        "is_domain_ip": domain_is_ip,
        "path_length": path_len,
        "query_length": query_len,
        "query_count": no_of_query,
        "fragment_length": fragment_len,
        "entropy_url": se_url,
        "entropy_domain": se_domain,
        "entropy_path": se_path,
        "entropy_query": se_query,
        "entropy_fragment": se_fragment,
        "cte_domain": cte_domain,
    }

In [None]:
p