In [1]:
import math
from collections import Counter
import re
import pandas as pd
import base64

In [2]:
def is_https(url):
    """Flag whether ``url`` uses the https scheme.

    Returns 1 when ``url`` starts with ``https://`` and 0 when it starts
    with ``http://``. Any other input (another scheme, or a value without
    ``startswith``) is reported on stdout and mapped to ``None``.
    """
    try:
        # Checked in this order: https first, then http.
        for prefix, flag in (("https://", 1), ("http://", 0)):
            if url.startswith(prefix):
                return flag
        raise ValueError("Invalid URL: must start with http:// or https://")
    except Exception as err:
        print(f"Error: {err}")
        return None

In [3]:
def url_length(url):
    """Return ``len(url)``.

    If ``len()`` rejects the value (e.g. ``None`` or an int), the exception
    text is printed and ``None`` is returned instead of raising.
    """
    try:
        n_chars = len(url)
    except Exception as err:
        print(f"{err}")
        return None
    return n_chars

In [5]:
# Add the `is_https` feature column to every training shard (urls_1.csv .. urls_61.csv),
# rewriting each file in place. Missing or unreadable shards are reported and skipped.
for i in range(1, 62):
    file_name = f"../datasets/training_dataset_with_features/urls_{i}.csv"

    try:
        # Read the CSV shard
        df = pd.read_csv(file_name)

        # Add the is_https column (is_https yields 1, 0, or None per URL)
        df["is_https"] = df["url"].apply(is_https)

        # Overwrite the shard; reuse file_name so the read and write paths cannot drift apart
        df.to_csv(file_name, index=False)
        print(f"Processed and saved: urls_{i}.csv")

    except FileNotFoundError:
        print(f"File not found: {file_name}")
    except Exception as e:
        print(f"Error with file {file_name}: {e}")


Processed and saved: urls_1.csv
Processed and saved: urls_2.csv
Processed and saved: urls_3.csv
Processed and saved: urls_4.csv
Processed and saved: urls_5.csv
Processed and saved: urls_6.csv
Processed and saved: urls_7.csv
Processed and saved: urls_8.csv
Processed and saved: urls_9.csv
Processed and saved: urls_10.csv
Processed and saved: urls_11.csv
Processed and saved: urls_12.csv
Processed and saved: urls_13.csv
Processed and saved: urls_14.csv
Processed and saved: urls_15.csv
Processed and saved: urls_16.csv
Processed and saved: urls_17.csv
Processed and saved: urls_18.csv
Processed and saved: urls_19.csv
Processed and saved: urls_20.csv
Processed and saved: urls_21.csv
Processed and saved: urls_22.csv
Processed and saved: urls_23.csv
Processed and saved: urls_24.csv
Processed and saved: urls_25.csv
Processed and saved: urls_26.csv
Processed and saved: urls_27.csv
Processed and saved: urls_28.csv
Processed and saved: urls_29.csv
Processed and saved: urls_30.csv
Processed and saved

In [7]:
# Sample inputs: https and http variants, unsupported or malformed schemes,
# an empty string, and non-string values (None, int).
test_urls = [
    "https://example.com",
    "https://example.com:443",
    "https://example.com/path/to/resource",
    "https://example.com/path?query=1",
    "https://example.com#fragment",
    "https://192.168.1.1",
    "https://192.168.1.1:8080/admin",
    "https://user:pass@example.com",
    "https://sub.domain.example.com",
    "https://localhost",
    "https://[2001:db8::1]",
    "https://255.255.255.255",
    "http://example.com",
    "http://example.com:80",
    "http://example.com/page",
    "http://192.168.1.1",
    "http://192.168.1.1:8080/login",
    "http://localhost:3000",
    "http://user:pass@site.com",
    "http://sub.domain.example.com",
    "http://[2001:db8::1]",
    "http://0.0.0.0",
    "ftp://example.com",
    "example.com",
    "//example.com",
    "htp://example.com",
    "https:/example.com",
    "http:example.com",
    "randomtext",
    "",
    None,
    12345
]

# This cell exercises url_length, so the printed label names that function
# (it previously read "is_https", which mislabelled the values shown).
for url in test_urls:
    result = url_length(url)
    print(f"{url!r:45} -> url_length: {result}")

'https://example.com'                         -> is_https: 19
'https://example.com:443'                     -> is_https: 23
'https://example.com/path/to/resource'        -> is_https: 36
'https://example.com/path?query=1'            -> is_https: 32
'https://example.com#fragment'                -> is_https: 28
'https://192.168.1.1'                         -> is_https: 19
'https://192.168.1.1:8080/admin'              -> is_https: 30
'https://user:pass@example.com'               -> is_https: 29
'https://sub.domain.example.com'              -> is_https: 30
'https://localhost'                           -> is_https: 17
'https://[2001:db8::1]'                       -> is_https: 21
'https://255.255.255.255'                     -> is_https: 23
'http://example.com'                          -> is_https: 18
'http://example.com:80'                       -> is_https: 21
'http://example.com/page'                     -> is_https: 23
'http://192.168.1.1'                          -> is_https: 18
'http://

In [5]:
def domain_length(url):
    """Extract the host from an http(s) URL and return ``(host, len(host))``.

    The host is the authority component with any ``user:pass@`` userinfo and
    any ``:port`` suffix removed. A bracketed IPv6 literal is returned with
    its brackets (e.g. ``[2001:db8::1]``) so its inner colons are not mistaken
    for a port separator.

    Parameters
    ----------
    url : str
        URL that must start with ``http://`` or ``https://`` (case-sensitive).

    Returns
    -------
    tuple
        ``(host, length)``; ``(None, 0)`` after printing an error when the
        scheme is unsupported or ``url`` is not a string.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # The authority ends at the first '/', '?' or '#', whichever comes first,
        # so "example.com?x=1" and "example.com#top" do not leak into the host.
        authority = re.split(r'[/?#]', rest, maxsplit=1)[0]

        # Drop "user:pass@"; rpartition keeps the whole string when no '@' is present.
        host_port = authority.rpartition('@')[2]

        if host_port.startswith('['):
            # IPv6 literal: keep everything through the closing bracket.
            closing = host_port.find(']')
            domain = host_port[:closing + 1] if closing != -1 else host_port
        else:
            domain = host_port.split(':', 1)[0]

        return domain, len(domain)
    except Exception as e:
        print(f"Error: {e}")
        return None, 0

In [6]:
def path_length(url):
    """Extract the path of an http(s) URL and return ``(path, len(path))``.

    The query (``?...``) and fragment (``#...``) are removed before the path
    is located, so a ``/`` that appears only inside them is not mistaken for
    the start of a path.

    Parameters
    ----------
    url : str
        URL that must start with ``http://`` or ``https://`` (case-sensitive).

    Returns
    -------
    tuple
        * ``("/some/path", length)`` when a non-empty path is present;
        * ``("", 0)`` when the URL ends in a bare ``/`` after the host;
        * ``("Path absent", 0)`` when there is no ``/`` after the host;
        * ``(None, 0)`` after printing an error for an unsupported scheme or
          a non-string argument.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # Cut the query/fragment first: in "example.com?next=a/b" the only '/'
        # belongs to the query, not to a path.
        hier_part = re.split(r'[?#]', rest, maxsplit=1)[0]

        parts = hier_part.split('/', 1)
        if len(parts) == 1:
            return "Path absent", 0

        path = parts[1]

        # A bare trailing slash is reported as an empty path of length 0.
        clean_path = '/' + path if path else ''
        return clean_path, len(clean_path)

    except Exception as e:
        print(f"Error: {e}")
        return None, 0


In [7]:
def query_length_and_count(url):
    """Extract the query string of ``url`` with its length and parameter count.

    The fragment (everything from the first ``#``) is discarded first, so a
    ``?`` that only occurs inside the fragment is not treated as the start of
    a query.

    Parameters
    ----------
    url : str
        URL to inspect; no scheme check is performed.

    Returns
    -------
    tuple
        ``(query, length, count)`` where ``count`` is the number of
        ``&``-separated pieces (0 for an empty query);
        ``("Query absent", 0, 0)`` when there is no ``?`` before the fragment;
        ``(None, 0, 0)`` after printing an error for a non-string argument.
    """
    try:
        # Everything after the first '#' is fragment, never query.
        without_fragment = url.split('#', 1)[0]

        if '?' not in without_fragment:
            return "Query absent", 0, 0

        query = without_fragment.split('?', 1)[1]

        query_length = len(query)
        query_count = len(query.split('&')) if query else 0

        return query, query_length, query_count

    except Exception as e:
        print(f"Error: {e}")
        return None, 0, 0

In [8]:
def fragment_length(url):
    """Return ``(fragment, len(fragment))`` for the text after the first '#'.

    Yields ``("Fragment absent", 0)`` when ``url`` contains no '#', and
    ``(None, 0)`` (after printing the error) when ``url`` cannot be searched
    or split.
    """
    try:
        if '#' in url:
            # Only the first '#' delimits; later ones stay inside the fragment.
            _, tail = url.split('#', 1)
            return tail, len(tail)
        return "Fragment absent", 0
    except Exception as err:
        print(f"Error: {err}")
        return None, 0

In [41]:
def shannon_entropy(text):
    """Shannon entropy, in bits, of the item distribution of ``text``.

    Falsy input (empty string, ``None``) gives 0.0. Any exception raised
    while counting is printed and also mapped to 0.0.
    """
    try:
        if not text:
            return 0.0

        histogram = Counter(text)
        total = len(text)

        bits = 0.0
        for occurrences in histogram.values():
            prob = occurrences / total
            bits = bits - prob * math.log2(prob)
        return bits

    except Exception as err:
        print(f"Error: {err}")
        return 0.0

In [42]:
def count_dots(url):
    """Count the '.' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_dots = url.count('.')
    except Exception as err:
        print(f"Error: {err}")
        n_dots = 0
    return n_dots

In [43]:
def count_at_symbols(url):
    """Count the '@' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_ats = url.count('@')
    except Exception as err:
        print(f"Error: {err}")
        n_ats = 0
    return n_ats

In [44]:
def count_equals(url):
    """Count the '=' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_equals = url.count('=')
    except Exception as err:
        print(f"Error: {err}")
        n_equals = 0
    return n_equals

In [45]:
def count_special_chars(url):
    """Count characters of ``url`` outside ASCII letters, digits and ``. / @ = : % -``.

    If the regex search raises (e.g. ``url`` is not a string), the error is
    printed and 0 is returned.
    """
    try:
        unusual = re.findall(r'[^a-zA-Z0-9./@=:%\-]', url)
        return len(unusual)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [46]:
def count_slashes(url):
    """Count the '/' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_slashes = url.count('/')
    except Exception as err:
        print(f"Error: {err}")
        n_slashes = 0
    return n_slashes

In [47]:
def count_hyphens(url):
    """Count the '-' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_hyphens = url.count('-')
    except Exception as err:
        print(f"Error: {err}")
        n_hyphens = 0
    return n_hyphens

In [48]:
def count_digits(url):
    """Count the characters of ``url`` for which ``str.isdigit()`` is true.

    If iteration or the digit test raises, the error is printed and 0 is
    returned.
    """
    try:
        return sum(1 for ch in url if ch.isdigit())
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [49]:
def count_colons(url):
    """Count the ':' characters in ``url``; prints the error and gives 0 if counting fails."""
    try:
        n_colons = url.count(':')
    except Exception as err:
        print(f"Error: {err}")
        n_colons = 0
    return n_colons

In [50]:
def count_subdomains(domain):
    """Count the dot-separated labels of ``domain`` beyond the last two.

    The final two labels are treated as the registered name plus TLD, so
    ``a.b.example.com`` gives 2 and ``example.com`` gives 0. If ``domain``
    cannot be split, the error is printed and 0 is returned.
    """
    try:
        label_count = len(domain.split('.'))
        # Never negative: one- and two-label names have no subdomains.
        return max(label_count - 2, 0)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [51]:
def is_domain_ip(domain):
    """Tell whether ``domain`` looks like a dotted-quad IPv4 address.

    True only when there are exactly four dot-separated parts, each passing
    ``str.isdigit()`` and converting to an integer in 0..255. Any exception
    along the way is printed and treated as "not an IP".
    """
    try:
        octets = domain.split('.')
        if len(octets) != 4:
            return False
        # isdigit() is evaluated before int() for each part, left to right.
        return all(octet.isdigit() and 0 <= int(octet) <= 255 for octet in octets)
    except Exception as err:
        print(f"Error: {err}")
        return False

In [52]:
def extract_tld(domain):
    """Return the lower-cased last dot-separated label of ``domain``.

    A name without any dot yields the sentinel string
    ``"Invalid URL, TLD absent"``. If ``domain`` cannot be split, the error
    is printed and an empty string is returned.
    """
    try:
        labels = domain.split('.')
        if len(labels) >= 2:
            return labels[-1].lower()
        return "Invalid URL, TLD absent"
    except Exception as err:
        print(f"Error: {err}")
        return ""

In [53]:
def check_tld_and_mtld(domain, tld_df):
    """Check the last two labels of ``domain`` against a table of known TLDs.

    The first column of ``tld_df`` is read as strings, lower-cased and
    stripped to form the lookup set.

    Returns ``(tld_known, mtld_known)``: whether the last label, and the
    second-to-last label, appear in that set. A domain with fewer than two
    labels, or any exception (which is printed), gives ``(False, False)``.
    """
    try:
        first_column = tld_df.iloc[:, 0].astype(str)
        known_tlds = set(first_column.str.lower().str.strip())

        labels = domain.split('.')
        if len(labels) < 2:
            return False, False

        second_last, last = (label.lower() for label in labels[-2:])
        return last in known_tlds, second_last in known_tlds
    except Exception as err:
        print(f"Error: {err}")
        return False, False


In [54]:
def check_hex_encoding(url):
    """Detect percent-encoded bytes (``%`` followed by two hex digits) in ``url``.

    Returns ``(found, count)``. If the regex search raises, the error is
    printed and ``(False, 0)`` is returned.
    """
    try:
        n_escapes = len(re.findall(r'%[0-9a-fA-F]{2}', url))
        return n_escapes > 0, n_escapes
    except Exception as err:
        print(f"Error: {err}")
        return False, 0

In [55]:
def has_base64_encoding(url):
    """Heuristically detect a Base64-encoded printable-ASCII payload in ``url``.

    Every run of at least 8 Base64-alphabet characters (with up to two
    trailing ``=``) is strictly decoded. The function returns True as soon as
    one run decodes to a non-empty byte string made only of bytes 32..126.
    Runs that fail to decode are skipped. An exception from the scan itself
    is printed and gives False.
    """
    try:
        for token in re.findall(r'([A-Za-z0-9+/]{8,}={0,2})', url):
            try:
                payload = base64.b64decode(token, validate=True)
            except Exception:
                # Not valid Base64 (bad padding or alphabet): try the next run.
                continue
            if payload and all(32 <= byte <= 126 for byte in payload):
                return True
        return False
    except Exception as err:
        print(f"Error: {err}")
        return False

In [56]:
def path_depth(path):
    """Count the non-empty '/'-separated segments of ``path``.

    The sentinel text "path absent" (compared case-insensitively) counts as
    depth 0. If ``path`` is not a string, the error is printed and 0 is
    returned.
    """
    try:
        if path.lower() == "path absent":
            return 0
        # Empty pieces come from leading, trailing or doubled slashes.
        return sum(1 for segment in path.split('/') if segment)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [57]:
def is_punycode(domain):
    """Report whether any label of ``domain`` is, or encodes to, a Punycode (ACE) label.

    Each dot-separated label is passed through Python's ``idna`` codec, so a
    Unicode label is converted to its ``xn--`` form and an ASCII label is
    kept as written. The ``xn--`` prefix is then matched case-insensitively,
    so ``XN--...`` labels are detected as well.

    Parameters
    ----------
    domain : str
        Host name to inspect.

    Returns
    -------
    bool
        True if at least one label carries the ``xn--`` prefix. False
        otherwise, including when encoding raises or ``domain`` is not a
        string (the error is printed in that case).
    """
    try:
        # Encode every label before testing, so one label that cannot be
        # encoded makes the whole check fall through to the error path.
        ace_labels = [label.encode('idna').decode('ascii') for label in domain.split('.')]
        # ASCII labels come back with their original case, hence lower().
        return any(label.lower().startswith('xn--') for label in ace_labels)
    except Exception as e:
        print(f"Error: {e}")
        return False

In [58]:
def character_transition_entropy(text):
    """Shannon entropy, in bits, of the adjacent-pair (bigram) distribution of ``text``.

    Input that is falsy or shorter than two items gives 0.0. Any exception
    is printed and also mapped to 0.0.
    """
    try:
        if not text or len(text) < 2:
            return 0.0

        # Overlapping windows of width 2: "abc" -> "ab", "bc".
        pairs = [text[start:start + 2] for start in range(len(text) - 1)]
        n_pairs = len(pairs)

        bits = 0.0
        for occurrences in Counter(pairs).values():
            prob = occurrences / n_pairs
            bits = bits - prob * math.log2(prob)
        return bits

    except Exception as err:
        print(f"Error: {err}")
        return 0.0

In [59]:
# Load the TLD reference table from disk into `tlds`.
# NOTE(review): presumably passed as `tld_df` to check_tld_and_mtld — confirm against callers.
try:
    tlds = pd.read_csv("../datasets/tlds/tlds.csv")
except FileNotFoundError:
    # Only a missing file is handled: report it and leave `tlds` as None.
    # Any other read error propagates.
    print("Error: The file '../datasets/tlds/tlds.csv' was not found.")
    tlds = None