In [7]:
import math
from collections import Counter
import re
import pandas as pd

<h1>All the extracted features and the functions used to compute them are given below. There are a total of 38 features, excluding url and label.</h1> 

<h1>Features:</h1>

| Feature                      | Description                                                                |
|------------------------------|----------------------------------------------------------------------------|
| is_https                     | Indicates whether the URL uses HTTPS (1) or HTTP (0).                      |
| url_length                   | Total number of characters in the URL.                                     |
| domain_length                | Number of characters in the domain portion.                                |
| path_length                  | Number of characters in the path of the URL.                               |
| path_depth                   | Count of segments in the URL path.                      |
| query_length                 | Number of characters in the query string.                                  |
| query_count                  | Number of key-value pairs in the query string.                             |
| fragment_length              | Number of characters in the fragment (after `#`).                          |
| se_url                       | Shannon entropy of the full URL (measures randomness).                     |
| se_domain                    | Shannon entropy of the domain name.                                        |
| se_path                      | Shannon entropy of the path portion.                                       |
| se_query                     | Shannon entropy of the query string.                                       |
| se_fragment                  | Shannon entropy of the fragment string.                                    |
| cte_domain                   | Character transition entropy in the domain.                                |
| dots                         | Number of dot (`.`) characters in the URL.                                 |
| equals                       | Number of equal sign (`=`) characters in the URL.                          |
| slashes                      | Number of slash (`/`) characters in the URL.                               |
| at                           | Number of at (`@`) characters in the URL.                                  |
| special_chars                | Count of characters that are neither ASCII alphanumeric nor one of the individually counted symbols (`. / @ = : % - & ~ ? _`). |
| hyphens                      | Number of hyphen (`-`) characters.                                         |
| digits                       | Count of numeric characters (0–9) in the URL.                              |
| colons                       | Number of colon (`:`) characters.                                          |
| question_marks               | Number of question mark (`?`) characters.                                  |
| and                          | Number of ampersand (`&`) characters.                                      |
| underscore                   | Count of underscore (`_`) characters.                                      |
| tilde                        | Count of tilde (`~`) characters.                                           |
| percent                      | Number of percent (`%`) characters.                                        |
| lowercase                    | Count of lowercase alphabetic characters (a–z).                            |
| uppercase                    | Count of uppercase alphabetic characters (A–Z).                            |
| upper_to_lower_ratio         | Ratio of uppercase to lowercase letters.                                   |
| subdomains                   | Number of subdomains (dot-separated parts before main domain).             | 
| tld                          | Top-level domain (like com, org, xyz, etc.).                               |
| is_domain_ip                 | Indicates if domain is an IP address (1) or not (0).                       |
| is_tld_iana_reg              | Whether the TLD is in IANA's official registry. <br>                       |
|                              | (Refer to ../scripts/datasets_fetch/tlds_fetch.py to get list of TLDs)     |
| is_mtld                      | Whether the mTLD (second-level TLD) is valid, e.g. .co.uk, .gov.in         |
| digit_to_length_ratio        | Ratio of digits to total URL length.                                       |
| char_to_length_ratio         | Ratio of alphabetic characters to URL length.                              |
| specialchar_to_length_ratio  | Ratio of special characters to URL length.                                 |

<br><br>

In [8]:
def is_https(url):
    """Report the scheme of ``url`` as a binary flag.

    Returns:
        1 when ``url`` starts with ``https://``, 0 when it starts with
        ``http://``. Any other input (including values without a
        ``startswith`` method) is reported on stdout and yields ``None``.
    """
    try:
        # Check the longer prefix first, then the plain-HTTP one.
        for prefix, flag in (("https://", 1), ("http://", 0)):
            if url.startswith(prefix):
                return flag
        raise ValueError("Invalid URL: must start with http:// or https://")
    except Exception as err:
        print(f"Error: {err}")
        return None

In [9]:
def url_length(url):
    """Return the number of characters in ``url``.

    Values that have no length (e.g. ``None``) are reported on stdout and
    produce ``None``.
    """
    try:
        size = len(url)
    except Exception as err:
        print(f"{err}")
        return None
    return size

In [10]:
def domain_length(url):
    """Extract the host portion of ``url`` and return it with its length.

    The scheme (``http://`` or ``https://``) is removed, the authority is cut
    at the first ``/``, ``?`` or ``#``, any ``user:password@`` credentials are
    dropped, and a trailing ``:port`` is removed.

    Args:
        url: URL string starting with ``http://`` or ``https://``.

    Returns:
        ``(domain, len(domain))``. On any error (unsupported scheme,
        non-string input) the error is printed and ``(None, 0)`` is returned.

    Note:
        Bracketed IPv6 literals are not handled specially: the host is cut at
        the first ``:``.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # The authority ends at the first '/', '?' or '#', whichever comes
        # first; cutting only at '/' leaks the query/fragment into the domain
        # for URLs such as "http://example.com?x=1".
        authority = rest
        for delimiter in "/?#":
            authority = authority.split(delimiter, 1)[0]

        # Drop "user:password@" credentials so that
        # "http://user:pw@evil.com/" reports "evil.com" rather than "user".
        host_and_port = authority.rsplit('@', 1)[-1]

        domain = host_and_port.split(':')[0]
        return domain, len(domain)
    except Exception as e:
        print(f"Error: {e}")
        return None, 0

In [11]:
def path_length(url):
    """Extract the path component of ``url`` and return it with its length.

    The fragment (after ``#``) and the query (after ``?``) are removed before
    the path is located, so a ``/`` that only appears inside the query or the
    fragment (e.g. ``http://a.com#/route``) is not mistaken for a path.

    Args:
        url: URL string starting with ``http://`` or ``https://``.

    Returns:
        ``(path, len(path))`` where ``path`` starts with ``/``. A missing or
        blank path is returned as ``("/", 0)``. On any error the error is
        printed and ``(None, 0)`` is returned.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # Strip the fragment first, then the query; what remains is
        # "<authority>[/<path>]".
        hierarchical = rest.split('#', 1)[0].split('?', 1)[0]

        slash_at = hierarchical.find('/')
        if slash_at == -1:
            return "/", 0

        path = hierarchical[slash_at + 1:]
        if path.strip() == "":
            return "/", 0

        clean_path = '/' + path
        return clean_path, len(clean_path)

    except Exception as e:
        print(f"Error: {e} | URL: {url}")
        return None, 0

In [12]:
def path_depth(path):
    """Count the non-empty ``/``-separated segments of ``path``.

    The root path ``"/"`` has depth 0. Inputs that cannot be split (e.g.
    ``None``) are reported on stdout and also give 0.
    """
    try:
        depth = 0
        if path != "/":
            # Empty pieces produced by leading, trailing or doubled slashes
            # are not counted.
            depth = sum(1 for segment in path.split('/') if segment)
        return depth
    except Exception as e:
        print(f"Error: {e} | Path: {path}")
        return 0

In [13]:
def query_length_and_count(url):
    """Extract the query string of ``url`` with its length and pair count.

    The fragment (everything from the first ``#``) is removed before looking
    for ``?``, so a ``?`` that only occurs inside the fragment
    (e.g. ``http://a.com/#/page?x=1``) is not reported as a query.

    Args:
        url: URL string.

    Returns:
        ``(query, query_length, query_count)``. ``query_count`` is the number
        of non-empty ``&``-separated segments, so stray separators such as a
        trailing ``&`` do not inflate it. When there is no query (or it is
        blank) the placeholder ``("?", 0, 0)`` is returned. On any error the
        error is printed and ``(None, 0, 0)`` is returned.
    """
    try:
        before_fragment = url.split('#', 1)[0]

        if '?' not in before_fragment:
            return "?", 0, 0

        query = before_fragment.split('?', 1)[1]

        if query.strip() == "":
            return "?", 0, 0

        # Ignore empty segments produced by "&&" or a leading/trailing "&".
        pairs = [pair for pair in query.split('&') if pair]

        return query, len(query), len(pairs)

    except Exception as e:
        print(f"Error: {e} | URL: {url}")
        return None, 0, 0

In [14]:
def fragment_length(url):
    """Return the fragment of ``url`` (text after the first ``#``) and its length.

    When there is no ``#`` or the fragment is blank, the placeholder
    ``("#", 0)`` is returned. Errors are printed and give ``(None, 0)``.
    """
    try:
        if '#' in url:
            # Only the first '#' separates the fragment; later ones belong to it.
            _, fragment = url.split('#', 1)
            if fragment.strip() != "":
                return fragment, len(fragment)
        return "#", 0

    except Exception as e:
        print(f"Error: {e} | URL: {url}")
        return None, 0

In [15]:
def shannon_entropy(text):
    """Compute the Shannon entropy (in bits) of the characters in ``text``.

    Falsy input (empty string, ``None``) gives 0.0. Inputs that cannot be
    counted are reported on stdout and also give 0.0.
    """
    try:
        if not text:
            return 0.0

        total = len(text)
        entropy = 0.0
        # Accumulate -p * log2(p) over the relative frequency of each symbol.
        for occurrences in Counter(text).values():
            probability = occurrences / total
            entropy = entropy - probability * math.log2(probability)

        return entropy

    except Exception as e:
        print(f"Error: {e}")
        return 0.0

In [16]:
def count_dots(url):
    """Return how many ``.`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('.')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [17]:
def count_at_symbols(url):
    """Return how many ``@`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('@')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [18]:
def count_equals(url):
    """Return how many ``=`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('=')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [19]:
def count_special_chars(url):
    """Count characters of ``url`` outside the allowed set.

    A character is counted when it is neither an ASCII letter or digit nor
    one of ``. / @ = : % - & ~ ? _``. Errors are printed and give 0.
    """
    try:
        matches = re.findall(r'[^a-zA-Z0-9./@=:%\-&~?_]', url)
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return len(matches)

In [20]:
def count_slashes(url):
    """Return how many ``/`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('/')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [21]:
def count_hyphens(url):
    """Return how many ``-`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('-')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [22]:
def count_digits(url):
    """Count the ASCII digits (0-9) in ``url``.

    Only the characters ``0`` through ``9`` are counted. ``str.isdigit`` is
    deliberately not used because it also accepts Unicode digits such as
    superscripts or Arabic-Indic numerals, which the feature definition
    (0-9) excludes.

    Returns:
        The digit count, or 0 (after printing the error) when ``url`` cannot
        be iterated as text.
    """
    try:
        return sum(1 for ch in url if '0' <= ch <= '9')
    except Exception as e:
        print(f"Error: {e}")
        return 0

In [23]:
def count_colons(url):
    """Return how many ``:`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count(':')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [24]:
def count_qm(url):
    """Return how many ``?`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('?')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [25]:
def count_and(url):
    """Return how many ``&`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('&')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [26]:
def count_underscore(url):
    """Return how many ``_`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('_')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [27]:
def count_tilde(url):
    """Return how many ``~`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('~')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [28]:
def count_percent(url):
    """Return how many ``%`` characters ``url`` contains (0 on error)."""
    try:
        total = url.count('%')
    except Exception as err:
        print(f"Error: {err}")
        total = 0
    return total

In [29]:
def count_lowercase_letters(url):
    """Count the ASCII lowercase letters (a-z) in ``url``.

    ``str.islower`` is deliberately not used: it also accepts non-ASCII
    letters, which ``count_special_chars`` already counts as special
    characters, so such characters would be counted twice.

    Returns:
        The number of characters in ``a``-``z``, or 0 (after printing the
        error) when ``url`` cannot be iterated as text.
    """
    try:
        return sum(1 for ch in url if 'a' <= ch <= 'z')
    except Exception as e:
        print(f"Error: {e}")
        return 0

In [30]:
def count_uppercase_letters(url):
    """Count the ASCII uppercase letters (A-Z) in ``url``.

    ``str.isupper`` is deliberately not used: it also accepts non-ASCII
    letters, which ``count_special_chars`` already counts as special
    characters, so such characters would be counted twice.

    Returns:
        The number of characters in ``A``-``Z``, or 0 (after printing the
        error) when ``url`` cannot be iterated as text.
    """
    try:
        return sum(1 for ch in url if 'A' <= ch <= 'Z')
    except Exception as e:
        print(f"Error: {e}")
        return 0

In [31]:
def upper_to_lower_ratio(upper, lower):
    """Return ``upper / lower`` rounded to two decimals.

    When ``lower`` is zero the division is skipped: the result is 0.0 if
    ``upper`` is also zero, otherwise ``upper`` itself (rounded to two
    decimals). Errors are printed and give 0.0.
    """
    try:
        if lower != 0:
            return round(upper / lower, 2)
        # No lowercase letters: fall back to the raw uppercase count.
        if upper == 0:
            return 0.0
        return round(upper, 2)
    except Exception as e:
        print(f"Error: {e} | upper: {upper}, lower: {lower}")
        return 0.0

In [32]:
def is_domain_ip(domain):
    """Return 1 if ``domain`` looks like a dotted-quad IPv4 address, else 0.

    The value is converted with ``str`` and must consist of exactly four
    dot-separated all-digit parts, each in the range 0-255. Errors are
    printed and give 0.
    """
    try:
        octets = str(domain).split('.')
        looks_like_ipv4 = len(octets) == 4 and all(
            octet.isdigit() and 0 <= int(octet) <= 255 for octet in octets
        )
        return 1 if looks_like_ipv4 else 0
    except Exception as e:
        print(f"Error: {e}")
        return 0

In [33]:
def count_subdomains(domain, is_domain_ip):
    """Count the labels of ``domain`` beyond its last two.

    Args:
        domain: Host name string.
        is_domain_ip: 1 when ``domain`` is an IP address; such hosts always
            report 0 subdomains.

    Returns:
        Number of dot-separated labels minus two, never below 0. Errors are
        printed and give 0.
    """
    try:
        if is_domain_ip == 1:
            return 0
        label_count = len(domain.strip().split('.'))
        # The last two labels are treated as the registered domain and TLD.
        return max(label_count - 2, 0)
    except Exception as e:
        print(f"Error: {e} | Domain: {domain}")
        return 0

In [34]:
def extract_tld(domain, is_ip):
    """Return the lower-cased last dot-separated label of ``domain``.

    Returns:
        ``"Absent"`` when ``is_ip`` is 1, ``"Invalid"`` when ``domain``
        contains no dot, otherwise the final label in lower case. Errors are
        printed and give ``""``.
    """
    try:
        if is_ip == 1:
            return "Absent"

        labels = domain.split('.')
        return labels[-1].lower() if len(labels) >= 2 else "Invalid"
    except Exception as e:
        print(f"Error: {e}")
        return ""

In [35]:
def check_tld_and_mtld(domain, tld_set):
    """Check the last two labels of ``domain`` for membership in ``tld_set``.

    Args:
        domain: Host name string; it is stripped and lower-cased first.
        tld_set: Container of known TLD strings supporting ``in``.

    Returns:
        ``(is_tld, is_mtld)`` as 0/1 integers, where ``is_tld`` tests the
        last label and ``is_mtld`` tests the second-to-last label. A domain
        with fewer than two labels gives ``(0, 0)``; errors are printed and
        also give ``(0, 0)``.
    """
    try:
        labels = domain.strip().lower().split('.')

        if len(labels) < 2:
            return 0, 0

        second_level, top_level = labels[-2:]
        return int(top_level in tld_set), int(second_level in tld_set)

    except Exception as e:
        print(f"Error: {e}")
        return 0, 0

In [36]:
def character_transition_entropy(text):
    """Compute the Shannon entropy (in bits) of adjacent character pairs.

    Every overlapping two-character slice of ``text`` is counted and the
    entropy of that distribution is returned. Input that is falsy or shorter
    than two characters gives 0.0; errors are printed and give 0.0.
    """
    try:
        if not text or len(text) < 2:
            return 0.0

        pair_total = len(text) - 1
        pair_counts = Counter(text[pos:pos + 2] for pos in range(pair_total))

        entropy = 0.0
        for occurrences in pair_counts.values():
            share = occurrences / pair_total
            entropy = entropy - share * math.log2(share)

        return entropy

    except Exception as e:
        print(f"Error: {e}")
        return 0.0

In [39]:
def digit_to_length_ratio(digits, url_length):
    """Return ``digits / url_length`` rounded to two decimals.

    A zero ``url_length`` gives 0.0; errors are printed and give 0.0.
    """
    try:
        return 0.0 if url_length == 0 else round(digits / url_length, 2)
    except Exception as e:
        print(f"Error: {e}")
        return 0.0

In [43]:
def char_to_length_ratio(al_count, url_length):
    """Return ``al_count / url_length`` rounded to two decimals.

    A zero ``url_length`` gives 0.0; errors are printed and give 0.0.
    """
    try:
        return 0.0 if url_length == 0 else round(al_count / url_length, 2)
    except Exception as e:
        print(f"Error: {e}")
        return 0.0

In [44]:
def specialchar_to_length_ratio(special_count, url_length):
    """Return ``special_count / url_length`` rounded to two decimals.

    A zero ``url_length`` gives 0.0; errors are printed and give 0.0.
    """
    try:
        return 0.0 if url_length == 0 else round(special_count / url_length, 2)
    except Exception as e:
        print(f"Error: {e}")
        return 0.0

In [37]:
# Load the TLD list used by the TLD features. A missing file is reported and
# leaves `tlds` as None instead of stopping the notebook.
try:
    tlds = pd.read_csv("../datasets/tlds/tlds.csv")
except FileNotFoundError:
    tlds = None
    print("Error: The file '../datasets/tlds/tlds.csv' was not found.")

# ---- <i>NOTE</i> ----

<h2>The code below is used to add structural features to the CSV files.<br>
If your files are stored in a different directory, make sure to update the file_name variable accordingly.</h2>

<h2>Avoid enabling all feature additions in a single run, as this can put excessive load on your CPU and significantly slow down the process.<br>
It's strongly recommended to add one feature at a time.</h2>

In [45]:
# Feature-extraction driver: for each of urls_1.csv ... urls_61.csv, read the
# file, compute whichever feature columns are un-commented below, and write
# the frame back to the SAME path (the input file is overwritten).
#
# With every feature line commented out, as here, each file is simply read
# and re-saved unchanged.
#
# Ordering matters when enabling lines: several features read columns that
# earlier lines create (e.g. "path_depth" needs "path", the entropy features
# need "domain"/"path"/"query"/"fragment", the ratio features need the count
# columns and "url_length", and the subdomain adjustment needs "is_mtld").
for i in range(1, 62):
    file_name = f"../datasets/dataset_with_features/urls_{i}.csv"
    
    try:
        df = pd.read_csv(file_name)

        # --- scheme flag ---
        # df["is_https"] = df["url"].apply(is_https)
        
        # --- per-character counts over the full URL ---
        # df["dots"] = df["url"].apply(count_dots)
        # df["at"] = df["url"].apply(count_at_symbols)
        # df["special_chars"] = df["url"].apply(count_special_chars) 
        # df["colons"] = df["url"].apply(count_colons)
        # df["equals"] = df["url"].apply(count_equals)
        # df["slashes"] = df["url"].apply(count_slashes)
        # df["hyphens"] = df["url"].apply(count_hyphens)
        # df["digits"] = df["url"].apply(count_digits)
        # df["question_marks"] = df["url"].apply(count_qm)
        # df["and"] = df["url"].apply(count_and)
        # df["tilde"] = df["url"].apply(count_tilde)
        # df["underscore"] = df["url"].apply(count_underscore)
        # df["percent"] = df["url"].apply(count_percent)
        # df["lowercase"] = df["url"].apply(count_lowercase_letters)
        # df["uppercase"] = df["url"].apply(count_uppercase_letters)

        # Requires the "uppercase" and "lowercase" columns.
        # df["upper_to_lower_ratio"] = df.apply(
        #     lambda row: upper_to_lower_ratio(row["uppercase"], row["lowercase"]),
        #     axis=1
        # )

        # --- lengths and URL components ---
        # df["url_length"] = df["url"].apply(url_length)

        # df[["domain", "domain_length"]] = df["url"].apply(
        #     lambda x: pd.Series(domain_length(x))
        # )

        # df[["path", "path_length"]] = df["url"].apply(
        #     lambda x: pd.Series(path_length(x))
        # )
        
        # Requires the "path" column.
        # df["path_depth"] = df["path"].apply(path_depth)

        # df[["query", "query_length", "query_count"]] = df["url"].apply(
        #     lambda x: pd.Series(query_length_and_count(x))
        # )

        # df[["fragment", "fragment_length"]] = df["url"].apply(
        #     lambda x: pd.Series(fragment_length(x))
        # )
        
        # --- entropy features; require "domain", "path", "query", "fragment" ---
        # df["se_url"] = df["url"].apply(shannon_entropy)
        # df["se_domain"] = df["domain"].apply(shannon_entropy)
        # df["se_path"] = df["path"].apply(shannon_entropy)
        # df["se_query"] = df["query"].apply(shannon_entropy)
        # df["se_fragment"] = df["fragment"].apply(shannon_entropy)
        # df["cte_domain"] = df["domain"].apply(character_transition_entropy)

        # --- domain-based features; require "domain" ---
        # df["is_domain_ip"] = df["domain"].apply(is_domain_ip)

        # df["tld"] = df.apply(lambda row: extract_tld(row["domain"], row["is_domain_ip"]), axis=1)

        # NOTE(review): `tld_set` is not defined in any visible cell of this
        # notebook; presumably it was built from the `tlds` frame in a cell
        # that has since been removed. Define it before enabling this step.
        # df[["is_tld_iana_reg", "is_mtld"]] = df["domain"].apply(
        #     lambda d: pd.Series(check_tld_and_mtld(d, tld_set))
        # )

        # df["subdomains"] = df.apply(
        #     lambda row: count_subdomains(row["domain"], row["is_domain_ip"]),
        #     axis=1
        # )
        # Second pass: one label fewer counts as a subdomain when "is_mtld" is 1.
        # df["subdomains"] = df.apply(
        #     lambda row: max(0, row["subdomains"] - 1) if row["is_mtld"] == 1 else row["subdomains"],
        #     axis=1
        # )

        # --- ratio features; require the count columns and "url_length" ---
        # df["digit_to_length_ratio"] = df.apply(
        #     lambda row: digit_to_length_ratio(row["digits"], row["url_length"]),
        #     axis=1
        # )

        # df["char_to_length_ratio"] = df.apply(
        #     lambda row: char_to_length_ratio(
        #         row["lowercase"] + row["uppercase"], row["url_length"]
        #     ), axis=1
        # )

        # df["specialchar_to_length_ratio"] = df.apply(
        #     lambda row: specialchar_to_length_ratio(row["special_chars"], row["url_length"]),
        #     axis=1
        # )

        # Overwrite the input file with the (possibly extended) frame.
        df.to_csv(f"../datasets/dataset_with_features/urls_{i}.csv", index=False)
        print(f"Processed and saved: urls_{i}.csv")
        
    except FileNotFoundError:
        # A missing chunk is reported and the loop moves on to the next file.
        print(f"File not found: {file_name}")
    except Exception as e:
        # Any other failure (including one raised by a feature step) is
        # reported per file; that file is not re-saved.
        print(f"Error with file {file_name}: {e}")

Processed and saved: urls_1.csv
Processed and saved: urls_2.csv
Processed and saved: urls_3.csv
Processed and saved: urls_4.csv
Processed and saved: urls_5.csv
Processed and saved: urls_6.csv
Processed and saved: urls_7.csv
Processed and saved: urls_8.csv
Processed and saved: urls_9.csv
Processed and saved: urls_10.csv
Processed and saved: urls_11.csv
Processed and saved: urls_12.csv
Processed and saved: urls_13.csv
Processed and saved: urls_14.csv
Processed and saved: urls_15.csv
Processed and saved: urls_16.csv
Processed and saved: urls_17.csv
Processed and saved: urls_18.csv
Processed and saved: urls_19.csv
Processed and saved: urls_20.csv
Processed and saved: urls_21.csv
Processed and saved: urls_22.csv
Processed and saved: urls_23.csv
Processed and saved: urls_24.csv
Processed and saved: urls_25.csv
Processed and saved: urls_26.csv
Processed and saved: urls_27.csv
Processed and saved: urls_28.csv
Processed and saved: urls_29.csv
Processed and saved: urls_30.csv
Processed and saved

# ---- <i>OPTIONAL</i> ----

<h3>Dropping the string columns: domain, query, path, fragment.<br>
This may help reduce file size.</h3>

In [None]:
# Optional clean-up pass over urls_1.csv ... urls_61.csv: read each file,
# drop the helper string columns if the line below is un-commented, and write
# the frame back to the same path.
for i in range(1, 62):
    file_name = f"../datasets/dataset_with_features/urls_{i}.csv"

    try:
        df = pd.read_csv(file_name)

        # df.drop(columns = ["domain", "query", "path", "fragment"], inplace = True)

        # The output path is the same file that was just read.
        df.to_csv(file_name, index=False)
        print(f"Processed and saved: urls_{i}.csv")

    except FileNotFoundError:
        print(f"File not found: {file_name}")
    except Exception as e:
        print(f"Error with file {file_name}: {e}")