In [1]:
import math
from collections import Counter
import re
import pandas as pd

In [2]:
def is_https(url):
    """Flag whether *url* uses the HTTPS scheme.

    Returns 1 when the string starts with ``https://``, 0 when it starts
    with ``http://``. For any other prefix, or for an input on which the
    prefix test itself fails, an error line is printed and ``None`` is
    returned. The prefix comparison is case-sensitive.
    """
    try:
        # Check the longer scheme first, mirroring an if/elif chain.
        for flag, prefix in ((1, "https://"), (0, "http://")):
            if url.startswith(prefix):
                return flag
        raise ValueError("Invalid URL: must start with http:// or https://")
    except Exception as err:
        print(f"Error: {err}")
        return None

In [3]:
def url_length(url):
    """Return the number of characters in *url*.

    If ``len`` cannot be applied to the input, the exception text is
    printed and ``None`` is returned.
    """
    try:
        size = len(url)
    except Exception as err:
        print(f"{err}")
        return None
    return size

In [5]:
def domain_length(url):
    """Extract the host from *url* and return it together with its length.

    The URL must start with ``http://`` or ``https://`` (case-sensitive).
    The host is the authority component with any ``user:password@`` prefix
    and any ``:port`` suffix removed. The authority ends at the first
    ``/``, ``?`` or ``#``, so URLs without a path (``http://host?q=1``)
    are handled. A bracketed IPv6 literal is returned with its brackets.
    The host's letter case is left untouched.

    Returns:
        ``(domain, len(domain))`` on success, or ``(None, 0)`` after
        printing an error line when the URL is not http(s) or the input
        is not string-like.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # The authority stops at whichever of '/', '?', '#' comes first;
        # cutting at each delimiter in turn leaves exactly that prefix.
        authority = rest
        for delimiter in "/?#":
            authority = authority.split(delimiter, 1)[0]

        # Everything up to the last '@' is userinfo, not part of the host.
        host_port = authority.rpartition('@')[2]

        if host_port.startswith('['):
            # IPv6 literal: the colons inside the brackets belong to the host.
            closing = host_port.find(']')
            domain = host_port[:closing + 1] if closing != -1 else host_port
        else:
            domain = host_port.split(':', 1)[0]

        return domain, len(domain)
    except Exception as e:
        print(f"Error: {e}")
        return None, 0

In [6]:
def path_length(url):
    """Extract the path component of *url* and return it with its length.

    The URL must start with ``http://`` or ``https://`` (case-sensitive).
    The query (``?...``) and fragment (``#...``) are removed before the
    path is located, so a ``/`` that appears only inside them is not
    mistaken for the start of the path.

    Returns:
        * ``("Path absent", 0)`` when there is no ``/`` after the authority.
        * ``("", 0)`` when the path is just a bare ``/``.
        * ``("/<path>", length)`` otherwise.
        * ``(None, 0)`` after printing an error line when the URL is not
          http(s) or the input is not string-like.
    """
    try:
        if url.startswith("https://"):
            rest = url[8:]
        elif url.startswith("http://"):
            rest = url[7:]
        else:
            raise ValueError("Invalid URL: must start with http:// or https://")

        # Drop the fragment, then the query, leaving "authority[/path]".
        hier_part = rest.split('#', 1)[0].split('?', 1)[0]

        _authority, slash, path = hier_part.partition('/')
        if not slash:
            return "Path absent", 0

        # A lone trailing slash is reported as an empty path of length 0.
        clean_path = '/' + path if path else ''
        return clean_path, len(clean_path)

    except Exception as e:
        print(f"Error: {e}")
        return None, 0


In [7]:
def query_length_and_count(url):
    """Extract the query string of *url* with its length and parameter count.

    The fragment (everything from the first ``#``) is removed first, so a
    ``?`` that occurs only inside the fragment is not treated as the start
    of a query.

    Returns:
        * ``("Query absent", 0, 0)`` when there is no ``?`` before the
          fragment.
        * ``(query, len(query), n)`` otherwise, where ``n`` is the number
          of ``&``-separated pieces (0 for an empty query).
        * ``(None, 0, 0)`` after printing an error line when the input is
          not string-like.
    """
    try:
        before_fragment = url.split('#', 1)[0]
        if '?' not in before_fragment:
            return "Query absent", 0, 0

        query = before_fragment.split('?', 1)[1]

        query_length = len(query)
        query_count = len(query.split('&')) if query else 0

        return query, query_length, query_count

    except Exception as e:
        print(f"Error: {e}")
        return None, 0, 0

In [8]:
def fragment_length(url):
    """Return the fragment of *url* (text after the first '#') and its length.

    Gives ``("Fragment absent", 0)`` when the URL contains no '#'. If the
    input cannot be processed, the error is printed and ``(None, 0)`` is
    returned.
    """
    try:
        if '#' in url:
            # Splitting once on a present '#' always yields exactly two parts.
            _, tail = url.split('#', 1)
            return tail, len(tail)
        return "Fragment absent", 0
    except Exception as err:
        print(f"Error: {err}")
        return None, 0

In [41]:
def shannon_entropy(text):
    """Compute the Shannon entropy, in bits, of the symbols in *text*.

    Each distinct symbol's relative frequency ``p`` contributes
    ``-p * log2(p)``. A falsy input (empty string, ``None``) yields 0.0;
    any exception is printed and also yields 0.0.
    """
    try:
        if text:
            symbol_counts = Counter(text)
            total = len(text)

            bits = 0.0
            for occurrences in symbol_counts.values():
                probability = occurrences / total
                bits -= probability * math.log2(probability)
            return bits

        return 0.0
    except Exception as err:
        print(f"Error: {err}")
        return 0.0

In [42]:
def count_dots(url):
    """Return how many '.' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        dot_total = url.count('.')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return dot_total

In [43]:
def count_at_symbols(url):
    """Return how many '@' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        at_total = url.count('@')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return at_total

In [44]:
def count_equals(url):
    """Return how many '=' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        equals_total = url.count('=')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return equals_total

In [10]:
def count_special_chars(url):
    """Count characters in *url* outside the allowed URL character set.

    Allowed characters are ASCII letters, digits and ``. / @ = : % - & ~ ? _``;
    every other character counts as special. If matching fails, the error
    is printed and 0 is returned.
    """
    try:
        hits = re.finditer(r'[^a-zA-Z0-9./@=:%\-&~?_]', url)
        return sum(1 for _ in hits)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [46]:
def count_slashes(url):
    """Return how many '/' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        slash_total = url.count('/')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return slash_total

In [47]:
def count_hyphens(url):
    """Return how many '-' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        hyphen_total = url.count('-')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return hyphen_total

In [48]:
def count_digits(url):
    """Return how many characters of *url* satisfy ``str.isdigit()``.

    If iteration or the digit test fails, the error is printed and 0 is
    returned.
    """
    try:
        digit_chars = [ch for ch in url if ch.isdigit()]
        return len(digit_chars)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [49]:
def count_colons(url):
    """Return how many ':' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        colon_total = url.count(':')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return colon_total

In [1]:
def count_qm(url):
    """Return how many '?' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        question_total = url.count('?')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return question_total

In [3]:
def count_and(url):
    """Return how many '&' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        ampersand_total = url.count('&')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return ampersand_total

In [4]:
def count_underscore(url):
    """Return how many '_' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        underscore_total = url.count('_')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return underscore_total

In [5]:
def count_tilde(url):
    """Return how many '~' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        tilde_total = url.count('~')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return tilde_total

In [6]:
def count_percent(url):
    """Return how many '%' characters occur in *url*.

    If counting fails, the error is printed and 0 is returned.
    """
    try:
        percent_total = url.count('%')
    except Exception as err:
        print(f"Error: {err}")
        return 0
    return percent_total

In [7]:
def count_lowercase_letters(url):
    """Return how many characters of *url* satisfy ``str.islower()``.

    If iteration or the case test fails, the error is printed and 0 is
    returned.
    """
    try:
        lowercase_total = 0
        for ch in url:
            if ch.islower():
                lowercase_total += 1
        return lowercase_total
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [8]:
def count_uppercase_letters(url):
    """Return how many characters of *url* satisfy ``str.isupper()``.

    If iteration or the case test fails, the error is printed and 0 is
    returned.
    """
    try:
        uppercase_total = 0
        for ch in url:
            if ch.isupper():
                uppercase_total += 1
        return uppercase_total
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [9]:
def upper_to_lower_ratio(upper, lower):
    """Return ``upper / lower`` rounded to two decimals.

    When *lower* is zero the division is skipped: the result is 0.0 if
    *upper* is also zero, otherwise *upper* itself rounded to two
    decimals. Any exception is printed together with both arguments and
    0.0 is returned.
    """
    try:
        if lower != 0:
            return round(upper / lower, 2)
        if upper == 0:
            return 0.0
        # No lowercase letters at all: fall back to the raw uppercase count.
        return round(upper, 2)
    except Exception as err:
        print(f"Error: {err} | upper: {upper}, lower: {lower}")
        return 0.0

In [50]:
def count_subdomains(domain):
    """Return the number of dot-separated labels in *domain* beyond the last two.

    A domain with two or fewer labels has 0 subdomains. If splitting
    fails, the error is printed and 0 is returned.
    """
    try:
        labels = domain.split('.')
        # The final two labels are not counted as subdomains.
        return max(len(labels) - 2, 0)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [51]:
def is_domain_ip(domain):
    """Return True when *domain* is a dotted-quad IPv4 address.

    The domain must consist of exactly four dot-separated parts, each a
    non-empty run of ASCII digits ``0-9`` whose integer value is at most
    255. Leading zeros are accepted.

    Only ASCII digits are accepted because ``str.isdigit()`` is also true
    for other Unicode digit characters, which would either be converted by
    ``int()`` and misclassified as an IP, or raise inside ``int()``.

    If the input is not string-like, the error is printed and False is
    returned.
    """
    try:
        octets = domain.split('.')
        if len(octets) != 4:
            return False

        for octet in octets:
            if not octet or not all(ch in "0123456789" for ch in octet):
                return False
            # A digits-only string cannot be negative, so only the upper
            # bound needs checking.
            if int(octet) > 255:
                return False

        return True
    except Exception as e:
        print(f"Error: {e}")
        return False

In [52]:
def extract_tld(domain):
    """Return the lower-cased last dot-separated label of *domain*.

    When the domain contains no dot, the sentinel string
    ``"Invalid URL, TLD absent"`` is returned. If splitting fails, the
    error is printed and an empty string is returned.
    """
    try:
        labels = domain.split('.')
        if len(labels) >= 2:
            return labels[-1].lower()
        return "Invalid URL, TLD absent"
    except Exception as err:
        print(f"Error: {err}")
        return ""

In [53]:
def check_tld_and_mtld(domain, tld_df):
    """Check the last two labels of *domain* against a table of TLDs.

    The known TLDs are taken from the first column of *tld_df*, converted
    to strings, lower-cased and stripped of surrounding whitespace.

    Returns:
        ``(tld_known, mtld_known)`` where the first flag is for the last
        label and the second for the second-to-last label (both compared
        in lower case). ``(False, False)`` is returned when the domain has
        fewer than two labels, or, after printing the error, when any
        step raises.
    """
    try:
        first_column = tld_df.iloc[:, 0].astype(str)
        known_tlds = set(first_column.str.lower().str.strip())

        labels = domain.split('.')
        if len(labels) >= 2:
            second_last = labels[-2].lower()
            last = labels[-1].lower()
            return last in known_tlds, second_last in known_tlds

        return False, False
    except Exception as err:
        print(f"Error: {err}")
        return False, False


In [56]:
def path_depth(path):
    """Return the number of non-empty '/'-separated segments in *path*.

    The sentinel text "path absent" (compared case-insensitively) gives 0.
    If the input cannot be processed, the error is printed and 0 is
    returned.
    """
    try:
        if path.lower() == "path absent":
            return 0
        # Empty pieces come from leading, trailing or doubled slashes.
        return sum(1 for segment in path.split('/') if segment)
    except Exception as err:
        print(f"Error: {err}")
        return 0

In [None]:
def character_transition_entropy(text):
    """Compute the Shannon entropy, in bits, of adjacent character pairs.

    Every overlapping two-character slice of *text* is counted, and each
    distinct pair's relative frequency ``p`` contributes ``-p * log2(p)``.
    Inputs that are falsy or shorter than two characters yield 0.0; any
    exception is printed and also yields 0.0.
    """
    try:
        if not text or len(text) < 2:
            return 0.0

        pair_total = len(text) - 1
        pair_counts = Counter(text[start:start + 2] for start in range(pair_total))

        bits = 0.0
        for occurrences in pair_counts.values():
            probability = occurrences / pair_total
            bits -= probability * math.log2(probability)
        return bits
    except Exception as err:
        print(f"Error: {err}")
        return 0.0

In [59]:
# Load the TLD reference table into the module-level name `tlds`.
# Only a missing file is handled: a message is printed and `tlds` is set to
# None instead of raising. Any other read/parse error propagates.
# NOTE(review): presumably this frame is what check_tld_and_mtld receives as
# `tld_df` (that function reads the first column) — confirm; nothing visible
# in this file uses `tlds` yet.
try:
    tlds = pd.read_csv("../datasets/tlds/tlds.csv")
except FileNotFoundError:
    print("Error: The file '../datasets/tlds/tlds.csv' was not found.")
    tlds = None

In [None]:
# Process the dataset shards urls_1.csv .. urls_61.csv one by one: read the
# shard, (optionally) add feature columns, and write it back to the SAME path.
# Every feature assignment below is currently commented out, so as written the
# loop adds no columns and simply re-saves each shard without its index.
# NOTE(review): the commented lines look like a record of features computed in
# earlier runs — confirm before deleting or re-enabling any of them.
for i in range(1, 62):
    file_name = f"../datasets/training_dataset_with_features/urls_{i}.csv"

    try:
        df = pd.read_csv(file_name)

        # Scheme flag (1 = https, 0 = http, None = other).
        # df["is_https"] = df["url"].apply(is_https)

        # Per-character counts over the full URL string.
        # df["dots"] = df["url"].apply(count_dots)
        # df["at"] = df["url"].apply(count_at_symbols)
        # df["special_chars"] = df["url"].apply(count_special_chars) remaining to be added
        # df["colons"] = df["url"].apply(count_colons)
        # df["equals"] = df["url"].apply(count_equals)
        # df["slashes"] = df["url"].apply(count_slashes)
        # df["hyphens"] = df["url"].apply(count_hyphens)
        # df["digits"] = df["url"].apply(count_digits)
        # df["question_marks"] = df["url"].apply(count_qm)
        # df["and"] = df["url"].apply(count_and)
        # df["tilde"] = df["url"].apply(count_tilde)
        # df["underscore"] = df["url"].apply(count_underscore)
        # df["percent"] = df["url"].apply(count_percent)
        # df["lowercase"] = df["url"].apply(count_lowercase_letters)
        # df["uppercase"] = df["url"].apply(count_uppercase_letters)

        # Depends on the "uppercase" and "lowercase" columns existing in df.
        # df["upper_to_lower_ratio"] = df.apply(
        #     lambda row: upper_to_lower_ratio(row["uppercase"], row["lowercase"]),
        #     axis=1
        # )

        # df["url_length"] = df["url"].apply(url_length)

        # NOTE(review): every entropy line below is applied to df["url"], so if
        # re-enabled as written se_domain/se_path/se_query/se_fragment would all
        # equal se_url, and cte_domain would be computed over the whole URL.
        # Presumably they were meant to use per-component columns — confirm the
        # intended column names before re-enabling.
        # df["se_url"] = df["url"].apply(shannon_entropy)
        # df["se_domain"] = df["url"].apply(shannon_entropy)
        # df["se_path"] = df["url"].apply(shannon_entropy)
        # df["se_query"] = df["url"].apply(shannon_entropy)
        # df["se_fragment"] = df["url"].apply(shannon_entropy)
        # df["cte_domain"] = df["url"].apply(character_transition_entropy)

        # Overwrites the input shard in place (this f-string builds the same
        # path as file_name); index=False keeps the row index out of the CSV.
        df.to_csv(f"../datasets/training_dataset_with_features/urls_{i}.csv", index=False)
        print(f"Processed and saved: urls_{i}.csv")

    except FileNotFoundError:
        # A missing shard is reported and skipped; the loop continues.
        print(f"File not found: {file_name}")
    except Exception as e:
        # Any other failure for this shard is reported and the loop moves on.
        print(f"Error with file {file_name}: {e}")