In [None]:
import os
import time
import random
import math
import re
import pandas as pd
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import Counter
import whois
import tldextract

In [None]:
# Shared public-suffix extractor; its cache lives in the local '.tld_cache' directory.
# NOTE(review): suffix_list_urls=None presumably disables the live suffix-list
# download so the bundled snapshot is used — confirm against the tldextract docs.
extractor = tldextract.TLDExtract(
    cache_dir='.tld_cache',
    suffix_list_urls=None,
)

# Dotted-quad matcher: four groups of 1-3 digits. Octets are NOT range-checked,
# so a host such as "999.1.1.1" also matches.
ip_pattern = re.compile(r'^\d{1,3}(\.\d{1,3}){3}$')

# Registrable domains treated as URL-shortening services.
shorteners_set = {
    "bit.ly",
    "t.co",
    "tinyurl.com",
    "goo.gl",
    "ow.ly",
    "is.gd",
    "buff.ly",
    "cutt.ly",
}

# Substrings whose presence anywhere in the lower-cased URL sets the keyword flag.
keywords_set = {
    "secure",
    "account",
    "update",
    "free",
    "lucky",
    "bonus",
    "click",
    "offer",
    "winner",
    "login",
    "verify",
    "banking",
    "confirm",
    "password",
    "signin",
}

In [None]:
def url_entropy(s: str) -> float:
    """Return the Shannon entropy (in bits per character) of ``s``.

    Args:
        s: Text whose character distribution is measured.

    Returns:
        ``0.0`` for an empty (falsy) input, otherwise
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct character.
    """
    if not s:
        return 0.0

    length = len(s)
    char_counts = Counter(s)
    return -sum(
        (count / length) * math.log2(count / length)
        for count in char_counts.values()
    )

In [None]:
def normalize_date(value):
    """Reduce a WHOIS date field to a single value.

    Args:
        value: ``None``, a scalar, a ``pd.Timestamp``, or a list that may
            itself contain lists (flattened one level only).

    Returns:
        * ``pd.Timestamp`` input -> the equivalent ``datetime`` object;
        * list input -> the first non-``None`` element after flattening one
          level, or ``None`` when no such element exists;
        * anything else (including ``None``) -> returned unchanged.
    """
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()

    if not isinstance(value, list):
        # Covers None as well as plain scalars: hand them back untouched.
        return value

    for entry in value:
        # Treat a nested list as a run of candidates, a scalar as a run of one.
        candidates = entry if isinstance(entry, list) else [entry]
        for candidate in candidates:
            if candidate is not None:
                return candidate
    return None

In [None]:
def _as_naive_utc(value):
    """Coerce a WHOIS date to a timezone-naive ``pd.Timestamp`` in UTC, or ``None`` if missing."""
    if not value:
        return None
    stamp = pd.Timestamp(value)
    if pd.isna(stamp):
        return None
    if stamp.tzinfo is not None:
        # Shift to UTC first, then drop the zone so every operand is comparable.
        stamp = stamp.tz_convert("UTC").tz_localize(None)
    return stamp


def extract_whois_features(features, url):
    """Add WHOIS-derived age features to ``features`` (mutated in place).

    Keys written:
        * ``domain_age``          - whole days since the creation date
        * ``days_to_expire``      - whole days until the expiration date
        * ``registration_length`` - whole days between creation and expiration

    Each value is ``-1`` when the date(s) it needs are missing. If the lookup
    or the date handling raises, a warning is printed and all three are ``-1``.

    WHOIS dates may be timezone-aware or timezone-naive (even within one
    record). Subtracting an aware timestamp from a naive one raises
    ``TypeError``, which would be reported as a WHOIS failure, so every date
    (and "now") is normalised to naive UTC before any arithmetic.

    Args:
        features: Dict that receives the three keys above.
        url: Domain (or URL) passed to ``whois.whois``.

    Returns:
        The same ``features`` dict.
    """
    try:
        w = whois.whois(url)
        created_at = _as_naive_utc(normalize_date(w.creation_date))
        expires_at = _as_naive_utc(normalize_date(w.expiration_date))

        # Naive UTC "now", consistent with the normalised WHOIS dates.
        now = pd.Timestamp.now(tz="UTC").tz_localize(None)

        domain_age = (now - created_at).days if created_at is not None else -1
        days_to_expire = (expires_at - now).days if expires_at is not None else -1
        registration_length = (
            (expires_at - created_at).days
            if created_at is not None and expires_at is not None else -1
        )

        features["domain_age"] = domain_age
        features["days_to_expire"] = days_to_expire
        features["registration_length"] = registration_length

    except Exception as e:
        # Best effort: a failed lookup must not abort the whole extraction.
        print(f"⚠️ WHOIS chyba pri {url}: {e}")
        features["domain_age"] = -1
        features["days_to_expire"] = -1
        features["registration_length"] = -1

    return features

In [None]:
def extract_url_features(url: str, phishing: int = 0) -> dict:
    """Build the feature row for one URL.

    Lexical features come from ``urlparse`` and the module-level ``extractor``;
    WHOIS features are then added by ``extract_whois_features`` for the
    registrable domain.

    Args:
        url: URL to analyse. The host/path/query split is whatever
            ``urlparse`` yields, so a URL without a scheme has an empty
            hostname.
        phishing: Label stored verbatim in the ``"phishing"`` column.

    Returns:
        Dict with the label, the raw URL, length/count features, the binary
        ``is_https`` / ``has_ip`` / ``has_shortener`` / ``has_keyword`` flags,
        the extracted ``domain`` / ``suffix``, ``domain_entropy`` of the
        hostname, and the three WHOIS keys set by ``extract_whois_features``.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    path = parsed.path or ""
    query = parsed.query or ""

    ext = extractor(url)
    url_lower = url.lower()

    # Registrable domain = "<domain>.<suffix>". Always join when a suffix
    # exists: checking whether the domain already "ends with" the suffix
    # wrongly drops it for names like "telecom" + "com".
    full_domain = f"{ext.domain}.{ext.suffix}" if ext.suffix else f"{ext.domain}"

    # The first "//" normally belongs to the scheme; only extra ones are counted.
    double_slashes = url.count("//")

    features = {
        "phishing": phishing,
        "url": url,
        "url_len": len(url),
        "host_len": len(hostname),
        "path_len": len(path),
        "query_len": len(query),
        "is_https": 1 if parsed.scheme == "https" else 0,
        "count_dots": hostname.count("."),
        "count_hyphen": url.count("-"),
        "count_at": url.count("@"),
        "count_qm": url.count("?"),
        "count_eq": url.count("="),
        "count_slash": url.count("/"),
        "count_double_slash": double_slashes - 1 if double_slashes > 1 else 0,
        "count_digits": sum(c.isdigit() for c in url),
        "has_ip": 1 if ip_pattern.match(hostname) else 0,
        "has_shortener": 1 if full_domain in shorteners_set else 0,
        "has_keyword": 1 if any(kw in url_lower for kw in keywords_set) else 0,
        "subdomain_len": len(ext.subdomain),
        "domain": ext.domain,
        "suffix": ext.suffix,
        "domain_entropy": url_entropy(hostname)
    }

    # Adds domain_age / days_to_expire / registration_length in place.
    extract_whois_features(features, full_domain)

    return features

In [None]:
def worker(url: str, phishing: int, retries: int = 9, delay: float = 0.5):
    """Extract features for one URL, retrying while WHOIS data is missing.

    An attempt counts as failed when ``extract_url_features`` raises or when
    the returned ``domain_age`` is ``-1`` and attempts remain. Between
    attempts the thread sleeps ``delay * attempt`` seconds plus up to 0.3 s of
    random jitter.

    Args:
        url: URL to process.
        phishing: Label forwarded to ``extract_url_features``.
        retries: Maximum number of attempts.
        delay: Base back-off in seconds.

    Returns:
        The feature dict (on the final attempt it is returned even if
        ``domain_age`` is still ``-1``), or ``None`` when no attempt returned
        a result.
    """
    for attempt in range(1, retries + 1):
        attempts_left = attempt < retries
        try:
            features = extract_url_features(url, phishing)

            # Retry when WHOIS was unsuccessful and attempts remain
            whois_unresolved = features.get("domain_age", -1) == -1
            if whois_unresolved and attempts_left:
                raise ValueError("WHOIS failed, retrying...")

            if attempt != 1:
                print(f"✅ {url} succeeded on attempt {attempt}")
            return features

        except Exception as e:
            print(f"⚠️ {url} attempt {attempt}/{retries} failed: {e}")
            if attempts_left:
                # Linear back-off with jitter so threads do not retry in lockstep.
                pause = delay * attempt + random.uniform(0, 0.3)
                time.sleep(pause)
                continue
            print(f"❌ {url} failed all {retries} attempts")
    return None

In [None]:
def process_urls(input_file: str, output_file: str, phishing: int = 0, max_workers: int = 10, limit: int = None):
    """Process URLs from a file (txt or csv) in parallel with retry logic.

    Each URL is handed to ``worker`` on a thread pool. Successful feature rows
    are appended to ``output_file`` in batches (every 20 completed URLs and at
    the end). URLs for which ``worker`` returned nothing or raised are counted
    as failed and are never written to the CSV.

    Note: the output file is opened in append mode, so running this again with
    the same ``output_file`` adds rows to whatever is already there.

    Args:
        input_file: ``.csv`` (the ``url`` column, or the first column when
            there is no ``url`` column) or a text file with one URL per line.
        output_file: CSV path that receives the feature rows.
        phishing: Label forwarded to ``worker`` for every URL.
        max_workers: Thread-pool size.
        limit: When truthy, only the first ``limit`` URLs are processed.

    Returns:
        The contents of ``output_file`` as a DataFrame, or an empty DataFrame
        when the file does not exist.
    """
    # Load the URLs
    if input_file.endswith(".csv"):
        df = pd.read_csv(input_file)
        urls = df["url"].astype(str).tolist() if "url" in df.columns else df.iloc[:, 0].astype(str).tolist()
    else:
        with open(input_file, "r", encoding="utf-8", errors="ignore") as f:
            urls = [line.strip() for line in f if line.strip()]

    if limit:
        urls = urls[:limit]

    records = []      # successful rows waiting for the next flush to disk
    failed = []       # URLs that produced no feature row
    succeeded = 0     # cumulative; `records` is emptied after every flush

    print(f"🧩 Načítaných {len(urls)} URL, spúšťam spracovanie s {max_workers} vláknami...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(worker, u, phishing): u for u in urls}
        total = len(futures)

        for i, future in enumerate(as_completed(futures), 1):
            u = futures[future]
            try:
                res = future.result()
            except Exception as e:
                print(f"❌ Výnimka pri {u}: {e}")
                res = None

            if res:
                records.append(res)
                succeeded += 1
            else:
                # Keep failures out of `records`: a bare URL string mixed with
                # feature dicts would corrupt the DataFrame built below.
                failed.append(u)

            if i % 20 == 0 or i == total:
                print(f"Progress {i}/{total} | Success: {succeeded} | Failed: {len(failed)}")
                if records:
                    out_df = pd.DataFrame(records)
                    # Write the header only when the file is being created.
                    out_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))
                    records.clear()

    print(f"✅ Hotovo. Výsledok: {output_file}")
    return pd.read_csv(output_file) if os.path.exists(output_file) else pd.DataFrame()


In [None]:
# Phishing set: extract features for at most the first 10000 URLs in
# phishing.txt, labelled phishing=1, using 90 worker threads.
# process_urls appends to the output CSV, so re-running this cell adds rows
# to an existing df_phishing.csv rather than replacing it.
df = process_urls(
    input_file="../dataset/phishing.txt",
    output_file="../dataset/df_phishing.csv",
    phishing=1,
    max_workers=90,
    limit=10000
)

In [None]:
# Benign set: same pipeline over at most the first 10000 URLs in benign.txt,
# labelled phishing=0, using 90 worker threads.
# Note: this rebinds `df`, so the frame returned for the phishing set is no
# longer reachable under that name; its rows remain in df_phishing.csv.
df = process_urls(
    input_file="../dataset/benign.txt",
    output_file="../dataset/df_benign.csv",
    phishing=0,
    max_workers=90,
    limit=10000
)