<a href="https://colab.research.google.com/github/Tanzaniav0825/Algorithms-of-Data-Science/blob/session-2/Python_code_deliverable_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Libraries for URL parsing, networking, and general utilities.
import re
import math
import json
import time
import socket
from typing import List, Dict, Any, Optional
from urllib.parse import urlparse, parse_qs
import http.client


In [None]:
# [Purpose] Centralize helpers. This version tightens URL validation so malformed inputs stay invalid.

# Publisher domains that earn the "recognized reputable domain" bonus.
GOOD_NEWS_DOMAINS = {
    "apnews.com",
    "bbc.com",
    "financialtimes.com",
    "nature.com",
    "npr.org",
    "nytimes.com",
    "reuters.com",
    "theguardian.com",
    "washingtonpost.com",
    "wsj.com",
}

# Link-shortening services; these hide the real destination and are penalized.
URL_SHORTENERS = {
    "bit.ly",
    "buff.ly",
    "goo.gl",
    "is.gd",
    "lnkd.in",
    "ow.ly",
    "rb.gy",
    "rebrand.ly",
    "t.co",
    "tinyurl.com",
}

# Query-string keys commonly added by marketing/analytics tooling.
TRACKING_PARAMS = {
    "fbclid",
    "gclid",
    "utm_campaign",
    "utm_content",
    "utm_medium",
    "utm_source",
    "utm_term",
}

# A well-formed scheme prefix: a letter, then letters/digits/+/-/., then "://".
_SCHEME_RE = re.compile(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://')
# A scheme-like word followed directly by "//", i.e. the colon was dropped
# (for example "http//example.com").
_MALFORMED_SCHEME_RE = re.compile(r'^[a-zA-Z]+//')

def _plausible_hostname(host: str) -> bool:
    """
    Cheap syntactic sanity check for a hostname (no DNS lookup).

    Anything up to the first '@' and anything from the first ':' onward is
    discarded before checking. The remaining name is accepted when:
      - it has at least two dot-separated labels
      - every label is 1–63 chars drawn from a–z, 0–9 and '-',
        and neither begins nor ends with '-'
      - the last label (TLD) is purely alphabetic and 2–24 chars long
    """
    name = host.lower()

    # Discard a leading "userinfo@" section, if any.
    _, at_sign, remainder = name.partition("@")
    if at_sign:
        name = remainder
    # Discard a trailing ":port" section, if any.
    name = name.partition(":")[0]

    parts = name.split(".")
    if len(parts) < 2:
        return False

    def _valid_label(label: str) -> bool:
        if len(label) < 1 or len(label) > 63:
            return False
        if label.startswith("-") or label.endswith("-"):
            return False
        return re.fullmatch(r'[a-z0-9-]+', label) is not None

    if not all(_valid_label(part) for part in parts):
        return False

    suffix = parts[-1]
    return 2 <= len(suffix) <= 24 and suffix.isalpha()

def safe_parse(url: str):
    """
    Parse a URL defensively.

    Args:
        url: Candidate URL. Surrounding whitespace is ignored. When no
            "scheme://" prefix is present, "https://" is assumed.

    Returns:
        A ``(parsed, error_msg)`` pair. On success ``parsed`` is the
        ``urllib.parse.ParseResult`` and ``error_msg`` is None; on failure
        ``parsed`` is None and ``error_msg`` is a human-readable reason.
        This function reports problems through the return value rather
        than by raising.
    """
    if not isinstance(url, str):
        return None, "Invalid URL: Input is not a string."
    url = url.strip()
    if not url:
        return None, "Invalid URL: Empty string."

    # Explicitly reject 'http//...' style (missing colon)
    if _MALFORMED_SCHEME_RE.match(url):
        return None, "Invalid URL: Malformed scheme (missing ':')."

    # If scheme is missing, try assuming https://
    if not _SCHEME_RE.match(url):
        url = "https://" + url

    try:
        parsed = urlparse(url)
    except ValueError as e:
        # urlparse signals malformed input (e.g. a broken IPv6 literal)
        # with ValueError.
        return None, f"URL parse error: {e}"

    if not parsed.netloc:
        return None, "Invalid URL: Missing network location (domain)."

    # Let urllib split userinfo/host/port instead of re-implementing it here;
    # `hostname` is already lower-cased and is None when no host is present.
    host = parsed.hostname or ""
    if not _plausible_hostname(host):
        return None, "Invalid URL: Hostname not plausible."

    # Accessing `port` raises ValueError for a non-numeric or out-of-range
    # port, which would otherwise slip through as a "valid" URL.
    try:
        parsed.port
    except ValueError:
        return None, "Invalid URL: Port is not a valid number."

    return parsed, None

def domain_from_netloc(netloc: str) -> str:
    """
    Return the lower-cased host portion of a netloc.

    Strips any "userinfo@" prefix and any ":port" suffix. The full host is
    returned (subdomains such as "www." are kept); no public-suffix logic
    is applied. A bracketed IPv6 literal is returned with its brackets.

    Args:
        netloc: The network-location component of a URL,
            e.g. "user:pw@Example.com:8080".

    Returns:
        The host in lower case, e.g. "example.com".
    """
    # The host follows the LAST '@'; splitting on the first one would leave
    # part of the userinfo attached when the userinfo itself contains '@'.
    host = netloc.lower().rpartition("@")[2]

    if host.startswith("["):
        # IPv6 literal: the colons inside the brackets are not port separators.
        closing = host.find("]")
        return host[: closing + 1] if closing != -1 else host

    return host.partition(":")[0]

def is_reachable_via_head(parsed, timeout=2.0) -> Optional[int]:
    """
    Issue a single HEAD request to gauge reachability.

    Args:
        parsed: A parsed URL exposing ``scheme``, ``netloc``, ``path`` and
            ``query`` (such as the result of ``urllib.parse.urlparse``).
        timeout: Timeout in seconds passed to the HTTP(S) connection.

    Returns:
        The HTTP status code of the response, or None when the scheme is
        neither "http" nor "https" or when the request fails for any reason.
        This is a best-effort probe: it never raises.
    """
    conn = None
    try:
        if parsed.scheme == "https":
            connection_cls = http.client.HTTPSConnection
        elif parsed.scheme == "http":
            connection_cls = http.client.HTTPConnection
        else:
            return None

        # http.client expects "host[:port]"; credentials embedded in the
        # netloc would otherwise be treated as part of the host name.
        host = parsed.netloc.rpartition("@")[2]

        target = parsed.path or "/"
        if parsed.query:
            target += "?" + parsed.query

        conn = connection_cls(host, timeout=timeout)
        conn.request("HEAD", target, headers={"User-Agent": "CredibilityProto/0.1"})
        return conn.getresponse().status
    except Exception:
        # Deliberately broad: any failure simply means "could not verify".
        return None
    finally:
        # Release the socket on both the success and the failure path.
        if conn is not None:
            conn.close()

def sigmoid(x: float) -> float:
    """
    Logistic function: squash a real number into the range [0, 1].

    Uses the numerically stable formulation so that very negative inputs
    do not overflow ``math.exp``; extreme inputs saturate to 0.0 or 1.0.

    Args:
        x: Any real number.

    Returns:
        1 / (1 + e**-x) as a float.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0, exp(-x) can overflow; exp(x) is safely in (0, 1).
    z = math.exp(x)
    return z / (1 + z)


In [16]:
# [Purpose] Simple, explainable heuristics; consistent JSON output.

def evaluate_url(url: str, network_check: bool = True, timeout: float = 2.0) -> Dict[str, Any]:
    """
    Score a URL's credibility with simple, explainable heuristics.

    Each heuristic adds to or subtracts from a running linear score; the
    total is squashed with ``sigmoid`` and rounded to four decimals.

    Args:
        url: The URL to evaluate. A missing scheme is treated as https.
        network_check: When True, perform one HEAD request and factor the
            HTTP status into the score.
        timeout: Timeout in seconds for the optional HEAD request.

    Returns:
        A dict with exactly two keys: ``"score"`` (float in [0, 1]) and
        ``"explanation"`` (the applied rules joined by " | "). A URL that
        fails ``safe_parse`` gets score 0.0 and the parse error as its
        explanation.
    """
    parsed, parse_err = safe_parse(url)
    explanation_lines = []
    linear_score = 0.0  # accumulate, then squash to (0,1) via sigmoid

    if parse_err:
        explanation_lines.append(parse_err)
        return {"score": 0.0, "explanation": " | ".join(explanation_lines)}

    scheme = parsed.scheme.lower()
    netloc = domain_from_netloc(parsed.netloc)
    query = parsed.query or ""
    tld = netloc.split(".")[-1] if "." in netloc else ""

    explanation_lines.append(f"scheme={scheme}, domain={netloc}")

    # 1) Scheme preference
    if scheme == "https":
        linear_score += 0.6
        explanation_lines.append("+ HTTPS detected (+0.6)")
    elif scheme == "http":
        linear_score += 0.2
        explanation_lines.append("+ HTTP detected (+0.2)")
    else:
        linear_score -= 0.2
        explanation_lines.append(f"- Non-HTTP(S) scheme '{scheme}' (-0.2)")

    # 2) Domain category
    if tld in {"edu", "gov"}:
        linear_score += 1.0
        explanation_lines.append("+ .edu/.gov top-level domain (+1.0)")

    # Match the listed domain itself or a true subdomain of it. A bare
    # suffix test would also reward look-alikes such as "notbbc.com".
    if any(netloc == d or netloc.endswith("." + d) for d in GOOD_NEWS_DOMAINS):
        linear_score += 0.8
        explanation_lines.append("+ recognized reputable domain (+0.8)")

    if any(netloc == s or netloc.endswith("." + s) for s in URL_SHORTENERS):
        linear_score -= 0.8
        explanation_lines.append("- URL shortener detected (-0.8)")

    # 3) Suspicious domain patterns
    hyphens = netloc.count("-")
    digits = sum(c.isdigit() for c in netloc)
    if "xn--" in netloc:
        linear_score -= 0.6
        explanation_lines.append("- punycode domain (-0.6)")
    if hyphens >= 3:
        linear_score -= 0.5
        explanation_lines.append(f"- many hyphens in domain ({hyphens}) (-0.5)")
    if digits >= 5:
        linear_score -= 0.4
        explanation_lines.append(f"- many digits in domain ({digits}) (-0.4)")

    # 4) URL complexity
    url_str = parsed.geturl()
    if len(url_str) > 200:
        linear_score -= 0.4
        explanation_lines.append("- very long URL (>200 chars) (-0.4)")

    q = parse_qs(query)
    qp_count = len(q)  # number of distinct parameter names
    if qp_count >= 5:
        linear_score -= 0.2
        explanation_lines.append(f"- many query parameters ({qp_count}) (-0.2)")
    if any(p in q for p in TRACKING_PARAMS):
        linear_score -= 0.1
        explanation_lines.append("- common tracking params present (-0.1)")

    # 5) Optional reachability (HEAD)
    status = None
    if network_check:
        status = is_reachable_via_head(parsed, timeout=timeout)
        if status is None:
            linear_score -= 0.15
            explanation_lines.append("- could not verify reachability (-0.15)")
        else:
            explanation_lines.append(f"HEAD status={status}")
            if 200 <= status < 300:
                linear_score += 0.3
                explanation_lines.append("+ 2xx status (+0.3)")
            elif 300 <= status < 400:
                linear_score += 0.1
                explanation_lines.append("+ 3xx status (+0.1)")
            elif 400 <= status < 500:
                linear_score -= 0.3
                explanation_lines.append("- 4xx status (-0.3)")
            else:
                linear_score -= 0.2
                explanation_lines.append("- 5xx/other status (-0.2)")

    score = sigmoid(linear_score)
    score = max(0.0, min(1.0, score))  # clamp

    return {
        "score": float(round(score, 4)),
        "explanation": " | ".join(explanation_lines),
    }


In [17]:
# Score multiple URLs in one call; handy for tests/benchmarks.

def evaluate_urls(urls: List[str], network_check: bool = True, timeout: float = 2.0) -> List[Dict[str, Any]]:
    """
    Evaluate a batch of URLs, one result dict per input, in input order.

    A failure while scoring one URL does not abort the batch: that entry
    becomes a score of 0.0 with an "Internal error: ..." explanation.
    """
    def _score_one(candidate):
        try:
            return evaluate_url(candidate, network_check=network_check, timeout=timeout)
        except Exception as exc:
            return {"score": 0.0, "explanation": f"Internal error: {exc}"}

    return [_score_one(candidate) for candidate in urls]


In [18]:
# Sanity tests demonstrating consistent JSON, malformed inputs, and different URL types.

# Inputs chosen to exercise each heuristic branch plus the failure paths of
# safe_parse (malformed, empty, and non-string input).
test_urls = [
    "https://www.nytimes.com",
    "http://example.com",
    "bbc.com/news",                        # missing scheme -> we auto-prepend https://
    "https://bit.ly/xyz",
    "https://xn--pt-eka.com",             # punycode-ish pattern
    "https://some---weird---domain12345.net/path?x=1&y=2&z=3&a=4&b=5&utm_source=abc",
    "ftp://data.server.com/file.csv",     # non-http(s)
    "http:/bad",                          # malformed
    "",                                   # empty
    None                                  # non-string
]

print("=== Functional tests (network_check=False for speed/reliability in class) ===")
for u in test_urls:
    # network_check=False keeps the run offline, so only the heuristics are scored.
    result = evaluate_url(u, network_check=False)
    # Enforce required JSON shape
    assert isinstance(result, dict) and "score" in result and "explanation" in result
    assert isinstance(result["score"], float)
    assert isinstance(result["explanation"], str)
    # json.dumps also confirms the result is JSON-serializable.
    print(u, "->", json.dumps(result, ensure_ascii=False))


=== Functional tests (network_check=False for speed/reliability in class) ===
https://www.nytimes.com -> {"score": 0.8022, "explanation": "scheme=https, domain=www.nytimes.com | + HTTPS detected (+0.6) | + recognized reputable domain (+0.8)"}
http://example.com -> {"score": 0.5498, "explanation": "scheme=http, domain=example.com | + HTTP detected (+0.2)"}
bbc.com/news -> {"score": 0.8022, "explanation": "scheme=https, domain=bbc.com | + HTTPS detected (+0.6) | + recognized reputable domain (+0.8)"}
https://bit.ly/xyz -> {"score": 0.4502, "explanation": "scheme=https, domain=bit.ly | + HTTPS detected (+0.6) | - URL shortener detected (-0.8)"}
https://xn--pt-eka.com -> {"score": 0.3775, "explanation": "scheme=https, domain=xn--pt-eka.com | + HTTPS detected (+0.6) | - punycode domain (-0.6) | - many hyphens in domain (3) (-0.5)"}
https://some---weird---domain12345.net/path?x=1&y=2&z=3&a=4&b=5&utm_source=abc -> {"score": 0.3543, "explanation": "scheme=https, domain=some---weird---domain123

In [19]:
# Separate pure-heuristic speed from network overhead.

# Five sample inputs repeated 20 times. None carries a scheme, so safe_parse
# prepends "https://" to each before scoring.
bench_urls = [
    "nytimes.com", "reuters.com", "example.com", "bit.ly/abc",
    "some-long-domain-123456789.com/path?x=1&y=2&z=3&a=4&b=5&utm_source=abc",
] * 20  # 100 URLs total

def bench(fn, label):
    """
    Time a single call of ``fn`` and print the elapsed seconds.

    Args:
        fn: Zero-argument callable to time; its return value is discarded.
        label: Text printed in front of the timing.

    Returns:
        None. The timing is reported via ``print`` as "<label>: <secs>s".
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    fn()
    elapsed = time.perf_counter() - start
    print(f"{label}: {elapsed:.3f}s")

# Offline run: measures the cost of parsing and heuristics only.
bench(lambda: evaluate_urls(bench_urls, network_check=False), "Heuristics-only (100 URLs)")

# Keep the network test small to avoid rate-limits/timeouts in class.
small_network_urls = ["https://www.nytimes.com", "https://www.bbc.com", "http://example.com"]
# Online run: each URL additionally incurs one HEAD request (2-second timeout).
bench(lambda: evaluate_urls(small_network_urls, network_check=True, timeout=2.0), "With HEAD checks (3 URLs)")


Heuristics-only (100 URLs): 0.005s
With HEAD checks (3 URLs): 0.375s


In [20]:
# Demonstrate the required JSON output: score one URL offline
# (network_check=False) and pretty-print the resulting dict.
demo = evaluate_url("https://www.nature.com/articles", network_check=False)
print(json.dumps(demo, ensure_ascii=False, indent=2))


{
  "score": 0.8022,
  "explanation": "scheme=https, domain=www.nature.com | + HTTPS detected (+0.6) | + recognized reputable domain (+0.8)"
}
