<a href="https://colab.research.google.com/github/Tanzaniav0825/Algorithms-of-Data-Science/blob/session-2/Python_code_deliverable_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# === PURPOSE ===
# Bring in libraries needed for:
# - URL parsing (urllib.parse)
# - Regex/string checks
# - Networking for optional reachability checks (http.client)
# - JSON structuring

import re
import math
import json
from urllib.parse import urlparse, parse_qs
import http.client


In [22]:
# === PURPOSE ===
# Keep modularity: small helpers make the main function easier to read and maintain.

# Some whitelisted credible domains (illustrative; expand later)
KNOWN_REPUTABLE = {
    "nytimes.com", "bbc.com", "apnews.com", "reuters.com", "nature.com",
    "npr.org", "theguardian.com", "who.int", "cdc.gov"
}

# URL shorteners (less transparent → lower credibility)
SHORTENERS = {"bit.ly", "tinyurl.com", "t.co", "goo.gl", "ow.ly"}

def domain_from_netloc(netloc: str) -> str:
    """Extract clean domain (drop user/pass, ports)."""
    if "@" in netloc:
        netloc = netloc.split("@", 1)[1]
    if ":" in netloc:
        netloc = netloc.split(":", 1)[0]
    return netloc.lower()

def is_reachable(parsed, timeout=2.0) -> bool:
    """Attempt a HEAD request to see if the URL responds."""
    try:
        conn = None
        if parsed.scheme == "https":
            conn = http.client.HTTPSConnection(parsed.netloc, timeout=timeout)
        elif parsed.scheme == "http":
            conn = http.client.HTTPConnection(parsed.netloc, timeout=timeout)
        else:
            return False
        path = parsed.path or "/"
        if parsed.query:
            path += "?" + parsed.query
        conn.request("HEAD", path, headers={"User-Agent": "CredibilityProto/0.1"})
        resp = conn.getresponse()
        return 200 <= resp.status < 400
    except Exception:
        return False


In [23]:
# === PURPOSE ===
# Main entry point: takes a URL string, applies rules/heuristics,
# and returns JSON with score + explanation.

def evaluate_url(url: str, network_check: bool = True) -> dict:
    explanation = []
    score = 0.0

    # --- Step 1: Validate & parse ---
    if not isinstance(url, str) or not url.strip():
        return {"score": 0.0, "explanation": "Invalid input: not a URL string."}
    if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url):
        url = "https://" + url  # assume https if missing scheme
    parsed = urlparse(url)
    if not parsed.netloc:
        return {"score": 0.0, "explanation": "Invalid URL: missing domain."}

    domain = domain_from_netloc(parsed.netloc)
    explanation.append(f"domain={domain}")

    # --- Step 2: Heuristics ---
    # HTTPS bonus
    if parsed.scheme == "https":
        score += 0.6
        explanation.append("+ uses HTTPS (+0.6)")
    else:
        score += 0.2
        explanation.append("+ non-HTTPS but valid scheme (+0.2)")

    # Known reputable domains
    if any(domain.endswith(rep) for rep in KNOWN_REPUTABLE):
        score += 0.8
        explanation.append("+ recognized reputable domain (+0.8)")

    # Shorteners
    if any(domain == s or domain.endswith("." + s) for s in SHORTENERS):
        score -= 0.8
        explanation.append("- URL shortener detected (-0.8)")

    # Suspicious patterns: too many digits/hyphens
    if sum(c.isdigit() for c in domain) > 5:
        score -= 0.4
        explanation.append("- too many digits in domain (-0.4)")
    if domain.count("-") > 3:
        score -= 0.4
        explanation.append("- too many hyphens in domain (-0.4)")

    # --- Step 3: Optional network check ---
    if network_check:
        if is_reachable(parsed):
            score += 0.3
            explanation.append("+ URL reachable (+0.3)")
        else:
            score -= 0.2
            explanation.append("- URL not reachable (-0.2)")

    # --- Step 4: Normalize to 0–1 range ---
    # use a sigmoid squashing for interpretability
    final_score = 1 / (1 + math.exp(-score))
    return {
        "score": round(final_score, 3),
        "explanation": " | ".join(explanation)
    }


In [24]:
# === PURPOSE ===
# Demonstrate it works on good, bad, and malformed inputs.
# Validate consistent JSON output.

test_urls = [
    "https://www.nature.com/articles",
    "bbc.com/news",  # missing scheme
    "http://example.com",
    "bit.ly/xyz123",
    "http:/broken-url",
    "",
    None
]

for u in test_urls:
    result = evaluate_url(u, network_check=False)
    print(f"{u} -> {result}")


https://www.nature.com/articles -> {'score': 0.802, 'explanation': 'domain=www.nature.com | + uses HTTPS (+0.6) | + recognized reputable domain (+0.8)'}
bbc.com/news -> {'score': 0.802, 'explanation': 'domain=bbc.com | + uses HTTPS (+0.6) | + recognized reputable domain (+0.8)'}
http://example.com -> {'score': 0.55, 'explanation': 'domain=example.com | + non-HTTPS but valid scheme (+0.2)'}
bit.ly/xyz123 -> {'score': 0.45, 'explanation': 'domain=bit.ly | + uses HTTPS (+0.6) | - URL shortener detected (-0.8)'}
http:/broken-url -> {'score': 0.646, 'explanation': 'domain=http | + uses HTTPS (+0.6)'}
 -> {'score': 0.0, 'explanation': 'Invalid input: not a URL string.'}
None -> {'score': 0.0, 'explanation': 'Invalid input: not a URL string.'}


In [25]:
# === PURPOSE ===
# Show how this could integrate with a chatbot or RAG pipeline:
# Each retrieved reference is annotated with a credibility score.

references = [
    "https://www.nytimes.com",
    "https://random-blog1234.net/article",
    "https://bit.ly/shortlink"
]

scored_refs = [evaluate_url(ref, network_check=False) for ref in references]
print(json.dumps(scored_refs, indent=2))


[
  {
    "score": 0.802,
    "explanation": "domain=www.nytimes.com | + uses HTTPS (+0.6) | + recognized reputable domain (+0.8)"
  },
  {
    "score": 0.646,
    "explanation": "domain=random-blog1234.net | + uses HTTPS (+0.6)"
  },
  {
    "score": 0.45,
    "explanation": "domain=bit.ly | + uses HTTPS (+0.6) | - URL shortener detected (-0.8)"
  }
]
