In [11]:
import os
import sys

# Handle both notebook and normal script cases
try:
    CURRENT_DIR = os.path.dirname(__file__)
except NameError:
    # __file__ is not defined in notebooks — use the current working directory
    CURRENT_DIR = os.getcwd()

PARENT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)

from DD_FEATURE_EXTRACTOR_09_21_2025 import safe_requests_get

def inspect_url(url: str):
    """Fetch a URL using safe_requests_get and print key response details."""
    if safe_requests_get is None:
        print("safe_requests_get not available; aborting.")
        return
    r = safe_requests_get(url)
    if r is None:
        print("Request failed or returned no response object.")
        return
    print("---------------------------------------------------")
    print("Status:", r.status_code)
    print("Final URL:", r.url)
    print("Content-Type:", r.headers.get("Content-Type"))
    print("Content-Length header:", r.headers.get("Content-Length"))
    body_bytes = r.content or b""
    print("Actual body length (bytes):", len(body_bytes))
    try:
        preview = body_bytes[:2000].decode(r.encoding or "utf-8", errors="replace")
    except Exception:
        preview = "(failed to decode preview)"
    print("---------------------------------------------------")
    print("HTML Preview (first 500 characters):")
    print(preview)
    print("---------------------------------------------------")

if __name__ == "__main__":
    test_url = "https://example.com"
    inspect_url(test_url)


---------------------------------------------------
Status: 200
Final URL: https://example.com/
Content-Type: text/html
Content-Length header: 363
Actual body length (bytes): 513
---------------------------------------------------
HTML Preview (first 500 characters):
<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></body></html>

---------------------------------------------------


In [12]:
import os
import sys
import hashlib

# ----------------------------------------------------------------------
# Handle path for importing DD_FEATURE_EXTRACTOR_09_21_2025.py
# (works inside Jupyter since __file__ is not defined)
# ----------------------------------------------------------------------
CURRENT_DIR = os.getcwd()
PARENT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)

# ----------------------------------------------------------------------
# Import safe_requests_get from your extractor module
# ----------------------------------------------------------------------
try:
    from DD_FEATURE_EXTRACTOR_09_21_2025 import safe_requests_get
except Exception as exc:
    print("Failed to import safe_requests_get from DD_FEATURE_EXTRACTOR_09_21_2025:", exc)
    safe_requests_get = None


# ----------------------------------------------------------------------
# Inspect function with MD5 computation
# ----------------------------------------------------------------------
def inspect_url(url: str):
    """Fetch a URL using safe_requests_get and print response info + MD5 hash."""
    if safe_requests_get is None:
        print("safe_requests_get not available; aborting.")
        return None

    r = safe_requests_get(url)
    if r is None:
        print("Request failed or returned no response object.")
        return None

    print("---------------------------------------------------")
    print("Status:", r.status_code)
    print("Final URL:", r.url)
    print("Content-Type:", r.headers.get("Content-Type"))
    print("Content-Length header:", r.headers.get("Content-Length"))

    body_bytes = r.content or b""
    print("Actual body length (bytes):", len(body_bytes))

    # ---- Compute MD5 hash of HTML content ----
    md5_hash = hashlib.md5(body_bytes).hexdigest()
    print("MD5 Hash of page content:", md5_hash)

    try:
        preview = body_bytes[:500].decode(r.encoding or "utf-8", errors="replace")
    except Exception:
        preview = "(failed to decode preview)"

    print("---------------------------------------------------")
    print("HTML Preview (first 500 characters):")
    print(preview)
    print("---------------------------------------------------")

    return md5_hash


# ----------------------------------------------------------------------
# Example test
# ----------------------------------------------------------------------
test_url = "https://example.com"
md5_value = inspect_url(test_url)
print(f"\n✅ MD5 hash: {md5_value}")


---------------------------------------------------
Status: 200
Final URL: https://example.com/
Content-Type: text/html
Content-Length header: 363
Actual body length (bytes): 513
MD5 Hash of page content: bc2473a18e003bdb249eba5ce893033f
---------------------------------------------------
HTML Preview (first 500 characters):
<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></
---------------------------------------------------

✅ MD5 hash: bc2473a18e003bdb249eba5ce893033f


In [13]:
from urllib.parse import urlparse

def normalize_domain(url_or_domain: str) -> str:
    """Extract and normalize the domain from a URL or domain string."""
    if not url_or_domain:
        return ""

    # Ensure lowercase
    text = url_or_domain.strip().lower()

    # Add scheme if missing (needed for urlparse)
    if not text.startswith(("http://", "https://")):
        text = "http://" + text

    try:
        parsed = urlparse(text)
        host = parsed.netloc or parsed.path  # handle 'example.com' case
        # Remove leading 'www.' if present
        if host.startswith("www."):
            host = host[4:]
        # Strip port numbers, e.g., example.com:8080 → example.com
        if ":" in host:
            host = host.split(":")[0]
        return host
    except Exception:
        return ""


In [14]:
test_urls = [
    "https://www.google.com/search?q=test",
    "http://sub.example.co.uk:8080/path/page.html",
    "example.com",
    "https://EXAMPLE.COM/",
    "ftp://my.site.net",
    "http://localhost:5000/api",
    "",
]

print("Testing domain normalization:\n")
for u in test_urls:
    normalized = normalize_domain(u)
    print(f"{u:45} → {normalized}")


Testing domain normalization:

https://www.google.com/search?q=test          → google.com
http://sub.example.co.uk:8080/path/page.html  → sub.example.co.uk
example.com                                   → example.com
https://EXAMPLE.COM/                          → example.com
ftp://my.site.net                             → ftp
http://localhost:5000/api                     → localhost
                                              → 
