In [None]:
# -*- coding: utf-8 -*-
"""
FAST MODE: MovieLens -> IMDb /reference only -> One TXT per movie (Title_YYYY.txt)

Speed-ups:
  • 1 HTTP request per movie (reference page only)
  • requests.Session() with connection pooling
  • ThreadPoolExecutor concurrency (tune MAX_WORKERS)
  • Resume-safe: skip files that already exist

Outputs:
  • data/imdb_txt_fast/Title_YYYY.txt (movieId, imdb_id, title, year, url, ratingValue, ratingCount, genres, directors, actors, duration, description)
  • data/imdb_txt_fast/index_imdb_txt.csv (movieId, tt, title, year, txt_filename, imdb_url)
"""

import re
import time
import html
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ---------------- Config ----------------
# Input files (MovieLens) and output folder.
LINKS_CSV = "Movielens/links.csv"       # MovieLens links table (movieId, imdbId)
MOVIES_CSV = "Movielens/movies.csv"     # MovieLens titles, used only as a fallback
OUT_DIR = Path("data/imdb_txt_fast")
OUT_DIR.mkdir(parents=True, exist_ok=True)   # created on import/run if missing

# Row selection: "first_n" keeps the first FIRST_N rows, any other value keeps all.
SELECT_BY = "first_n"
FIRST_N = 1000

# Scraping limits.
TOP_ACTORS_PER_MOVIE = 20    # cap on actors saved per movie
MAX_WORKERS = 8              # thread count and connection-pool size (be polite)
REQUEST_TIMEOUT = 20         # passed as `timeout` to each GET request
RETRIES = 3                  # passed as `total` to the urllib3 Retry policy

# Browser-like request headers installed on the shared session.
UA = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/123.0 Safari/537.36",
        )
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# -------------- Helpers -----------------
def imdb_tt_from_numeric(imdb_numeric: int) -> str:
    """Return the IMDb title id ("tt" + digits) for a numeric MovieLens imdbId.

    The number is zero-padded to a *minimum* of seven digits, so 114709
    becomes "tt0114709". Ids that already have seven or more digits are kept
    as they are (2267998 -> "tt2267998"); unconditionally prefixing "tt0"
    would turn those into ids that do not exist.

    Args:
        imdb_numeric: numeric id; anything accepted by int() (e.g. a float
            read from a CSV column) is allowed.

    Returns:
        The id string used in https://www.imdb.com/title/<id>/ URLs.
    """
    return f"tt{int(imdb_numeric):07d}"

def safe_title_for_filename(title: str) -> str:
    """Turn a movie title into a string usable as a file name stem.

    Surrounding whitespace is trimmed, spaces become underscores, runs of the
    characters \\ / : * ? " < > | are replaced by an underscore, and repeated
    underscores are collapsed. An empty or missing title yields "untitled".
    """
    name = title.strip() if title else ""
    name = name.replace(" ", "_")
    # First neutralise forbidden characters, then squeeze underscore runs.
    name = re.sub(r"_+", "_", re.sub(r'[\\/:*?"<>|]+', "_", name))
    if not name:
        return "untitled"
    return name

def human_duration_from_reference(runtime_text: str) -> str:
    """
    Shorten a runtime string taken from the /reference page.

    Recognised (case-insensitive) pieces are '<n> hr' and '<n> min':
      '1 hr 36 min' -> '1h 36m'
      '142 min'     -> '142m'   (minutes are not re-split into hours)
      '2 hr'        -> '2h'
    Text with neither piece is returned unchanged apart from stripping.
    None or an empty string yields ''.
    """
    raw = runtime_text or ""   # tolerate None all the way to the final return
    txt = raw.lower()

    hr_match = re.search(r'(\d+)\s*hr', txt)
    min_match = re.search(r'(\d+)\s*min', txt)
    h = int(hr_match.group(1)) if hr_match else None
    mins = int(min_match.group(1)) if min_match else None

    if h is not None and mins is not None:
        return f"{h}h {mins}m"
    if h is not None:
        return f"{h}h"
    if mins is not None:
        return f"{mins}m"
    return raw.strip()

def build_session():
    """Create the shared requests.Session used by all worker threads.

    The session retries GET requests up to RETRIES times (backoff factor 0.5)
    on HTTP 429/500/502/503/504, sizes its connection pool to MAX_WORKERS for
    both http:// and https://, and sends the headers defined in UA.
    """
    retry_policy = Retry(
        total=RETRIES,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    pooled_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=MAX_WORKERS,
        pool_maxsize=MAX_WORKERS,
    )

    session = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("https://", "http://"):
        session.mount(scheme, pooled_adapter)
    session.headers.update(UA)
    return session

# ----------- Parsing /reference (single-source) -----------
def parse_reference(html_text: str):
    """
    Extract the main fields from the HTML of an IMDb /reference page.

    Args:
        html_text: raw HTML of the page.

    Returns:
        dict with keys:
          name, year, ratingValue, ratingCount -> str, or None when not found
          genres, directors, actors            -> list of str (possibly empty)
          duration, description                -> str (possibly empty)

    Directors, actors and genres are read from the first h2-h5 header whose
    label matches, using only links located between that header and the next
    h2-h5 header.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    header_tags = ["h2", "h3", "h4", "h5"]
    out = {
        "name": None, "year": None, "ratingValue": None, "ratingCount": None,
        "genres": [], "directors": [], "actors": [], "duration": "", "description": ""
    }

    # Title & year from <title>: strip the " - IMDb" suffix, then split off a
    # trailing "(YYYY)" if present.
    ttag = soup.find("title")
    if ttag and ttag.text:
        ttxt = ttag.text.replace(" - IMDb", "").strip()
        out["name"] = re.sub(r"\s*\(\d{4}\)$", "", ttxt).strip()
        m = re.search(r"\((\d{4})\)$", ttxt)
        if m:
            out["year"] = m.group(1)

    # Rating value / count via itemprop attributes.
    rv = soup.find(attrs={"itemprop": "ratingValue"})
    if rv and rv.text.strip():
        out["ratingValue"] = rv.text.strip()
    rc = soup.find(attrs={"itemprop": "ratingCount"})
    if rc and rc.text.strip():
        # keep digits only (drops thousands separators and similar)
        out["ratingCount"] = re.sub(r"[^\d]", "", rc.text)

    def collect_names_after_header(header_substrs, limit=None):
        """Return unique person names linked (/name/nm...) under the first
        header whose lower-cased label contains any of `header_substrs`."""
        names = []
        for hdr in soup.find_all(header_tags):
            label = hdr.get_text(" ", strip=True).lower()
            if not any(sub in label for sub in header_substrs):
                continue
            for a in hdr.find_all_next("a", href=True):
                # Check the section boundary BEFORE collecting: once the
                # nearest preceding header is no longer `hdr`, the link
                # belongs to a later section and must not be picked up.
                if a.find_previous(header_tags) is not hdr:
                    break
                if "/name/nm" not in a["href"]:
                    continue
                nm = a.get_text(strip=True)
                if nm and nm not in names:
                    names.append(nm)
                if limit and len(names) >= limit:
                    return names
            break  # only the first matching header is used
        return names

    # Directors
    out["directors"] = collect_names_after_header(["directed by", "director"], limit=None)

    # Actors (Cast) — limit to TOP_ACTORS_PER_MOVIE
    out["actors"] = collect_names_after_header(["cast"], limit=TOP_ACTORS_PER_MOVIE)

    # Genres: links to /search/title/?genres=... under the first "genre(s)" header.
    genres = []
    for hdr in soup.find_all(header_tags):
        label = hdr.get_text(" ", strip=True).lower()
        if "genres" in label or "genre" in label:
            for a in hdr.find_all_next("a", href=True):
                # Same boundary-first rule as for names.
                if a.find_previous(header_tags) is not hdr:
                    break
                txt = a.get_text(strip=True)
                if txt and len(txt) < 40 and "/search/title/?genres=" in a["href"].lower():
                    if txt not in genres:
                        genres.append(txt)
            break
    out["genres"] = genres

    # Runtime: walk forward from the first "runtime" header (at most 40
    # elements) until some element's text contains "<n> min" or "<n> hr".
    runtime = ""
    for hdr in soup.find_all(header_tags):
        lbl = hdr.get_text(" ", strip=True).lower()
        if "runtime" in lbl:
            nxt = hdr.find_next()
            limit_nodes = 40
            while nxt and limit_nodes > 0:
                txt = nxt.get_text(" ", strip=True) if hasattr(nxt, "get_text") else ""
                if re.search(r"\b\d+\s*min\b", txt) or re.search(r"\b\d+\s*hr", txt):
                    runtime = txt
                    break
                if nxt.name in ("h2", "h3", "h4", "h5"):
                    break
                nxt = nxt.find_next()
                limit_nodes -= 1
            break
    out["duration"] = human_duration_from_reference(runtime)

    # Description / Plot: first non-empty <p> following a "plot", "storyline"
    # or "summary" header; later matching headers are tried if the first one
    # has no usable paragraph.
    desc = ""
    for hdr in soup.find_all(header_tags):
        lbl = hdr.get_text(" ", strip=True).lower()
        if any(k in lbl for k in ["plot", "storyline", "summary"]):
            p = hdr.find_next("p")
            if p and p.get_text(strip=True):
                desc = p.get_text(" ", strip=True)
                break
    out["description"] = html.unescape(desc)

    return out

# -------------- Load data & select set --------------
# Read the MovieLens links table, keep rows that have an IMDb id, and derive
# the "tt" identifier for each of them.
links = (
    pd.read_csv(LINKS_CSV)
    .dropna(subset=["imdbId"])
    .astype({"imdbId": int})
)
links["tt"] = links["imdbId"].apply(imdb_tt_from_numeric)

# MovieLens titles are only a fallback for title/year; if the file cannot be
# loaded, continue with an empty table instead of failing.
try:
    movies = pd.read_csv(MOVIES_CSV)
    movies = movies[["movieId", "title"]]
except Exception:
    movies = pd.DataFrame(columns=["movieId", "title"])

df = pd.merge(links, movies, on="movieId", how="left")
if SELECT_BY == "first_n":
    sel = df.head(FIRST_N)
else:
    sel = df

print(f"Preparing to fetch {len(sel)} movies")
print("Saving to:", OUT_DIR.resolve())

# Names of TXT files already present in OUT_DIR; workers consult and extend
# this set to avoid rewriting a file.
existing = set()
for txt_path in OUT_DIR.glob("*.txt"):
    existing.add(txt_path.name)

# Rows for the index CSV, appended to by the workers.
index_rows = []

# ----------- Worker -----------
def fetch_one(session: requests.Session, row):
    """Fetch one movie's /reference page and write its TXT file.

    Args:
        session: shared HTTP session used for the GET request.
        row: record with `movieId`, `tt` and `title` attributes.

    Returns:
        A 3-tuple (status, tt, info):
          ("OK",   tt, file name)  - file written
          ("SKIP", tt, file name)  - a file with that name is already known
          ("FAIL", tt, message)    - any exception raised while processing

    Side effects: writes into OUT_DIR, adds the file name to `existing`, and
    appends one entry to `index_rows` for OK and SKIP results. The page is
    downloaded before the existence check, because the file name is derived
    from the parsed title and year.
    """
    movieId = int(row.movieId)
    tt = row.tt
    reference_url = f"https://www.imdb.com/title/{tt}/reference"
    try:
        response = session.get(reference_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = parse_reference(response.text)

        # Fill missing title/year from the MovieLens title, e.g. "Name (YYYY)".
        ml_title = row.title if isinstance(row.title, str) else ""
        if ml_title:
            if not data["name"]:
                data["name"] = re.sub(r"\s*\(\d{4}\)\s*$", "", ml_title).strip()
            if not data["year"]:
                year_match = re.search(r"\((\d{4})\)", ml_title)
                if year_match:
                    data["year"] = year_match.group(1)

        title = data["name"] or tt
        year = data["year"] or ""
        stem = safe_title_for_filename(title)
        fname = f"{stem}_{year}.txt" if year else f"{stem}.txt"
        out_path = OUT_DIR / fname

        index_row = {
            "movieId": movieId, "tt": tt, "title": title, "year": year,
            "txt_filename": out_path.name, "imdb_url": f"https://www.imdb.com/title/{tt}/"
        }

        if out_path.name in existing:
            # Written earlier in this run or in a previous one: index it only.
            index_rows.append(index_row)
            return ("SKIP", tt, out_path.name)

        genre_text = ", ".join(data["genres"])
        director_text = ", ".join(data["directors"])
        actor_text = ", ".join(data["actors"][:TOP_ACTORS_PER_MOVIE])
        rating_value = data["ratingValue"] or ""
        rating_count = data["ratingCount"] or ""

        body = "\n".join([
            f"movieId: {movieId}",
            f"imdb_id: {tt}",
            f"title: {title}",
            f"year: {year}",
            f"url: https://www.imdb.com/title/{tt}/",
            f"ratingValue: {rating_value}",
            f"ratingCount: {rating_count}",
            f"genres: {genre_text}",
            f"directors: {director_text}",
            f"actors: {actor_text}",
            f"duration: {data['duration']}",
            "",
            "description:",
            data["description"],
        ])
        out_path.write_text(body, encoding="utf-8")

        # Record the file only after the write succeeded.
        existing.add(out_path.name)
        index_rows.append(index_row)
        return ("OK", tt, out_path.name)
    except Exception as e:
        return ("FAIL", tt, str(e))

# ----------- Run concurrent fetch -----------
# Submit one task per selected movie and report each result as it completes.
futures = []
done = ok = skip = fail = 0

# The session is used as a context manager so its pooled connections are
# closed once all work is finished.
with build_session() as session, ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    for row in sel.itertuples(index=False):
        futures.append(ex.submit(fetch_one, session, row))

    # Consume results INSIDE the executor block: leaving the block waits for
    # every task to finish, so reporting afterwards would print nothing until
    # the whole run was over.
    for fut in as_completed(futures):
        status, tt, info = fut.result()
        done += 1
        if status == "OK":
            ok += 1
            print(f"[{done:>5}/{len(futures)}] OK   {tt} -> {info}")
        elif status == "SKIP":
            skip += 1
            print(f"[{done:>5}/{len(futures)}] SKIP {tt} ({info})")
        else:
            fail += 1
            print(f"[{done:>5}/{len(futures)}] FAIL {tt}: {info}")

# Write index CSV. Explicit columns keep the header present (and in a fixed
# order) even when no row was collected.
pd.DataFrame(
    index_rows,
    columns=["movieId", "tt", "title", "year", "txt_filename", "imdb_url"],
).to_csv(OUT_DIR / "index_imdb_txt.csv", index=False)
print("\nDone.")
print(f"OK: {ok} | SKIP: {skip} | FAIL: {fail}")
print("Folder:", OUT_DIR.resolve())
print("Index:", OUT_DIR / "index_imdb_txt.csv")


Preparing to fetch 1000 movies
Saving to: C:\Users\Notandi\Desktop\Social graph\Lokaverkefni\data\imdb_txt_fast
