In [2]:
# Bollywood Retry Script: Descriptions & Posters 
# 25th July 2025

import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# === File and folder setup ===
#NOTE: PLEASE CHANGE FILE PATHS ACCORDINGLY 
# PLEASE ENSURE movies_random_sample.csv is in Desktop/ra_app
# REST AUTOMATIC CARRY FORWARD FROM primary_data_download.ipynb

csv_path = os.path.expanduser("~/Desktop/ra_app/movies_random_sample.csv")
base_folder = os.path.expanduser("~/Desktop/ra_app/data")
desc_folder = os.path.join(base_folder, "description")
poster_folder = os.path.join(base_folder, "posters")
log_file = os.path.join(base_folder, "log_retry_from_dryrun.txt")
summary_file = os.path.join(base_folder, "missing_files_report.csv")

# Ensure folders exist
os.makedirs(desc_folder, exist_ok=True)
os.makedirs(poster_folder, exist_ok=True)

# Logging function
def log(msg):
    print(msg)
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# Load data
df = pd.read_csv(csv_path)
imdb_ids = df["imdb_id"].tolist()

# Get existing files
existing_desc = set(f.replace(".txt", "") for f in os.listdir(desc_folder) if f.endswith(".txt"))
existing_poster = set(f.replace(".jpg", "").replace(".png", "") for f in os.listdir(poster_folder) if f.endswith((".jpg", ".png")))

# Identify and retry missing files
missing_data = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Checking & Downloading Missing"):
    imdb_id = row["imdb_id"]
    wiki_link = row["wiki_link"]
    poster_url = row["poster_path"]

    # === Check what’s missing ===
    missing_desc = imdb_id not in existing_desc
    missing_post = imdb_id not in existing_poster

    # === Log & track missing ===
    if missing_desc or missing_post:
        missing_data.append({
            "imdb_id": imdb_id,
            "missing_description": missing_desc,
            "missing_poster": missing_post
        })
        log(f"{imdb_id} → description: {'❌' if missing_desc else '✅'}, poster: {'❌' if missing_post else '✅'}")

    # === Download Description ===
    if missing_desc and pd.notna(wiki_link):
        try:
            resp = requests.get(wiki_link, timeout=10)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, "html.parser")
                content = soup.select("div.mw-parser-output > p")
                paragraphs = [p.get_text().strip() for p in content if p.get_text().strip()]
                text = "\n".join(paragraphs[:2])
                if text:
                    desc_path = os.path.join(desc_folder, f"{imdb_id}.txt")
                    with open(desc_path, "w", encoding="utf-8") as f:
                        f.write(text)
                    log("✅ Description downloaded from Wikipedia.")
                else:
                    log("⚠️ Wikipedia has no valid paragraph content.")
            else:
                log(f"❌ Failed to fetch Wikipedia page (status {resp.status_code}).")
        except Exception as e:
            log(f"❌ Error fetching Wikipedia: {e}")

    # === Download Poster ===
    if missing_post and pd.notna(poster_url):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0",
                "Referer": "https://en.wikipedia.org/"
            }
            resp = requests.get(poster_url, headers=headers, timeout=10)
            if resp.status_code == 200:
                poster_path = os.path.join(poster_folder, f"{imdb_id}.jpg")
                with open(poster_path, "wb") as f:
                    f.write(resp.content)
                log("✅ Poster downloaded from Wikimedia.")
            else:
                log(f"❌ Failed to download poster (status {resp.status_code}).")
        except Exception as e:
            log(f"❌ Error downloading poster: {e}")

# === Save report of what was missing ===
if missing_data:
    pd.DataFrame(missing_data).to_csv(summary_file, index=False)
    log(f"\n📄 Missing files report saved to {summary_file}")
else:
    log("\n✅ No missing files found. All data is complete.")

log("\n🎉 Script finished.")

Checking & Downloading Missing: 100%|██████| 100/100 [00:00<00:00, 30865.44it/s]

tt2378057 → description: ✅, poster: ❌

📄 Missing files report saved to /Users/rishabhbijani/Desktop/ra_app/data/missing_files_report.csv

🎉 Script finished.



