The idea is to download the transcripts for all the episodes and see which ones contain a link to a character. For those that do not, we have to review them one by one, work out which character is meant, and add the edge manually.

In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pickle

# Root of the Rick and Morty Fandom wiki; the other URLs are built from it
base_url = "https://rickandmorty.fandom.com"
category_url = base_url + "/wiki/Category:Season_1_transcripts"

# Fandom API endpoint (api.php on the same host)
fandom_api_url = base_url + "/api.php"

# Local file used to cache the collected episode titles between runs
pickle_file = "all_transcript_titles.pkl"

# Folder that receives the transcript text files; created here if missing
save_dir = "rick_and_morty_transcripts_season_1"
os.makedirs(save_dir, exist_ok=True)

# Function to scrape transcript titles and URLs from one category page
def get_transcript_titles(page_url):
    """Scrape a single category page for transcript links.

    The collection loop in this file unpacks the result as
    ``transcript_titles, next_url``, so this function returns a pair rather
    than a bare list.

    Args:
        page_url: URL of the category page to download.

    Returns:
        A ``(transcript_titles, next_url)`` tuple.  ``transcript_titles`` is a
        list of ``{"name": ..., "url": ...}`` dicts, one per anchor whose
        ``href`` contains ``/Transcript``; ``name`` is the stripped link text
        and ``url`` is the link resolved against ``base_url``.  ``next_url``
        is the absolute URL of the following category page, or ``None`` when
        the page has no "next" link, which ends the caller's loop.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status stops an error page from being parsed (and later
    # cached) as "zero transcripts found".
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The attribute selector only matches anchors that have an href, but the
    # value is still checked so an odd match can never produce a bad entry.
    transcript_titles = [
        {"name": link.text.strip(), "url": urljoin(base_url, link.get('href'))}
        for link in soup.select('a[href*="/Transcript"]')
        if link.get('href')
    ]

    # NOTE(review): Fandom category pages are expected to mark the next-page
    # button with this class - confirm against the live markup.
    next_link = soup.select_one('a.category-page__pagination-next[href]')
    next_url = urljoin(base_url, next_link['href']) if next_link else None

    return transcript_titles, next_url

# Function to fetch transcript page content using the Fandom API
def fetch_transcript_content(title):
    """Return the text of the latest revision of a wiki page.

    Sends an ``action=query`` / ``prop=revisions`` request to
    ``fandom_api_url`` asking for the content of the ``main`` slot.

    Args:
        title: Page title to look up, e.g. ``"Pilot/Transcript"``.

    Returns:
        The content string of the first returned page's first revision, or
        ``None`` when the request fails, the status is not 200, the body is
        not JSON, or the page has no revisions.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvslots": "main",
        "rvprop": "content",
        "format": "json",
        "formatversion": "2"
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(fandom_api_url, params=params, timeout=30)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Connection problems, timeouts and non-JSON bodies are reported the
        # same way as a missing page, so one bad episode does not abort the
        # whole download loop.
        return None

    pages = data.get("query", {}).get("pages", [])
    if pages and "revisions" in pages[0]:
        return pages[0]["revisions"][0]["slots"]["main"]["content"]
    return None

# Function to turn a display name into its underscore-separated form
def format_name(name):
    """Return *name* with every space character replaced by an underscore."""
    pieces = name.split(" ")
    return "_".join(pieces)

# Function to strip characters that are not wanted in a filename
def sanitize_filename(filename):
    """Return *filename* with each of ``< > : " / \\ | ? *`` removed."""
    forbidden = '<>:"/\\|?*'
    return "".join(ch for ch in filename if ch not in forbidden)

# Function to write text to disk as a UTF-8 file
def save_content(content, filename):
    """Write *content* to *filename*, replacing any existing file.

    The file is opened in text mode with UTF-8 encoding.
    """
    out = open(filename, "w", encoding="utf-8")
    with out:
        out.write(content)

# Step 1: Only scrape when there is no cached title list on disk
if not os.path.exists(pickle_file):
    # Step 2: Walk the category pages, following next_url until it is falsy
    all_transcript_titles = []
    next_url = category_url

    # NOTE(review): the unpacking below needs get_transcript_titles to hand
    # back a (titles, next_url) pair - confirm the helper's return value.
    while next_url:
        transcript_titles, next_url = get_transcript_titles(next_url)
        all_transcript_titles += transcript_titles
        print(f"Collected {len(transcript_titles)} transcripts from the current page.")
        print("Next URL:", next_url)  # Debug print to check the next URL

    # Drop duplicates: dicts are not hashable, so each one goes through a
    # tuple of its items, into a set, and back to a dict. The resulting
    # order is whatever the set yields.
    all_transcript_titles = list(
        map(dict, {tuple(entry.items()) for entry in all_transcript_titles})
    )
    print(f"Total unique transcripts found: {len(all_transcript_titles)}")

    # Cache the result so later runs can skip the scraping
    with open(pickle_file, "wb") as f:
        pickle.dump(all_transcript_titles, f)
    print(f"Transcript titles saved to {pickle_file}")
else:
    # Cached list found: load it instead of hitting the wiki again
    with open(pickle_file, "rb") as f:
        all_transcript_titles = pickle.load(f)
    print(f"Loaded {len(all_transcript_titles)} transcript titles from pickle file.")

# Step 3: Download each transcript and write it to its own text file
for transcript in all_transcript_titles:
    formatted_name = format_name(transcript["name"])
    sanitized_name = sanitize_filename(formatted_name)
    print(f"Fetching page for: {formatted_name}")
    content = fetch_transcript_content(formatted_name)

    # Nothing usable came back: report it and move on to the next episode
    if not content:
        print(f"Transcript not found or failed for: {formatted_name}.")
        continue

    # The sanitized name is used for the file, the formatted one for the API
    filename = os.path.join(save_dir, sanitized_name + ".txt")
    save_content(content, filename)
    print(f"Saved: {filename}")


Loaded 11 transcript titles from pickle file.
Fetching page for: Anatomy_Park_(episode)/Transcript
Saved: rick_and_morty_transcripts_season_11\Anatomy_Park_(episode)Transcript.txt
Fetching page for: Close_Rick-Counters_of_the_Rick_Kind/Transcript
Saved: rick_and_morty_transcripts_season_11\Close_Rick-Counters_of_the_Rick_KindTranscript.txt
Fetching page for: Lawnmower_Dog/Transcript
Saved: rick_and_morty_transcripts_season_11\Lawnmower_DogTranscript.txt
Fetching page for: M._Night_Shaym-Aliens!/Transcript
Saved: rick_and_morty_transcripts_season_11\M._Night_Shaym-Aliens!Transcript.txt
Fetching page for: Meeseeks_and_Destroy/Transcript
Saved: rick_and_morty_transcripts_season_11\Meeseeks_and_DestroyTranscript.txt
Fetching page for: Pilot/Transcript
Saved: rick_and_morty_transcripts_season_11\PilotTranscript.txt
Fetching page for: Raising_Gazorpazorp/Transcript
Saved: rick_and_morty_transcripts_season_11\Raising_GazorpazorpTranscript.txt
Fetching page for: Rick_Potion_No._9/Transcript
Sa