In [3]:
import os
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin


In [4]:
# Create a folder to save the transcripts
transcript_folder = "rickmorty_transcripts"
if not os.path.exists(transcript_folder):
    os.makedirs(transcript_folder)

# Base URL of the Fandom wiki
base_url = "https://rickandmorty.fandom.com"

# Function to get transcript links from a category page
def get_transcript_links(category_url):
    transcript_links = []
    page = requests.get(category_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Locate all links to transcript pages
    for link in soup.select(".category-page__member-link"):
        transcript_url = urljoin(base_url, link.get('href'))
        transcript_links.append(transcript_url)
    
    return transcript_links

# Function to scrape the transcript from an episode transcript page
def scrape_transcript(episode_url):
    page = requests.get(episode_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Locate the main content section
    content_div = soup.find("div", class_="mw-parser-output")

    # Initialize variables to track text extraction
    transcript_started = False
    transcript_lines = []

    # Loop through elements in the main content section
    for element in content_div.find_all(['p']):
        if "TRANSCRIPT" in element.text.upper():  # Start extracting after "TRANSCRIPT"
            transcript_started = True
            continue
        if "SITE NAVIGATION" in element.text.upper():  # Stop extracting before "SITE NAVIGATION"
            break
        if transcript_started:
            # Add the full text of the paragraph
            transcript_lines.append(element.get_text(strip=True))

    # Join all transcript lines into a single string
    full_transcript = "\n".join(transcript_lines)
    return full_transcript

# Main function to download transcripts
def download_transcripts(category_url):
    # Get all transcript links from the category page
    transcript_links = get_transcript_links(category_url)

    for link in transcript_links:
        try:
            print(f"Scraping transcript: {link}")
            transcript_text = scrape_transcript(link)

            # Extract the episode name from the URL
            episode_name = os.path.basename(link).split('/')[0].replace('_', ' ')

            # Save the transcript as a text file
            filename = f"{episode_name}.txt"
            filepath = os.path.join(transcript_folder, filename)

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(transcript_text)

            print(f"Saved transcript for {episode_name}.")
            time.sleep(1)  # Respectful delay
        except Exception as e:
            print(f"Failed to scrape {link}: {e}")

# URL for Season 1 transcripts
season_1_transcripts_url = "https://rickandmorty.fandom.com/wiki/Category:Season_1_transcripts"
download_transcripts(season_1_transcripts_url)


Scraping transcript: https://rickandmorty.fandom.com/wiki/Anatomy_Park_(episode)/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Close_Rick-Counters_of_the_Rick_Kind/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Lawnmower_Dog/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/M._Night_Shaym-Aliens!/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Meeseeks_and_Destroy/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Pilot/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Raising_Gazorpazorp/Transcript
Saved transcript for Transcript.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Rick_Potion_No._9/Transcript
Saved transcript for Transcript.
Scraping transcript: h