In [28]:
import os
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, unquote
import pandas as pd

In [29]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, unquote

# Folder where the per-episode transcript CSVs are written
transcript_folder = "rickmorty_transcripts"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(transcript_folder, exist_ok=True)

# Base URL of the Fandom wiki; links found on category pages are resolved against it
base_url = "https://rickandmorty.fandom.com"

# Function to get transcript links from a category page
def get_transcript_links(category_url, timeout=30):
    """Collect the absolute URLs of the transcript pages listed on a category page.

    Args:
        category_url: URL of the wiki category page to read.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        A list of absolute URLs, one per ``.category-page__member-link`` anchor
        that carries an ``href``. The list is empty when the page could not be
        loaded successfully or contains no such anchors.
    """
    response = requests.get(category_url, timeout=timeout)
    if not response.ok:
        # Report the failure instead of parsing an error page and silently
        # returning no links.
        print(f"Failed to load category page {category_url} (HTTP {response.status_code})")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    transcript_links = []
    for link in soup.select(".category-page__member-link"):
        href = link.get('href')
        if not href:
            # urljoin() with an empty/missing URL returns the bare base URL,
            # which would be scraped as if it were a transcript page.
            continue
        transcript_links.append(urljoin(base_url, href))

    return transcript_links

# Function to scrape the transcript from an episode transcript page
def scrape_transcript(episode_url, timeout=30):
    """Download one transcript page and return its text as a list of lines.

    Text is taken from every <p> element and every <div class="poem"> element
    inside the page's ``mw-parser-output`` container; all other <div>
    elements are skipped.

    Args:
        episode_url: URL of the episode transcript page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of text lines in document order. The list is empty when the
        page could not be loaded successfully, when the main content container
        is missing, or when no text was found.
    """
    response = requests.get(episode_url, timeout=timeout)
    if not response.ok:
        # Do not parse an error page as if it were a transcript.
        print(f"Failed to load transcript page {episode_url} (HTTP {response.status_code})")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the main content section
    content_div = soup.find("div", class_="mw-parser-output")
    if not content_div:
        print(f"Failed to find main content section for {episode_url}")
        return []

    transcript_lines = []
    for element in content_div.find_all(["div", "p"]):
        # Skip unrelated divs (e.g., navigation or ads); only "poem" divs carry dialogue
        if element.name == "div" and "poem" not in element.get("class", []):
            continue

        # find_all() is recursive, so anything nested inside a poem div has
        # already been captured with that div; processing it again would
        # duplicate its lines.
        if element.find_parent("div", class_="poem") is not None:
            continue

        # separator="\n" inserts a line break between every text node, i.e. at
        # <br> tags and at the boundaries of inline tags such as <b> alike.
        text = element.get_text(separator="\n", strip=True)
        if text:
            transcript_lines.extend(text.split("\n"))

    return transcript_lines

# Function to process transcript lines into a structured format
def process_transcript_lines(transcript_lines, season, episode_name):
    """Convert raw transcript lines into row dictionaries.

    A line containing a colon is split at the first colon: the text before it
    becomes the character and the text after it the phrase (both stripped).
    A line without a colon gets ``None`` as character and the stripped line as
    phrase.

    Args:
        transcript_lines: Iterable of text lines.
        season: Value stored under "Season" in every row.
        episode_name: Value stored under "Episode Name" in every row.

    Returns:
        A list of dicts with the keys "ID" (0-based position of the line),
        "Season", "Episode Name", "Character" and "Phrase".
    """
    records = []

    for position, raw_line in enumerate(transcript_lines):
        # partition() cuts at the first colon only; an empty separator means
        # the line has no colon at all.
        speaker, colon, spoken = raw_line.partition(":")
        if colon:
            character, phrase = speaker.strip(), spoken.strip()
        else:
            character, phrase = None, raw_line.strip()

        records.append({
            "ID": position,
            "Season": season,
            "Episode Name": episode_name,
            "Character": character,
            "Phrase": phrase
        })

    return records

# Main function to scrape and save transcripts for each episode
def scrape_and_save_transcripts(category_url, season):
    """Scrape every transcript linked from a category page into per-episode CSVs.

    For each link the transcript is downloaded, turned into rows and written
    to ``<transcript_folder>/<episode name>.csv``. An exception raised while
    handling one link is printed and the loop moves on to the next link.

    Args:
        category_url: URL of the category page listing the transcripts.
        season: Season value recorded in every row of the saved files.
    """
    for link in get_transcript_links(category_url):
        try:
            print(f"Scraping transcript: {link}")
            transcript_lines = scrape_transcript(link)

            # The episode name is the second-to-last URL segment, decoded and
            # with underscores and the "(episode)" marker removed.
            slug = unquote(link.split('/')[-2])
            episode_name = slug.replace('_', ' ').replace('(episode)', '').strip()

            if transcript_lines:
                rows = process_transcript_lines(transcript_lines, season, episode_name)

                # One CSV file per episode
                csv_filename = os.path.join(transcript_folder, f"{episode_name}.csv")
                pd.DataFrame(rows).to_csv(csv_filename, index=False, encoding="utf-8")

                print(f"Saved transcript for {episode_name} to {csv_filename}.")
            else:
                print(f"No transcript lines extracted for {episode_name}")
        except Exception as e:
            print(f"Failed to scrape {link}: {e}")

In [30]:
# Category pages listing the transcripts, one per season, in season order
season_transcript_urls = [
    "https://rickandmorty.fandom.com/wiki/Category:Season_1_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_2_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_3_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_4_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_5_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_6_transcripts",
    "https://rickandmorty.fandom.com/wiki/Category:Season_7_transcripts"
]

# Walk the list in order; the season number is the 1-based position of the URL
season = 0
for url in season_transcript_urls:
    season += 1
    print(f"Processing transcripts for Season {season}...")
    scrape_and_save_transcripts(url, season=season)

Processing transcripts for Season 1...
Scraping transcript: https://rickandmorty.fandom.com/wiki/Anatomy_Park_(episode)/Transcript
Saved transcript for Anatomy Park to rickmorty_transcripts\Anatomy Park.csv.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Close_Rick-Counters_of_the_Rick_Kind/Transcript
Saved transcript for Close Rick-Counters of the Rick Kind to rickmorty_transcripts\Close Rick-Counters of the Rick Kind.csv.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Lawnmower_Dog/Transcript
Saved transcript for Lawnmower Dog to rickmorty_transcripts\Lawnmower Dog.csv.
Scraping transcript: https://rickandmorty.fandom.com/wiki/M._Night_Shaym-Aliens!/Transcript
Saved transcript for M. Night Shaym-Aliens! to rickmorty_transcripts\M. Night Shaym-Aliens!.csv.
Scraping transcript: https://rickandmorty.fandom.com/wiki/Meeseeks_and_Destroy/Transcript
Saved transcript for Meeseeks and Destroy to rickmorty_transcripts\Meeseeks and Destroy.csv.
Scraping transcript: https:

Since this was my first web-scraping project, I ran into some problems: the transcript pages on the Fandom wiki are stored and edited in different ways. Some use `<p>` as a line divider, while others use `<br>` and put everything under the same class. This is problematic for the DataFrame, so some ad hoc fixes are required for the problematic URLs.

In [31]:
import os
import pandas as pd

# Input folder containing the original CSV files
input_folder = "rickmorty_transcripts"  # Replace with your actual folder path
# Output folder to save fixed CSV files
output_folder = "rickmorty_transcripts_fixed"

# Create the output folder; exist_ok=True keeps the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

def fix_parsing(file_path):
    """
    Fixes the parsing issues in the provided CSV file by merging lines
    with an empty Character value into the previous valid Character's Phrase.

    Rows whose Character is missing or blank are treated as continuations:
    their Phrase is appended, separated by a single space, to the Phrase of
    the closest preceding row that has a Character. Continuation rows that
    appear before the first row with a Character are dropped.

    Args:
        file_path: Path of a CSV file with at least "Character" and "Phrase"
            columns.

    Returns:
        A DataFrame holding only the rows that have a Character, with merged
        phrases. It always carries the columns of the input file, even when
        no row is kept.
    """
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Ensure the Phrase column is treated as a string
    df["Phrase"] = df["Phrase"].fillna("").astype(str)

    # Rows that survive the merge, in their original order
    fixed_rows = []

    for _, row in df.iterrows():
        character = row["Character"]
        # str() guards against Character values that were not parsed as text.
        if pd.notna(character) and str(character).strip():
            fixed_rows.append(row)
        elif fixed_rows:
            continuation = row["Phrase"].strip()
            if continuation:
                previous = fixed_rows[-1]
                # Join only non-empty parts so that merging into an empty
                # phrase does not leave a leading space.
                previous["Phrase"] = " ".join(
                    part for part in (previous["Phrase"], continuation) if part
                )

    # Passing the columns explicitly keeps the header when nothing was kept,
    # so the saved file is never a completely empty CSV.
    return pd.DataFrame(fixed_rows, columns=df.columns)

# Run the fixer over every CSV in the input folder and write each result,
# under the same file name, to the output folder
for file_name in os.listdir(input_folder):
    # Anything that is not a CSV file is left alone
    if not file_name.endswith(".csv"):
        continue

    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    try:
        fixed_df = fix_parsing(input_file_path)
        fixed_df.to_csv(output_file_path, index=False, encoding="utf-8")
        print(f"Fixed and saved: {file_name}")
    except Exception as e:
        # A failure on one file is reported and does not stop the others
        print(f"Failed to process {file_name}: {e}")


Fixed and saved: A Rick in King Mortur's Mort.csv
Fixed and saved: A Rickconvenient Mort.csv
Fixed and saved: A Rickle in Time.csv
Fixed and saved: Air Force Wong.csv
Fixed and saved: Analyze Piss.csv
Fixed and saved: Anatomy Park.csv
Fixed and saved: Auto Erotic Assimilation.csv
Fixed and saved: Big Trouble in Little Sanchez.csv
Fixed and saved: Close Rick-Counters of the Rick Kind.csv
Fixed and saved: Fear No Mort.csv
Fixed and saved: Full Meta Jackrick.csv
Fixed and saved: Get Schwifty.csv
Fixed and saved: Gotron Jerrysis Rickvangelion.csv
Fixed and saved: How Poopy Got His Poop Back.csv
Fixed and saved: Lawnmower Dog.csv
Fixed and saved: Look Who's Purging Now.csv
Fixed and saved: M. Night Shaym-Aliens!.csv
Fixed and saved: Meeseeks and Destroy.csv
Fixed and saved: Mort Dinner Rick Andre.csv
Fixed and saved: Morty's Mind Blowers.csv
Fixed and saved: Mortynight Run.csv
Fixed and saved: Mortyplicity.csv
Fixed and saved: Never Ricking Morty.csv
Fixed and saved: Pickle Rick.csv
Fixed a

Now that all the transcripts have been downloaded, it is time to create a unified CSV file for each season, with the episodes in chronological order.

In [42]:
episode_list_file = "RM_episodes_list.xlsx"  # Path to the episode list Excel file
transcripts_folder = "rickmorty_transcripts_fixed"  # Folder containing episode transcript CSVs
output_folder = "season_transcripts"  # Folder for saving the season CSV files

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the Excel file with the episode list; the "Season", "Episode" and
# "Episode Name" columns are read below
episode_list = pd.read_excel(episode_list_file)

# Build one combined CSV per season
for season, season_episodes in episode_list.groupby('Season'):
    season_dfs = []
    skipped_episodes = []  # Episodes left out of this season's file, with the reason

    # Iterate through each episode in the season, in the order of the episode list
    for _, episode_row in season_episodes.iterrows():
        episode_name = episode_row['Episode Name']  # Use episode name to locate CSV files
        episode_number = episode_row['Episode']  # Season-specific episode number

        # The transcript file is expected to be named after the episode
        csv_file_name = f"{episode_name}.csv"
        csv_file_path = os.path.join(transcripts_folder, csv_file_name)

        if not os.path.exists(csv_file_path):
            skipped_episodes.append(f"{episode_name} (no transcript file)")
            continue

        try:
            episode_df = pd.read_csv(csv_file_path)
        except pd.errors.EmptyDataError:
            # A file without any content cannot contribute rows
            skipped_episodes.append(f"{episode_name} (empty transcript file)")
            continue

        # The per-episode ID is dropped if present
        episode_df = episode_df.drop(columns=['ID'], errors='ignore')

        # Add metadata columns
        episode_df['Episode'] = episode_number
        episode_df['Episode Name'] = episode_name
        episode_df['Season'] = season
        season_dfs.append(episode_df)

    # Report what was left out so that file-name mismatches do not go unnoticed
    if skipped_episodes:
        print(f"Season {season}: skipped {len(skipped_episodes)} episode(s): " + "; ".join(skipped_episodes))

    # Combine all episodes into one DataFrame for the season
    if season_dfs:
        season_combined_df = pd.concat(season_dfs, ignore_index=True)

        # Save to CSV file
        output_file_path = os.path.join(output_folder, f"RickAndMortyScripts_S_{season}.csv")
        season_combined_df.to_csv(output_file_path, index=False)
        print(f"Created CSV file for Season {season}: {output_file_path}")

Created CSV file for Season 1: season_transcripts\RickAndMortyScripts_S_1.csv
Created CSV file for Season 2: season_transcripts\RickAndMortyScripts_S_2.csv
Created CSV file for Season 3: season_transcripts\RickAndMortyScripts_S_3.csv
Created CSV file for Season 4: season_transcripts\RickAndMortyScripts_S_4.csv
Created CSV file for Season 5: season_transcripts\RickAndMortyScripts_S_5.csv
Created CSV file for Season 6: season_transcripts\RickAndMortyScripts_S_6.csv
Created CSV file for Season 7: season_transcripts\RickAndMortyScripts_S_7.csv
