In [83]:
!pip install beautifulsoup4 requests
!pip install selenium
!pip install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m422.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting websockets (from undetected-chromedriver)
  Downloading websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: undetected-chromedriver
  Building wheel for undetected-chromedriver (setup.py) ... [?25ldone
[?25h  Created wheel for undetected-chromedriver: filename=undetected_chromedriver-3.5.5-py3-none-any.whl size=47048 sha256=4c15c318ec9f196af71f5b9badf6c4a442518bf1f067fcafebfed2bff1473828
  Stored in directory: /Users/tiffanycl

In [4]:
# Extract data directly from fandom wiki

import requests
from bs4 import BeautifulSoup
import os

def _cell_text(cell):
    """Return a table cell's text with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each text node before concatenating them, which
    # fuses words across inline tags ("I'm <i>not</i> sure" -> "I'mnotsure").
    # Taking the raw text and normalising whitespace afterwards keeps word
    # boundaries and also guarantees each dialogue stays on a single line.
    return " ".join(cell.get_text().split())


def _butcher_lines_from_soup(soup):
    """Return the dialogue text of every table row whose speaker cell contains "Butcher"."""
    butcher_dialogues = []
    for row in soup.find_all('tr'):
        # The speaker is taken from the first header or data cell of the row.
        speaker_cell = row.find(['th', 'td'])
        if speaker_cell is None:
            continue

        # The dialogue is the data cell that FOLLOWS the speaker cell. Looking up
        # the first <td> of the row instead would return the speaker cell itself
        # whenever the speaker is stored in a <td>.
        dialogue_cell = speaker_cell.find_next_sibling('td')
        if dialogue_cell is None and speaker_cell.name == 'td':
            # Single-cell row: keep the whole cell, there is nothing else to take.
            dialogue_cell = speaker_cell
        if dialogue_cell is None:
            continue

        # Substring match on the speaker text, so any speaker label containing
        # "Butcher" is accepted.
        if "Butcher" in _cell_text(speaker_cell):
            butcher_dialogues.append(_cell_text(dialogue_cell))
    return butcher_dialogues


def extract_butcher_dialogues(urls, output_directory, timeout=10):
    """
    Extracts Billy Butcher's dialogues from a list of transcript webpages and saves each episode's dialogues
    into separate text files named script1.txt, script2.txt, etc.

    The number in the file name is the 1-based position of the URL in ``urls``. A URL that
    cannot be fetched, or that yields no matching rows, is reported with a message and
    skipped; no file is written for it and the remaining URLs are still processed.

    Parameters:
        urls (list): List of URLs containing the episode transcripts.
        output_directory (str): Directory to save the text files with dialogues.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Ensure the output directory exists ("" means the current directory, which
    # os.makedirs cannot create).
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for idx, url in enumerate(urls, start=1):
        print(f"Processing URL {idx}: {url}")

        # Step 1: Fetch the webpage content. Without a timeout a stalled server
        # would block forever, and an unhandled connection error would abort the
        # whole batch instead of just this URL.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve URL {url}. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve URL {url}. Status code: {response.status_code}")
            continue

        # Step 2: Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Step 3: Collect the dialogue of every table row spoken by Butcher
        butcher_dialogues = _butcher_lines_from_soup(soup)

        # Step 4: Save dialogues to a text file
        if butcher_dialogues:
            output_file = os.path.join(output_directory, f"script{idx}.txt")
            with open(output_file, "w", encoding="utf-8") as file:
                file.write("\n".join(butcher_dialogues))
            print(f"Saved {len(butcher_dialogues)} lines from Billy Butcher to {output_file}.")
        else:
            print(f"No dialogues from Billy Butcher found in URL {url}.")

# Episode transcript pages to scrape; the position in this list determines
# the number used in each output file name.
urls = [
    "https://the-boys.fandom.com/wiki/Transcript:Cherry",
    "https://the-boys.fandom.com/wiki/Transcript:The_Name_of_the_Game",
    "https://the-boys.fandom.com/wiki/Transcript:Season_Four_Finale",
    "https://the-boys.fandom.com/wiki/Transcript:Get_Some",
    "https://the-boys.fandom.com/wiki/Transcript:Good_for_the_Soul",
    "https://the-boys.fandom.com/wiki/Transcript:Herogasm",
    "https://the-boys.fandom.com/wiki/Transcript:The_Female_of_the_Species",
]

# Folder that receives one text file per episode
output_directory = "Butcher_Dialogues"

# Scrape every transcript and write the extracted lines to disk
extract_butcher_dialogues(urls=urls, output_directory=output_directory)


Processing URL 1: https://the-boys.fandom.com/wiki/Transcript:Cherry
Saved 50 lines from Billy Butcher to Butcher_Dialogues/script1.txt.
Processing URL 2: https://the-boys.fandom.com/wiki/Transcript:The_Name_of_the_Game
Saved 60 lines from Billy Butcher to Butcher_Dialogues/script2.txt.
Processing URL 3: https://the-boys.fandom.com/wiki/Transcript:Season_Four_Finale
Saved 5 lines from Billy Butcher to Butcher_Dialogues/script3.txt.
Processing URL 4: https://the-boys.fandom.com/wiki/Transcript:Get_Some
Saved 9 lines from Billy Butcher to Butcher_Dialogues/script4.txt.
Processing URL 5: https://the-boys.fandom.com/wiki/Transcript:Good_for_the_Soul
No dialogues from Billy Butcher found in URL https://the-boys.fandom.com/wiki/Transcript:Good_for_the_Soul.
Processing URL 6: https://the-boys.fandom.com/wiki/Transcript:Herogasm
Saved 4 lines from Billy Butcher to Butcher_Dialogues/script6.txt.
Processing URL 7: https://the-boys.fandom.com/wiki/Transcript:The_Female_of_the_Species
Saved 9 line

In [6]:
# Parsing Text file from The Loft
import os

def extract_butcher_dialogue(input_file, output_file):
    """
    Extracts Billy Butcher's dialogues from the input file and writes them to the output file.
    If no dialogues are found, it creates an empty file and prints a message.

    A line counts as Butcher's when, after trimming surrounding whitespace, it starts with
    "[Butcher]". Only that leading tag is removed; any later "[Butcher]" in the same line is
    kept as part of the dialogue. Dialogues are written one per line, without a trailing
    newline.

    Errors are reported with a printed message rather than raised.

    Parameters:
        input_file (str): Path to the input file containing the script.
        output_file (str): Path to the output file to save Butcher's dialogues.

    Returns:
        None.
    """
    speaker_tag = "[Butcher]"

    try:
        # Ensure the output directory exists. A bare file name has an empty
        # dirname, and os.makedirs("") raises FileNotFoundError, so only create
        # the directory when there is one to create.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Stream the script line by line and keep only Butcher's lines.
        with open(input_file, 'r', encoding='utf-8') as infile:
            butcher_dialogues = [
                # Slice off the leading tag only; str.replace would also delete
                # the tag wherever else it appears in the line.
                stripped[len(speaker_tag):].strip()
                for stripped in (line.strip() for line in infile)
                if stripped.startswith(speaker_tag)
            ]

        # Write to output file (even if no dialogues are found)
        with open(output_file, 'w', encoding='utf-8') as outfile:
            if butcher_dialogues:
                outfile.write("\n".join(butcher_dialogues))
                print(f"Extracted {len(butcher_dialogues)} dialogues from Billy Butcher to {output_file}.")
            else:
                print(f"No dialogues from Billy Butcher found in {input_file}. Creating an empty file.")

    except FileNotFoundError as exc:
        # Report the path that was actually missing; for a missing script this is
        # the input file, exactly as passed in.
        print(f"File not found: {exc.filename or input_file}")
    except Exception as e:
        # Deliberate best effort: report the problem and keep the notebook running.
        print(f"An error occurred: {e}")

# Example usage
# The project folder and the episode file name are each written once, so the input
# and output paths always refer to the same episode under the same project folder.
# NOTE(review): the project folder is an absolute, machine-specific path; point it
# at your own copy of the data before running.
project_dir = "/Users/tiffanyclark2003/Downloads/CS460-AI Chatbot"
episode_script = "script04x06.txt"
input_file = os.path.join(project_dir, "Dialogues_The_Loft", episode_script)
output_file = os.path.join(project_dir, "Butcher_Dialogue_The_Loft", episode_script)
extract_butcher_dialogue(input_file, output_file)


Extracted 4 dialogues from Billy Butcher to /Users/tiffanyclark2003/Downloads/CS460-AI Chatbot/Butcher_Dialogue_The_Loft/script04x06.txt.
