# Earnings Calls Cleaning

Cleans recent earnings call transcripts saved as Yahoo Finance "View Source" HTML pages.

In [26]:
import re
import json
from bs4 import BeautifulSoup
from pathlib import Path
import os

In [21]:
# The base directory is the parent of the current working directory;
# all data folders used below live under its "data" subfolder.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data")

In [25]:
# Class-name patterns are compiled once here rather than once per transcript item.
_MAIN_CONTENT_RE = re.compile(r'mainContent')
_ITEM_RE = re.compile(r'item')
_SPEAKER_INFO_RE = re.compile(r'speakerInfo')
_SPEAKER_NAME_RE = re.compile(r'type-label-lg-med')
_SPEAKER_DESC_RE = re.compile(r'speakerDesc')
# A paragraph consisting solely of a timestamp such as "0:14:36" or "14:36".
_TIMESTAMP_RE = re.compile(r'^\d{1,2}:\d{2}(:\d{2})?$')


def _node_text(node):
    """Return the node's text with only leading/trailing whitespace removed.

    ``get_text(strip=True)`` strips every text fragment separately and joins
    them with no separator, which glues words together whenever the node
    contains inline markup (e.g. ``Thanks, <b>John</b>`` -> ``Thanks,John``).
    Stripping the combined text keeps the inner spacing intact.
    """
    return node.get_text().strip()


def _rebuild_page_source(view_source_soup):
    """Rebuild the page HTML from the 'line-content' cells, or return None if none exist."""
    cells = view_source_soup.find_all('td', class_='line-content')
    if not cells:
        return None
    return "".join(td.get_text() for td in cells)


def _format_transcript_item(item):
    """Format one transcript item as '### Speaker (Role)' plus its dialogue, or None if it has no dialogue."""
    speaker_name = "Unknown Speaker"
    role = ""

    speaker_info = item.find('div', class_=_SPEAKER_INFO_RE)
    if speaker_info:
        # Name is usually in a label span
        name_span = speaker_info.find('span', class_=_SPEAKER_NAME_RE)
        if name_span:
            speaker_name = _node_text(name_span)

        # Role (e.g., CEO, Analyst) is usually in the description div
        role_div = speaker_info.find('div', class_=_SPEAKER_DESC_RE)
        if role_div:
            role = _node_text(role_div)

    # Dialogue lives in 'p' tags carrying data-testid="typography";
    # paragraphs that are nothing but a timestamp are dropped.
    texts = (_node_text(p) for p in item.find_all('p', attrs={'data-testid': 'typography'}))
    clean_paragraphs = [text for text in texts if not _TIMESTAMP_RE.match(text)]
    if not clean_paragraphs:
        return None

    header = f"{speaker_name} ({role})" if role else speaker_name
    full_text = "\n".join(clean_paragraphs)
    return f"### {header}\n{full_text}\n"


def extract_transcript_from_view_source(file_path):
    """
    Parses a Yahoo Finance 'View Source' HTML file to extract the clean transcript.

    Args:
        file_path: Path (str or path-like) of the saved 'View Source' HTML file.

    Returns:
        str: The transcript, one ``### Speaker (Role)`` block per item that has
        dialogue, blocks separated by a blank line. Failures are reported
        through the return value instead of being raised: the result then
        starts with ``"Error:"`` (expected structure not found) or
        ``"An error occurred:"`` (any exception, e.g. an unreadable file).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        # --- STAGE 1: Reconstruct the Original HTML ---
        # The file is a table of code lines; the text of the 'line-content'
        # cells is the actual HTML page source.
        print(f"Reconstructing HTML source from {os.path.basename(file_path)}...")

        original_html_str = _rebuild_page_source(soup)
        if original_html_str is None:
            return "Error: Could not find code line content. File format may differ."

        # --- STAGE 2: Parse the Transcript ---
        # The reconstructed string is parsed as a regular webpage.
        inner_soup = BeautifulSoup(original_html_str, 'html.parser')

        # Yahoo Finance transcripts usually reside in a 'mainContent' section
        main_section = inner_soup.find('section', class_=_MAIN_CONTENT_RE)
        if not main_section:
            return "Error: Could not locate transcript main content section."

        # Transcript items are typically wrapped in 'item' divs
        items = main_section.find_all('div', class_=_ITEM_RE)
        blocks = (_format_transcript_item(item) for item in items)
        return "\n".join(block for block in blocks if block is not None)

    except Exception as e:
        # Callers rely on a string result, so failures are reported, not raised.
        return f"An error occurred: {e}"

In [29]:
# --- Usage Example ---
# Names of the saved HTML files to process.
files = ["DE-Q4-2025.html", "APPL-Q4-2025.html", "NFLX-Q4-2025.html"]

In [31]:
# Resolve the input and output folders once instead of rebuilding them per file.
raw_dir = DATA_DIR / "raw_transcripts_html"
cleaned_dir = DATA_DIR / "cleaned_transcripts"

# Opening an output file fails with FileNotFoundError if the folder is missing.
cleaned_dir.mkdir(parents=True, exist_ok=True)

for filename in files:
    source_path = raw_dir / filename

    # Ensure the file exists in your directory
    if not source_path.is_file():
        print(f"File not found: {filename}")
        continue

    clean_text = extract_transcript_from_view_source(source_path)

    # The extractor reports failures as message strings instead of raising;
    # do not save such a message as if it were a transcript.
    if clean_text.startswith(("Error:", "An error occurred:")):
        print(f"Skipped {filename}: {clean_text}")
        continue

    # Save to a new text file
    output_filename = filename.replace('.html', '_transcript.txt')
    with open(cleaned_dir / output_filename, 'w', encoding='utf-8') as f:
        f.write(clean_text)

    print(f"Saved extracted transcript to: {output_filename}")

Reconstructing HTML source from DE-Q4-2025.html...
Saved extracted transcript to: DE-Q4-2025_transcript.txt
Reconstructing HTML source from APPL-Q4-2025.html...
Saved extracted transcript to: APPL-Q4-2025_transcript.txt
Reconstructing HTML source from NFLX-Q4-2025.html...
Saved extracted transcript to: NFLX-Q4-2025_transcript.txt
