Input JP Morgan files

In [47]:
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
import requests
from bs4 import BeautifulSoup
import yfinance as yf
from datetime import datetime, timedelta
#import openai
#from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import re, json, time, os, sys, glob
from typing import Dict, List, Tuple
from dataclasses import dataclass


In [48]:
# Configuration
@dataclass
class Config:
    """Run-wide settings for the earnings-call analysis notebook.

    Any field left at its ``None`` sentinel is filled in by ``__post_init__``,
    so ``Config()`` can always be constructed, even when the
    ``OPENAI_API_KEY`` environment variable is not set.
    """
    # OpenAI API key. When not passed explicitly it is read from the
    # OPENAI_API_KEY environment variable at instantiation time; an unset
    # variable yields an empty string instead of raising KeyError.
    openai_api_key: str = None
    # Stock ticker symbol the analysis is about.
    ticker: str = "JPM"
    # Quarter labels to process; None is replaced with a fresh default list
    # per instance (a list literal cannot be used as a dataclass default).
    quarters: List[str] = None

    def __post_init__(self):
        """Resolve the ``None`` sentinels into concrete per-instance values."""
        if self.openai_api_key is None:
            # Looked up lazily so that merely defining the class never fails.
            self.openai_api_key = os.environ.get("OPENAI_API_KEY", "")
        if self.quarters is None:
            self.quarters = ["2024Q4", "2025Q1"]

config = Config()

In [49]:
# get the pdf file https://drive.google.com/drive/folders/1fzSQl9zgYXVGVVNEJjW0UZkqS7PtJ3Tm
def extract_pdf_text(file_name, base_path='JPM Presentation texts/2025/Q1'):
    """
    Read a PDF from the local filesystem and return the text of all pages.

    Args:
        file_name (str): Name of the PDF file
        base_path (str): Directory that contains the file

    Returns:
        str: Page texts joined with newlines, or None when the file is
        missing or cannot be read (a diagnostic message is printed instead
        of raising)
    """
    pdf_path = os.path.join(base_path, file_name)

    if os.path.exists(pdf_path):
        try:
            with open(pdf_path, 'rb') as pdf_handle:
                reader = PdfReader(pdf_handle)

                page_count = len(reader.pages)
                print(f"Successfully loaded PDF: {file_name}")
                print(f"Number of pages: {page_count}")

                # Text must be pulled while the file handle is still open.
                page_texts = [page.extract_text() for page in reader.pages]
                return "\n".join(page_texts)

        except FileNotFoundError:
            # The file vanished between the existence check and open().
            print(f"Error: Could not find file {pdf_path}")
            return None
        except Exception as e:
            print(f"Error reading PDF: {str(e)}")
            return None

    # Missing file: report it and list the directory to help spot typos.
    print(f"Error: File not found at {pdf_path}")
    print("Available files in directory:")
    try:
        directory_listing = os.listdir(base_path)
        print(directory_listing)
    except FileNotFoundError:
        print(f"Directory {base_path} not found")
    return None



In [50]:
def separate_presentation_and_qa(text):
    """
    Split an earnings-call transcript into presentation and Q&A parts.

    Every heuristic pattern below is searched in ``text``; the split point
    is the earliest position at which any of them matches. When several
    patterns match at the same position, the one listed first is reported.

    Args:
        text (str): Full transcript text

    Returns:
        tuple: (presentation_text, qa_text), both stripped.
            ``(None, None)`` when ``text`` is empty or None;
            ``(text.strip(), "")`` when no Q&A indicator is found.
    """
    if not text:
        return None, None

    # Common patterns that indicate start of Q&A section
    qa_patterns = [
        r"(?i)questions?\s*(?:and|&)\s*answers?",
        # Word boundaries keep this from firing inside ordinary prose such
        # as "Iraq and Asia", which the unanchored form matched.
        r"(?i)\bq\s*(?:and|&)\s*a\b",
        r"(?i)question\s*(?:and|&)\s*answer\s*session",
        r"(?i)we\s*(?:will\s*)?now\s*(?:begin|start|open)\s*(?:the\s*)?(?:question|q)",
        r"(?i)(?:now\s*)?(?:let'?s\s*)?(?:begin|start|open)\s*(?:the\s*)?(?:question|qa)",
        r"(?i)(?:we\s*)?(?:will\s*)?now\s*take\s*questions?",
        r"(?i)(?:let'?s\s*)?(?:begin|start|open)\s*(?:with\s*)?(?:the\s*)?(?:first\s*)?question",
        r"(?i)analyst\s*questions?",
        r"(?i)thank\s*you.*(?:question|qa)",
        #r"(?i)(?:first\s*)?(?:question\s*)?(?:is\s*)?from\s*\w+",
        r"(?i)our\s*first\s*(?:question|analyst)"
    ]

    # Find the earliest Q&A indicator; len(text) doubles as "not found".
    qa_start_pos = len(text)
    matched_pattern = None

    for pattern in qa_patterns:
        # Only the first occurrence matters, so re.search is enough.
        first_match = re.search(pattern, text)
        if first_match and first_match.start() < qa_start_pos:
            qa_start_pos = first_match.start()
            matched_pattern = pattern

    # If no Q&A section found, return entire text as presentation
    if matched_pattern is None:
        print("No Q&A section found. Entire text treated as presentation.")
        return text.strip(), ""

    # Split the text
    presentation_text = text[:qa_start_pos].strip()
    qa_text = text[qa_start_pos:].strip()

    print(f"Q&A section detected using pattern: {matched_pattern}")
    print(f"Presentation text: {len(presentation_text):,} characters")
    print(f"Q&A text: {len(qa_text):,} characters")

    return presentation_text, qa_text

def clean_text(text):
    """
    Clean up extracted text by removing extra whitespace and formatting issues

    The steps are ordered so that no step re-introduces what a previous one
    removed: dotted separators and tabs are handled before runs of spaces
    are collapsed.

    Args:
        text (str): Raw text to clean (None or empty is accepted)

    Returns:
        str: Cleaned text; "" when ``text`` is empty or None
    """
    if not text:
        return ""

    text = re.sub(r'\.{20,}', '', text)  # Remove the dotted line separators
    text = re.sub(r'\t', ' ', text)  # Tabs to spaces (before collapsing spaces)
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Multiple newlines to double newline
    text = re.sub(r' {2,}', ' ', text)  # Multiple spaces to single space
    return text.strip()

In [51]:
def clean_speaker_name(name):
    """Clean and standardize a speaker name.

    Strips a leading honorific (Mr/Mrs/Ms/Dr, optional dot, any case),
    drops everything from the first dash or comma onward, and collapses
    whitespace.

    Args:
        name (str): Raw speaker label (None or empty is accepted)

    Returns:
        str: The cleaned name; "" when ``name`` is empty or None
    """
    if not name:
        return ""

    # "Mrs" is listed before "Mr" and a word boundary is required, so that
    # "Mrs. Smith" is not cut to "s. Smith" and "Drake" is not cut to "ake".
    name = re.sub(r'^(?:Mrs|Mr|Ms|Dr)\b\.?\s*', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s*[-–—]\s*.*$', '', name)  # Remove everything after dash
    name = re.sub(r'\s*,.*$', '', name)  # Remove everything after comma
    name = re.sub(r'\s+', ' ', name).strip()  # Clean whitespace

    return name

def identify_questioner(question_text):
    """Return the questioner's name found in ``question_text``.

    The patterns are tried in order. The first one whose captured name,
    after cleaning, is longer than two characters and at most four words
    wins. "Unknown" is returned when none qualifies.
    """
    # Heuristics for where a questioner's name tends to appear.
    name_patterns = (
        r'(?:question\s+(?:is\s+)?from\s+|from\s+)([A-Za-z\s\.]+?)(?:\s+at\s+|\s+with\s+|\s+from\s+|\.|,|$)',
        r'([A-Za-z\s\.]+?)(?:\s+at\s+|\s+with\s+|\s+from\s+)([A-Za-z\s&\.]+?)(?:\.|,|:|\s*$)',
        r'^([A-Za-z\s\.]+?)(?:\s*[-–—]\s*|\s*:\s*)',
        r'analyst\s+([A-Za-z\s\.]+?)(?:\s+asks?|\s*:|\s*$)',
    )

    for name_pattern in name_patterns:
        found = re.search(name_pattern, question_text, re.IGNORECASE)
        if found is None:
            continue

        candidate = clean_speaker_name(found.group(1))
        word_count = len(candidate.split())
        # Reject fragments and over-long captures that are unlikely names.
        if len(candidate) > 2 and word_count <= 4:
            return candidate

    return "Unknown"

In [52]:
def parse_qa_section(qa_text):
    """
    Parse Q&A section into structured data.

    The text is split at question markers that start a line ("question
    from ...", "Q:", "analyst <word>", "unidentified analyst", ...). The
    chunk before the first marker is discarded. Each remaining chunk is
    divided into a question part and an answer part at the earliest
    answer indicator (a "Name -" / "Name:" label, a "Name A" line, or a
    "Thank you"/"Thanks" opener), and the answer part is split into
    speaker turns.

    NOTE(review): the question markers only match at the start of a line.
    A transcript whose questions are announced mid-line will produce no
    splits and therefore an empty result — confirm against the real input.

    Args:
        qa_text (str): Text of the Q&A section (None or empty is accepted)

    Returns:
        list[dict]: One dict per question whose text is longer than 10
        characters, with keys ``Question_No`` (1-based), ``Questioner``,
        ``Question``, ``Answerers_Json`` (JSON list of speakers, one entry
        per turn), ``Answers_Json`` (JSON object speaker -> answer text;
        multiple turns by the same speaker are joined with a space) and
        ``Full_Answer`` (all turns joined with a space).
    """
    if not qa_text:
        return []

    qa_data = []
    question_no = 1

    # Split by common question indicators
    question_splits = re.split(
        r'(?i)(?:^|\n)\s*(?:'
        r'(?:next\s+|first\s+)?question\s+(?:is\s+)?from\s+|'
        r'(?:our\s+)?(?:next\s+)?question\s+(?:comes\s+)?from\s+|'
        r'analyst\s+\w+|'
        r'question\s*:|'
        r'q\s*:|'
        r'unidentified\s+(?:analyst|participant)'
        r')',
        qa_text
    )

    # Answer indicators (loop-invariant, so defined once).
    answer_patterns = [
        r'(?i)(?:^|\n)\s*([A-Za-z\s\.]+?)\s*[-–—]\s*',  # Name followed by dash
        r'(?i)(?:^|\n)\s*([A-Za-z\s\.]+?)\s*:\s*',      # Name followed by colon
        # Name followed by a standalone capital "A" closing the line.
        # Case-sensitive on purpose: under (?i) with optional whitespace the
        # previous form matched any letter "a", which put the answer start
        # at offset 0 for almost every chunk and emptied the question.
        r'(?:^|\n)[ \t]*([A-Za-z .]+?)[ \t]+A[ \t]*(?=\n|$)',
        r'(?i)(?:^|\n)\s*(Thank\s+you|Thanks)',         # Thank you responses
    ]

    for section in question_splits[1:]:  # Skip first split (usually empty or intro)
        if not section.strip():
            continue

        # Extract questioner name
        questioner = identify_questioner(section)

        # Find where answers start: the earliest match of any indicator.
        answer_start = len(section)
        for pattern in answer_patterns:
            first_match = re.search(pattern, section)
            if first_match:
                answer_start = min(answer_start, first_match.start())

        # Extract question text
        question_text = section[:answer_start].strip()

        # Clean question text: drop a leading "question from <name>" preamble
        # and any leading dash.
        question_text = re.sub(r'^.*?(?:question\s+(?:is\s+)?from\s+.*?)[.:]?\s*', '', question_text, flags=re.IGNORECASE)
        question_text = re.sub(r'^\s*[-–—]\s*', '', question_text)
        question_text = question_text.strip()

        # Extract answer section
        answer_section = section[answer_start:].strip()

        # Parse answers and answerers
        answerers = []
        answers = []
        if answer_section:
            # Split by speaker changes. Because the pattern has one capture
            # group, the result alternates: [preamble, name1, text1, name2,
            # text2, ...]. The preamble has no speaker and is ignored.
            speaker_splits = re.split(
                r'(?i)(?:^|\n)\s*([A-Za-z\s\.]+?)\s*(?:[-–—]|:)\s*',
                answer_section
            )

            for speaker, spoken in zip(speaker_splits[1::2], speaker_splits[2::2]):
                spoken = spoken.strip()
                if speaker and spoken:
                    answerers.append(clean_speaker_name(speaker))
                    answers.append(spoken)

            # If no speakers found, treat entire answer section as one answer
            if not answerers:
                answerers.append("Unknown")
                answers.append(answer_section)

        # Create JSON structures
        answerers_json = json.dumps(answerers) if answerers else "[]"
        answers_dict = {}
        for answerer, answer in zip(answerers, answers):
            # A speaker may take several turns; append instead of overwriting
            # so earlier turns are not lost.
            if answerer in answers_dict:
                answers_dict[answerer] += " " + answer
            else:
                answers_dict[answerer] = answer
        answers_json = json.dumps(answers_dict)

        full_answer = " ".join(answers)

        # Only add if we have a meaningful question
        if question_text and len(question_text) > 10:
            qa_data.append({
                'Question_No': question_no,
                'Questioner': questioner,
                'Question': question_text,
                'Answerers_Json': answerers_json,
                'Answers_Json': answers_json,
                'Full_Answer': full_answer
            })
            question_no += 1

    return qa_data

In [53]:
def create_qa_dataframe(qa_data):
    """Build a DataFrame from the parsed Q&A records.

    An empty or missing ``qa_data`` yields an empty DataFrame. Otherwise
    the row count and column names are printed before returning.
    """
    if qa_data:
        frame = pd.DataFrame(qa_data)

        print(f"Created DataFrame with {len(frame)} questions")
        print(f"Columns: {list(frame.columns)}")

        return frame

    print("No Q&A data found to create DataFrame")
    return pd.DataFrame()

In [54]:
def save_to_excel(df, base_path='output', filename="jpmorgan_qa_analysis.xlsx"):
    """Save DataFrame to Excel file with formatting.

    Writes ``df`` to a 'Q&A Analysis' sheet with auto-sized columns (capped
    at 50 characters) and adds a 'Summary' sheet of aggregate metrics. The
    output directory is created when missing. If anything in the Excel
    path fails, the data is written as CSV next to the intended workbook
    instead of raising.

    Args:
        df (pd.DataFrame): Q&A data with the columns 'Questioner',
            'Question', 'Answerers_Json' and 'Full_Answer'
        base_path (str): Output directory
        filename (str): Workbook file name inside ``base_path``

    Returns:
        None
    """
    if df.empty:
        print("DataFrame is empty, nothing to save")
        return

    file = os.path.join(base_path, filename)

    try:
        # The directory may not exist yet on a fresh checkout.
        if base_path:
            os.makedirs(base_path, exist_ok=True)

        with pd.ExcelWriter(file, engine='openpyxl') as writer:
            # Write main data
            df.to_excel(writer, sheet_name='Q&A Analysis', index=False)
            worksheet = writer.sheets['Q&A Analysis']

            # Auto-adjust column widths to the longest rendered cell value
            for column in worksheet.columns:
                column_letter = column[0].column_letter
                max_length = max((len(str(cell.value)) for cell in column), default=0)
                adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
                worksheet.column_dimensions[column_letter].width = adjusted_width

            # Distinct speakers across every row's JSON-encoded answerer list
            unique_answerers = {
                name
                for names_json in df['Answerers_Json']
                for name in json.loads(names_json)
            }

            # Create summary sheet
            summary_data = {
                'Metric': [
                    'Total Questions',
                    'Unique Questioners',
                    'Total Answerers',
                    'Average Question Length',
                    'Average Answer Length'
                ],
                'Value': [
                    len(df),
                    df['Questioner'].nunique(),
                    len(unique_answerers),
                    df['Question'].str.len().mean(),
                    df['Full_Answer'].str.len().mean()
                ]
            }

            summary_df = pd.DataFrame(summary_data)
            summary_df.to_excel(writer, sheet_name='Summary', index=False)

        # Report the real location, not just the bare file name.
        print(f"Successfully saved Q&A analysis to: {file}")

    except Exception as e:
        print(f"Error saving to Excel: {str(e)}")
        # Fallback to CSV, kept in the same directory when it is usable.
        csv_name = os.path.splitext(filename)[0] + '.csv'
        if os.path.isdir(base_path):
            csv_filename = os.path.join(base_path, csv_name)
        else:
            csv_filename = csv_name
        df.to_csv(csv_filename, index=False)
        print(f"Saved as CSV instead: {csv_filename}")

In [55]:
def analyze_qa_dataframe(df):
    """Print a textual summary of the Q&A DataFrame.

    Reports the question count, distinct questioners, the five most
    frequent questioners and answerers, and mean/median character lengths
    of questions and full answers. Prints a notice and returns early when
    ``df`` is empty.
    """
    if df.empty:
        print("No data to analyze")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("Q&A ANALYSIS SUMMARY")
    print(rule)

    questioner_column = df['Questioner']
    print(f"Total Questions: {len(df)}")
    print(f"Unique Questioners: {questioner_column.nunique()}")

    # Top questioners
    print(f"\nTop Questioners:")
    for questioner, count in questioner_column.value_counts().head(5).items():
        print(f"  {questioner}: {count} questions")

    # Answer statistics: flatten the per-row JSON lists into one list
    flattened_answerers = [
        name
        for encoded_names in df['Answerers_Json']
        for name in json.loads(encoded_names)
    ]
    top_answerers = pd.Series(flattened_answerers).value_counts().head(5)
    print(f"\nTop Answerers:")
    for answerer, count in top_answerers.items():
        print(f"  {answerer}: {count} answers")

    # Length statistics
    question_lengths = df['Question'].str.len()
    print(f"\nQuestion Length Statistics:")
    print(f"  Average: {question_lengths.mean():.0f} characters")
    print(f"  Median: {question_lengths.median():.0f} characters")

    answer_lengths = df['Full_Answer'].str.len()
    print(f"\nAnswer Length Statistics:")
    print(f"  Average: {answer_lengths.mean():.0f} characters")
    print(f"  Median: {answer_lengths.median():.0f} characters")

In [56]:
def save_sections_to_files(presentation_text, qa_text, base_path='output', base_filename="jpmorgan_earnings"):
    """
    Save presentation and Q&A sections to separate UTF-8 text files

    The files are named ``<base_filename>_presentation.txt`` and
    ``<base_filename>_qa.txt`` inside ``base_path``. The target directory
    is created when it does not exist.

    Args:
        presentation_text (str): Presentation section text (None is
            written as an empty file)
        qa_text (str): Q&A section text (None is written as an empty file)
        base_path (str): Output directory; an empty string means the
            current working directory
        base_filename (str): Base filename for output files

    Returns:
        tuple: (presentation_file, qa_file) paths that were written
    """
    presentation_file = os.path.join(base_path, f"{base_filename}_presentation.txt")
    qa_file = os.path.join(base_path, f"{base_filename}_qa.txt")

    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(presentation_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save presentation section
    print(f"Open presentation file {presentation_file}")
    with open(presentation_file, 'w', encoding='utf-8') as f:
        f.write(presentation_text or "")
    print(f"Presentation section saved to: {presentation_file}")

    # Save Q&A section
    with open(qa_file, 'w', encoding='utf-8') as f:
        f.write(qa_text or "")
    print(f"Q&A section saved to: {qa_file}")

    return presentation_file, qa_file

def analyze_sections(presentation_text, qa_text):
    """
    Print size statistics and a short preview for each transcript section

    A section that is empty or None is skipped. For the Q&A section a rough
    question estimate is also printed: the combined number of '?'
    characters and case-insensitive occurrences of "question" and
    "analyst".

    Args:
        presentation_text (str): Presentation section text
        qa_text (str): Q&A section text
    """
    rule = "=" * 60
    print("\n" + rule)
    print("SECTION ANALYSIS")
    print(rule)

    if presentation_text:
        presentation_chars = len(presentation_text)
        presentation_words = len(presentation_text.split())
        presentation_lines = len(presentation_text.splitlines())
        print(f"Presentation Section:")
        print(f"  - Characters: {presentation_chars:,}")
        print(f"  - Words (approx): {presentation_words:,}")
        print(f"  - Lines: {presentation_lines:,}")
        print(f"  - Preview: {presentation_text[:100]}...")

    if not qa_text:
        return

    qa_chars = len(qa_text)
    qa_words = len(qa_text.split())
    qa_lines = len(qa_text.splitlines())
    print(f"\nQ&A Section:")
    print(f"  - Characters: {qa_chars:,}")
    print(f"  - Words (approx): {qa_words:,}")
    print(f"  - Lines: {qa_lines:,}")

    # Count questions (rough estimate): sum the hits of every indicator
    total_questions = 0
    for indicator in (r'\?', r'(?i)question', r'(?i)analyst'):
        total_questions += len(re.findall(indicator, qa_text))
    print(f"  - Estimated questions: {total_questions}")

    print(f"  - Preview: {qa_text[:100]}...")


In [57]:
# Extract text from PDF
# Step 1 of the pipeline: load the transcript from the default base_path of
# extract_pdf_text. That helper returns None on any failure, so a falsy
# result (None or empty text) stops the run here.
print("Step 1: Extracting text from PDF...")
file_name = "JPM_1q25-earnings-transcript.pdf"
full_text = extract_pdf_text(file_name)
if not full_text:
    print("Failed to extract text from PDF")
    # Raises SystemExit to abort this cell.
    sys.exit()

Step 1: Extracting text from PDF...
Successfully loaded PDF: JPM_1q25-earnings-transcript.pdf
Number of pages: 21


In [58]:
# Extract text from PDF
# NOTE(review): this cell repeats the preceding extraction cell verbatim and
# re-reads the same PDF into the same variables; it looks like a leftover
# duplicate that could be deleted — confirm before removing.
print("Step 1: Extracting text from PDF...")
file_name = "JPM_1q25-earnings-transcript.pdf"
full_text = extract_pdf_text(file_name)
if not full_text:
    print("Failed to extract text from PDF")
    # Raises SystemExit to abort this cell.
    sys.exit()

Step 1: Extracting text from PDF...
Successfully loaded PDF: JPM_1q25-earnings-transcript.pdf
Number of pages: 21


In [59]:
# Separate presentation and Q&A
# The split runs on the raw extracted text; each half is cleaned afterwards,
# so the character counts printed by the splitter refer to the uncleaned
# text and are larger than the cleaned lengths reported later.
print("\nStep 2: Separating presentation and Q&A sections...")
presentation_text, qa_text = separate_presentation_and_qa(full_text)
presentation_text = clean_text(presentation_text)
qa_text = clean_text(qa_text)
# Save to separate files
# Uses the default output directory and base filename. As the last
# expression of the cell, the returned (presentation_file, qa_file) tuple is
# echoed as the cell's output.
print("\nSaving sections to files...")
save_sections_to_files(presentation_text, qa_text)


Step 2: Separating presentation and Q&A sections...
Q&A section detected using pattern: (?i)q\s*(?:and|&)\s*a
Presentation text: 11,497 characters
Q&A text: 90,153 characters

Saving sections to files...
Open presentation file output\jpmorgan_earnings_presentation.txt
Presentation section saved to: output\jpmorgan_earnings_presentation.txt
Q&A section saved to: output\jpmorgan_earnings_qa.txt


('output\\jpmorgan_earnings_presentation.txt',
 'output\\jpmorgan_earnings_qa.txt')

In [60]:
# Analyze the sections
# Prints size statistics and previews for the cleaned sections.
analyze_sections(presentation_text, qa_text)

# Parse Q&A section
# qa_data is a list of per-question dicts; it is empty when the parser finds
# no question it can structure.
print("\nStep 3: Parsing Q&A section...")
qa_data = parse_qa_section(qa_text)


SECTION ANALYSIS
Presentation Section:
  - Characters: 10,871
  - Words (approx): 1,706
  - Lines: 130
  - Preview: 1Q25 FINANCIAL RESULTS 
EARNINGS CALL TRANSCRIPT 
April 11, 2025 

 NOVEMBER 2024 

 1 
MANAGEMENT D...

Q&A Section:
  - Characters: 55,486
  - Words (approx): 9,512
  - Lines: 837
  - Estimated questions: 138
  - Preview: Q&A. 
 
QUESTION AND ANSWER SECTION 

Operator : Thank you. Please stand by. Our first question come...

Step 3: Parsing Q&A section...


In [61]:
# Create DataFrame
# The "1"/"2" prefixes in the two messages below only identify which of the
# two guards fired; the wording is otherwise identical.
print("\nStep 4: Creating Q&A DataFrame...")
if len(qa_data) == 0:
    print("1 No structured Q&A data could be extracted")
    # Raises SystemExit to abort this cell; `df` is then never defined.
    sys.exit() 

df = create_qa_dataframe(qa_data)

# create_qa_dataframe returns an empty frame only for empty input, which the
# guard above already handles, so this second check is a safety net.
if df.empty:
    print("2 No structured Q&A data could be extracted")
    sys.exit()


Step 4: Creating Q&A DataFrame...
1 No structured Q&A data could be extracted


SystemExit: 

In [None]:

# Analyze DataFrame
# Requires `df` from the DataFrame-creation cell; if that cell aborted, `df`
# is undefined and this cell cannot run.
print("\nStep 5: Analyzing DataFrame...")
analyze_qa_dataframe(df)

# Save to Excel
# Uses the default output directory and workbook name of save_to_excel.
print("\nStep 6: Saving to Excel...")
save_to_excel(df)

# Display sample data
# Shows the first parsed question, truncating long text to 200 characters.
print("\nSample of extracted data:")
print("="*60)
if len(df) > 0:
    sample_row = df.iloc[0]
    print(f"Question 1:")
    print(f"  Questioner: {sample_row['Questioner']}")
    print(f"  Question: {sample_row['Question'][:200]}...")
    print(f"  Answerers: {sample_row['Answerers_Json']}")
    print(f"  Full Answer: {sample_row['Full_Answer'][:200]}...")



Step 1: Extracting text from PDF...
Successfully loaded PDF: JPM_1q25-earnings-transcript.pdf
Number of pages: 21

Step 2: Separating presentation and Q&A sections...
Q&A section detected using pattern: (?i)(?:first\s*)?(?:question\s*)?(?:is\s*)?from\s*\w+
Presentation text: 2,026 characters
Q&A text: 99,624 characters

Saving sections to files...
Open presentation file output\jpmorgan_earnings_presentation.txt
Presentation section saved to: output\jpmorgan_earnings_presentation.txt
Q&A section saved to: output\jpmorgan_earnings_qa.txt

SECTION ANALYSIS
Presentation Section:
  - Characters: 1,993
  - Words (approx): 226
  - Lines: 24
  - Preview: 1Q25 FINANCIAL RESULTS 
EARNINGS CALL TRANSCRIPT 
April 11, 2025 

 NOVEMBER 2024 

 1 
MANAGEMENT D...

Q&A Section:
  - Characters: 99,364
  - Words (approx): 11,117
  - Lines: 943
  - Estimated questions: 138
  - Preview: from prior quarters, as well 
as higher wholesale deposits. NIR ex. Markets was up $2.2 billion, or ...

Step 3: Parsing

: 