In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import matplotlib.pyplot as plt

In [2]:
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"

DATA_FILE = DATA_DIR / "earnings_calls_cleaned.csv"

DATA_FILE


WindowsPath('C:/Users/ssmyt/earnings-call-nlp/data/earnings_calls_cleaned.csv')

In [3]:
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,date,exchange,q,ticker,transcript,call_datetime,quarter_only,transcript_clean_basic,transcript_nlp,prepared_remarks,qna,prepared_len,qna_len
0,"Nov 17, 2022, 5:00 p.m. ET",NYSE: GPS,2022-Q3,GPS,"Prepared Remarks:\nOperator\nGood afternoon, l...",2022-11-17 17:00:00,Q3,"Prepared Remarks:\nOperator\nGood afternoon, l...","Good afternoon, ladies and gentlemen. My name ...","Prepared Remarks:\nOperator\nGood afternoon, l...",Questions & Answers:\nOperator\n[Operator inst...,23689,34247
1,"Feb 10, 2021, 5:00 p.m. ET",NASDAQ: QLYS,2020-Q4,QLYS,Prepared Remarks:\nOperator\nLadies and gentle...,2021-02-10 17:00:00,Q4,Prepared Remarks:\nOperator\nLadies and gentle...,I will now turn the conference over to Mr. Vin...,Prepared Remarks:\nOperator\nLadies and gentle...,Questions and Answers:\nOperator\nThank you. [...,20191,35689
2,"Oct 27, 2020, 10:00 a.m. ET",NYSE: CSTM,2020-Q3,CSTM,Prepared Remarks:\nOperator\nLadies and gentle...,2020-10-27 10:00:00,Q3,Prepared Remarks:\nOperator\nLadies and gentle...,"Ladies and gentlemen, thank you for standing b...",Prepared Remarks:\nOperator\nLadies and gentle...,Questions and Answers:\nOperator\nThank you. [...,17543,21658
3,"Aug 5, 2021, 4:30 p.m. ET",NASDAQ: LOCO,2021-Q2,LOCO,Prepared Remarks:\nOperator\nWelcome to the El...,2021-08-05 16:30:00,Q2,Prepared Remarks:\nOperator\nWelcome to the El...,And now I would like to turn the conference ov...,Prepared Remarks:\nOperator\nWelcome to the El...,Questions and Answers:\nOperator\nThank you. [...,17295,17213
4,"Nov 7, 2019, 8:30 a.m. ET",NYSE: FLO,2019-Q3,FLO,Prepared Remarks:\nOperator\nWelcome to the Fl...,2019-11-07 08:30:00,Q3,Prepared Remarks:\nOperator\nWelcome to the Fl...,"I will now turn the call over to J.T. Rieck, T...",Prepared Remarks:\nOperator\nWelcome to the Fl...,Questions and Answers:\nOperator\nThank you. [...,16635,5357


In [4]:
df.shape, df.columns

((497, 13),
 Index(['date', 'exchange', 'q', 'ticker', 'transcript', 'call_datetime',
        'quarter_only', 'transcript_clean_basic', 'transcript_nlp',
        'prepared_remarks', 'qna', 'prepared_len', 'qna_len'],
       dtype='object'))

In [5]:
nlp_df = df[['ticker','call_datetime','q','quarter_only','prepared_remarks','qna','prepared_len','qna_len']].copy()
nlp_df.shape

(497, 8)

In [20]:
#  sample for testing
df_sample = df.sample(10, random_state=42).reset_index(drop=True)

In [22]:
import os
from openai import OpenAI


api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found. Make sure it's set in your environment.")

client = OpenAI(api_key=api_key)


In [29]:
#prompt for prepared remarks
def summarize_transcript(text, max_tokens=600):
    prompt = f"""
You are an expert equity research analyst. 
Summarize the following earnings call transcript into 4-6 concise bullet points, focusing on:
- Performance
- Guidance
- Risks
- Strategic changes

Transcript:
{text}
"""
    response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # cheaper option for now
    messages=[{"role": "user", "content": prompt}],
    max_tokens=max_tokens
    )
    return response.choices[0].message.content.strip()


In [30]:
sample_transcript = df_sample.loc[0, "prepared_remarks"]

print(len(sample_transcript))
print(sample_transcript[:500])  # preview first 500 chars


23448
Prepared Remarks:
Operator
Welcome, everyone, and thank you for standing by for the Alphabet Q3 2021 Earnings Conference Call. [Operator Instructions] I'd now like to hand the conference over to your speaker today, Jim Friedland, Director of Investor Relations. Please go ahead.
James Friedland -- Director of Investor Relations
Thank you. Good afternoon, everyone, and welcome to Alphabet's third quarter 2021 earnings conference call. With us today are Sundar Pichai, Philipp Schindler and Ruth Por


In [31]:
# test of api call
summary = summarize_transcript(sample_transcript)
print(summary)

- Performance: Alphabet reported strong revenue growth in Q3 2021, driven by broad-based strength in advertiser spend and elevated consumer online activity across Google Services, particularly in Google Search and YouTube advertising.
- Guidance: The company expects continued revenue growth in Google Services, with a focus on supporting the holiday season through elevated sales and marketing expenses. Google Cloud is seeing positive trends in revenue growth, especially in GCP and Google Workspace.
- Risks: The company highlighted uncertainty in the global economy due to ongoing fluctuations in recovery rates across different regions, which could impact future performance. Additionally, there were increased expenses in legal matters and headcount growth affecting operating expenses.
- Strategic changes: Alphabet emphasized its continued investments in AI and machine learning, especially with the introduction of the new Pixel devices and advancements in Google Cloud's data analytics and 

In [41]:
#individual prompts for the prepared remark and Q&A sections
def summarize_prepared_remarks(text, max_tokens=300):
    """
    Summarize management's Prepared Remarks into 4-6 bullet points focusing
    on performance, guidance, risks, and strategic changes.
    """
    prompt = f"""
You are an expert equity research analyst.
Summarize the following management-prepared remarks from an earnings call into 4-6 concise bullet points, focusing on:
- Performance
- Guidance
- Risks
- Strategic changes

Transcript:
{text}
"""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens
    )
    return response.choices[0].message.content.strip()


def summarize_qna(text, max_tokens=300):
    """
    Summarize the Q&A section into 3-5 bullet points highlighting key management
    responses, clarifications on performance, additional guidance, and risks.
    """
    prompt = f"""
You are an expert equity research analyst.
Summarize the following Q&A from an earnings call into 3-5 concise bullet points, focusing on:
- Key management responses
- Clarifications on performance
- Additional guidance
- Risks or challenges discussed

Transcript:
{text}
"""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens
    )
    return response.choices[0].message.content.strip()


In [42]:
if 'prepared_summary' not in df.columns:
    df['prepared_summary'] = ""
if 'qna_summary' not in df.columns:
    df['qna_summary'] = ""
if 'api_called' not in df.columns:
    df['api_called'] = False


In [57]:
OUTPUT_DIR = PROJECT_ROOT / "output"
OUTPUT_DIR.mkdir(exist_ok=True)

output_file = OUTPUT_DIR / "earnings_summaries.csv"

In [47]:
# due to max token linits function to split into chunks
def chunk_text(text, max_chars=8000):
    """Split text into chunks of up to max_chars characters."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start+max_chars])
        start += max_chars
    return chunks

# Function to summarize multiple chunks and combine
def summarize_long_text(text, summarizer_func):
    chunks = chunk_text(text)
    bullet_points = []
    for chunk in chunks:
        summary = summarizer_func(chunk)
        # Split into bullets if model returns multi-line string
        for line in summary.split("\n"):
            line = line.strip()
            if line:
                bullet_points.append(line)
    return "\n".join(bullet_points)


In [51]:
save_every = 20
for idx, row in df.iterrows():
    if not row['api_called']:
        prepared_text = row['prepared_remarks']
        qna_text = row['qna']

        # Summarize each section separately
        df.at[idx, 'prepared_summary'] = summarize_long_text(prepared_text, summarize_prepared_remarks)
        df.at[idx, 'qna_summary'] = summarize_long_text(qna_text, summarize_qna)

        df.at[idx, 'api_called'] = True

        if idx % save_every == 0:
            df.to_csv(output_file, index=False)
            print(f"Checkpoint saved at row {idx}")


Checkpoint saved at row 20
Checkpoint saved at row 40
Checkpoint saved at row 60
Checkpoint saved at row 80
Checkpoint saved at row 100
Checkpoint saved at row 120
Checkpoint saved at row 140
Checkpoint saved at row 160
Checkpoint saved at row 180
Checkpoint saved at row 200
Checkpoint saved at row 220
Checkpoint saved at row 240
Checkpoint saved at row 260
Checkpoint saved at row 280
Checkpoint saved at row 300
Checkpoint saved at row 320
Checkpoint saved at row 340
Checkpoint saved at row 360
Checkpoint saved at row 380
Checkpoint saved at row 400
Checkpoint saved at row 420
Checkpoint saved at row 440
Checkpoint saved at row 460
Checkpoint saved at row 480


In [58]:
df.to_csv(output_file, index=False)

In [56]:
#check that summaries for all rows
df['prepared_summary'].isna().sum()
df['qna_summary'].isna().sum()

0