In [1]:
import json
import logging
import textwrap
from datetime import datetime
from pathlib import Path
from typing import List

import pandas as pd
import yaml
from openai import OpenAI
from pydantic import BaseModel

In [2]:
# Read the API key out of the local YAML config, then build the OpenAI client with it.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

api_key = config['api_key']
client = OpenAI(api_key=api_key)

In [3]:
# Structured-output schema: passed as `text_format` to the model call in
# extract(), which also uses it to build its empty fallback result.
class ExtractResponse(BaseModel):
    # Excerpts from the passage; the extract() prompt asks for at most 10 short ones.
    representative_quotes: List[str]
    # Short summary of those excerpts; the extract() prompt asks for at most 100 words.
    summary: str

In [4]:
def detect_data_or_ai(passage: str) -> int:
    """Ask the model whether a passage discusses data or AI.

    Args:
        passage: Transcript text to classify.

    Returns:
        1 when the model's reply parses as the integer 1, otherwise 0.
        Blank (empty or whitespace-only) passages return 0 without calling
        the API.

    Raises:
        Exceptions raised by the API call itself are not caught here and
        propagate to the caller.
    """
    # Nothing to classify: skip the API round-trip for blank text.
    if not passage or not passage.strip():
        return 0

    prompt = f"""You are a classifier. Read the following passage and determine if the company talks
about topics related to 'data' or 'AI' (artificial intelligence), including concepts
like analytics, machine learning, algorithms, data-driven decision making, big data,
or AI applications.

Respond with only a single number:
- 1 if the passage discusses AI or data in any meaningful way.
- 0 if it does not.

Passage:
\"\"\"{passage}\"\"\"
    """

    response = client.responses.create(
        model="gpt-5-mini",
        instructions="You are a strict binary classifier.",
        reasoning={"effort": "medium"},
        # NOTE(review): "high" verbosity looks at odds with the single-digit
        # reply the prompt demands — confirm this setting is intended.
        text={"verbosity": "high"},
        input=[
            {"role": "user", "content": prompt}
        ]
    )

    try:
        result = int(response.output_text)
    except (TypeError, ValueError):
        # The reply was not a bare integer; treat it as a negative rather than
        # failing. Only parse errors are caught so that interrupts and
        # unrelated bugs still surface.
        return 0
    return 1 if result == 1 else 0

In [5]:
def extract(passage: str) -> ExtractResponse:
    """Extract representative data/AI quotes and a short summary from a passage.

    Args:
        passage: Transcript text to analyse.

    Returns:
        The parsed ExtractResponse from the model. If the API call raises, or
        the response carries no parsed output, an ExtractResponse with no
        quotes and an empty summary is returned instead (best-effort), and a
        warning is logged so the failure is not silent.
    """
    analyst_instructions = """You are an analyst focused on how companies discuss data and AI in earnings calls. Follow the provided formatting rules exactly."""

    user_prompt = textwrap.dedent(
        f"""Goal:
Identify how this earnings-call passage talks about 'data' and 'AI'. Focus on
meaningful mentions tied to strategy, products, operations, risks, or investments.

Format:
1. Representative Quotes (<=10 short excerpts that explicitly mention 'data' or 'AI').
2. Brief Summary (<=100 words capturing the themes of those quotes).

Passage:
\"\"\"{passage}\"\"\"""").strip()

    log = logging.getLogger(__name__)

    try:
        response = client.responses.parse(
            model="gpt-5-mini",
            instructions=analyst_instructions,
            reasoning={"effort": "medium"},
            text={"verbosity": "high"},
            input=[
                {"role": "user", "content": user_prompt},
            ],
            text_format=ExtractResponse,
        )
        parsed = response.output_parsed
    except Exception as exc:
        # Deliberate best-effort: one failed call must not abort a whole
        # dataframe run, but record why the row came back empty.
        log.warning("extract() failed (%s: %s); returning empty result", type(exc).__name__, exc)
        return ExtractResponse(representative_quotes=[], summary="")

    if parsed is None:
        # Honour the declared return type even when nothing was parsed, so
        # callers can always read .representative_quotes / .summary.
        log.warning("extract() got no parsed output; returning empty result")
        return ExtractResponse(representative_quotes=[], summary="")

    return parsed

In [6]:
def format_quotes(quotes: List[str]) -> str:
    """Join quotes into one string, each wrapped in double quotes and comma-separated.

    Runs of whitespace inside a quote are collapsed to single spaces; quotes
    that are empty after that are dropped. Returns an empty string when no
    quotes remain (including when `quotes` is None or empty).
    """
    normalized = (" ".join(text.split()) for text in (quotes or []))
    kept = [text for text in normalized if text]
    return '"' + '","'.join(kept) + '"' if kept else ""

def process_row(row: pd.Series) -> pd.Series:
    """Classify one dataframe row and, when relevant, extract quotes and a summary.

    Args:
        row: A dataframe row; the passage is read from its "componenttext"
            entry. A missing entry or a missing value (None/NaN/NA) is
            treated as an empty passage.

    Returns:
        A Series with keys "is_data_or_ai" (the detect_data_or_ai result),
        "representative_quotes" (the format_quotes output) and "summary".
        The two text fields are empty strings unless the classification is 1.
    """
    raw_text = row.get("componenttext", "")
    # NaN is truthy, so `NaN or ""` keeps NaN and str() would turn it into the
    # literal passage "nan". Normalise missing values to an empty string first.
    if pd.api.types.is_scalar(raw_text) and pd.isna(raw_text):
        raw_text = ""
    passage = str(raw_text or "")

    classification = detect_data_or_ai(passage)
    formatted_quotes = ""
    summary = ""

    if classification == 1:
        extraction = extract(passage)
        # Defensive: tolerate a None extraction result (and a None summary)
        # instead of raising AttributeError in the middle of a dataframe apply.
        if extraction is not None:
            formatted_quotes = format_quotes(extraction.representative_quotes)
            summary = (extraction.summary or "").strip()

    return pd.Series(
        {
            "is_data_or_ai": classification,
            "representative_quotes": formatted_quotes,
            "summary": summary,
        }
    )

In [11]:
# Read the transcript CSV and keep positional rows 1098 through 1111 (the slice end is exclusive).
dataset_path = "transcripts_dataset/2024_q4_clean.csv"
df = pd.read_csv(dataset_path).iloc[1098:1112]

In [8]:
# Show the sliced frame as this cell's rich output.
df

Unnamed: 0,companyid,keydevid,transcriptid,headline,mostimportantdateutc,mostimportanttimeutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,...,transcriptcomponenttypename,transcriptpersonid,transcriptpersonname,proid,companyofperson,speakertypeid,speakertypename,componenttextpreview,word_count,componenttext
1098,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Question,146141.0,Matthew Hedberg,30463897.0,,3.0,Analysts,"Yes. Well, that's literally the next question....",42.0,"Yes. Well, that's literally the next question...."
1099,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Answer,472231.0,Richard Herren,699029329.0,,2.0,Executives,Yes. Enterprise was really strong for us in te...,488.0,Yes. Enterprise was really strong for us in te...
1100,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Question,146141.0,Matthew Hedberg,30463897.0,,3.0,Analysts,That's exciting. I wonder -- before we get int...,62.0,That's exciting. I wonder -- before we get int...
1101,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Answer,472231.0,Richard Herren,699029329.0,,2.0,Executives,Yes. We took up both the top line and the bott...,32.0,Yes. We took up both the top line and the bott...
1102,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Question,146141.0,Matthew Hedberg,30463897.0,,3.0,Analysts,Yes. You're conservative.,3.0,Yes. You're conservative.
1103,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Answer,472231.0,Richard Herren,699029329.0,,2.0,Executives,That's not my norm. What -- we took at the top...,274.0,That's not my norm. What -- we took at the top...
1104,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Question,146141.0,Matthew Hedberg,30463897.0,,3.0,Analysts,"Yes. Yes. Yes. Back in our Autodesk days, the ...",115.0,"Yes. Yes. Yes. Back in our Autodesk days, the ..."
1105,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Answer,472231.0,Richard Herren,699029329.0,,2.0,Executives,I think we are in a pretty unique position. An...,228.0,I think we are in a pretty unique position. An...
1106,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Question,146141.0,Matthew Hedberg,30463897.0,,3.0,Analysts,Yes. When -- I guess if AI continues to ramp w...,80.0,Yes. When -- I guess if AI continues to ramp w...
1107,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,Answer,472231.0,Richard Herren,699029329.0,,2.0,Executives,"Yes. Of course, it could be. Now that target, ...",25.0,"Yes. Of course, it could be. Now that target, ..."


In [9]:
# Run process_row over every row, then attach its three result columns to a
# copy of the input frame so `df` itself stays untouched.
result_columns = ["is_data_or_ai", "representative_quotes", "summary"]
results = df.apply(process_row, axis=1)

processed_df = df.copy()
processed_df[result_columns] = results

processed_df.head()

Result: 0
Result: 1
Result: 0
Result: 0
Result: 0
Result: 0
Result: 1
Result: 0
Result: 1
Result: 0
Result: 0
Result: 0
Result: 1
Result: 1


Unnamed: 0,companyid,keydevid,transcriptid,headline,mostimportantdateutc,mostimportanttimeutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,...,proid,companyofperson,speakertypeid,speakertypename,componenttextpreview,word_count,componenttext,is_data_or_ai,representative_quotes,summary
1098,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,30463897.0,,3.0,Analysts,"Yes. Well, that's literally the next question....",42.0,"Yes. Well, that's literally the next question....",0,,
1099,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,699029329.0,,2.0,Executives,Yes. Enterprise was really strong for us in te...,488.0,Yes. Enterprise was really strong for us in te...,1,"""what's driving that demand is companies looki...",The passage frames AI as the primary driver of...
1100,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,30463897.0,,3.0,Analysts,That's exciting. I wonder -- before we get int...,62.0,That's exciting. I wonder -- before we get int...,0,,
1101,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,699029329.0,,2.0,Executives,Yes. We took up both the top line and the bott...,32.0,Yes. We took up both the top line and the bott...,0,,
1102,19691,1904126000.0,3318005.0,"Cisco Systems, Inc. Presents at 2024 RBC Capit...",2024-11-20,16:20:00,51.0,Company Conference Presentations,"Cisco Systems, Inc.",8.0,...,30463897.0,,3.0,Analysts,Yes. You're conservative.,3.0,Yes. You're conservative.,0,,


In [10]:
# Write the processed frame to a timestamped CSV under output/.
output_dir = "output"
# DataFrame.to_csv does not create missing parent directories; make sure the
# folder exists so a finished run is not lost to a missing-directory error.
Path(output_dir).mkdir(parents=True, exist_ok=True)

output_filename = datetime.now().strftime("processed_transcripts_%Y%m%d_%H%M%S.csv")
output_path = f"{output_dir}/{output_filename}"
processed_df.to_csv(output_path, index=False)