In [1]:
%pip install llama-index-readers-earnings-call-transcript
%pip install llama-index-embeddings-openai
# %pip install -r /Users/LVal18/Documents/GitHub/nu_capstone/requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
[31mERROR: Could not open requirements file: [Errno 2] No such file or directo

In [18]:
import json
import re
from datetime import datetime
from typing import List
import requests
from tenacity import retry, stop_after_attempt, wait_random_exponential
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

# API key for discountingcashflows.com.
# The key is read from the DCF_API_KEY environment variable so that the secret
# is never stored in the notebook (or in version control). Export it before
# starting the kernel, e.g. `export DCF_API_KEY=...`.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and rotated.
import os
import warnings

dcf_key = os.environ.get("DCF_API_KEY", "")
if not dcf_key:
    # Do not fail at definition time; just make the misconfiguration visible.
    warnings.warn(
        "DCF_API_KEY is not set; requests to discountingcashflows.com will be "
        "sent without an API key."
    )

def correct_date(yr, dt):
    """Force the year of a transcript timestamp to match the expected year.

    Some transcripts carry a date whose year disagrees with the year the
    transcript belongs to; this rewrites only the year component.

    Args:
        yr (int | str): year the timestamp should carry. Numeric strings are
            accepted and converted with ``int``.
        dt (str): timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        str: the timestamp in the same ``"%Y-%m-%d %H:%M:%S"`` format with its
        year set to ``yr``.

    Raises:
        ValueError: if ``dt`` does not match the expected format, or ``yr``
            is not a usable year.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    target_year = int(yr)
    parsed = datetime.strptime(dt, fmt)
    if parsed.year != target_year:
        try:
            parsed = parsed.replace(year=target_year)
        except ValueError:
            # Feb 29 does not exist in a non-leap target year; clamp to Feb 28
            # instead of failing the whole transcript.
            if parsed.month == 2 and parsed.day == 29:
                parsed = parsed.replace(year=target_year, day=28)
            else:
                raise
    return parsed.strftime(fmt)


def extract_speakers(cont: str) -> List[str]:
    """Extract the distinct speaker labels from a transcript.

    A speaker label is the text from the start of a line up to the first
    colon on that line (e.g. ``"Operator"`` in ``"Operator: Good day"``).

    Args:
        cont (str): transcript content

    Returns:
        List[str]: unique speaker labels, in order of first appearance.
        Empty when ``cont`` is empty or contains no ``label:`` lines.
    """
    if not cont:
        return []

    # Anchor on line starts (MULTILINE) rather than on a preceding "\n" so the
    # speaker on the very first line of the transcript is not missed.
    pattern = re.compile(r"^(.*?):", re.MULTILINE)
    matches = pattern.findall(cont)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable between runs (a set has arbitrary ordering for strings).
    return list(dict.fromkeys(matches))


@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(2))
def get_earnings_transcript(quarter: str, ticker: str, year: int):
    """Fetch one earnings call transcript from discountingcashflows.com.

    The call is attempted at most twice (see the ``@retry`` decorator), with a
    random exponential wait between attempts.

    Args:
        quarter (str): quarter to request, passed as the ``quarter`` query parameter
        ticker (str): ticker symbol, passed as the ``ticker`` query parameter
        year (int): year to request, passed as the ``year`` query parameter

    Returns:
        tuple: ``(record, speakers_list)`` where ``record`` is the first item
        of the JSON response with its ``"date"`` rewritten by ``correct_date``
        and ``speakers_list`` is the result of ``extract_speakers`` on
        ``record["content"]``.

    Raises:
        Any failure inside the body (HTTP error, timeout, malformed or empty
        payload) triggers a retry; with tenacity's default settings the final
        failure is raised as ``tenacity.RetryError``.
    """
    response = requests.get(
        "https://discountingcashflows.com/api/transcript/",
        # Let requests build and escape the query string instead of
        # interpolating raw values into the URL.
        params={"ticker": ticker, "quarter": quarter, "year": year, "key": dcf_key},
        # NOTE(review): placeholder basic-auth credentials kept from the
        # original call — confirm whether the API actually needs them.
        auth=("user", "pass"),
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    # Surface HTTP errors explicitly rather than failing later on the body.
    response.raise_for_status()

    resp_text = json.loads(response.text)
    if not isinstance(resp_text, list) or not resp_text:
        raise ValueError(
            f"No transcript returned for {ticker} {quarter} {year}"
        )

    record = resp_text[0]
    speakers_list = extract_speakers(record["content"])
    record["date"] = correct_date(record["year"], record["date"])
    return record, speakers_list

class EarningsCallTranscriptDCF(BaseReader):
    """Reader that loads a single earnings call transcript via ``get_earnings_transcript``."""

    def __init__(self, year: int, ticker: str, quarter: str):
        """Get the earning call transcripts for a given company, in a given year and quarter.

        Args:
            year (int): Year of the transcript; must not be later than the current year
            ticker (str): ticker symbol of the stock
            quarter (str): quarter, one of "Q1", "Q2", "Q3", "Q4"

        Raises:
            AssertionError: if ``year`` is in the future or ``quarter`` is not
                one of the accepted values.
        """
        curr_year = datetime.now().year
        # The check allows the current year, so the message must say so.
        assert year <= curr_year, "The year should not be later than the current year"

        assert quarter in [
            "Q1",
            "Q2",
            "Q3",
            "Q4",
        ], 'The quarter should be one of ["Q1","Q2","Q3","Q4"]'
        self.year = year
        self.ticker = ticker
        self.quarter = quarter

    def load_data(self) -> Document:
        """Fetch the transcript and wrap it in a ``Document``.

        Returns:
            Document: a single document (not a list) whose text is the
            transcript content and whose ``extra_info`` holds the keys
            ``"ticker"``, ``"quarter"``, ``"date_time"`` and
            ``"speakers_list"``.
        """
        resp_dict, speakers_list = get_earnings_transcript(
            self.quarter, self.ticker, self.year
        )
        return Document(
            text=resp_dict["content"],
            extra_info={
                "ticker": resp_dict["symbol"],
                "quarter": "Q" + str(resp_dict["quarter"]),
                "date_time": resp_dict["date"],
                "speakers_list": speakers_list,
            },
        )

In [21]:
import csv
import pandas as pd
import requests
from datetime import datetime
from llama_index.readers.earnings_call_transcript import EarningsCallTranscript
import os



# Define last n years dynamically (most recent completed year first)
n_years = 10
current_year = datetime.now().year
years_list = [current_year - i - 1 for i in range(n_years)]
quarters = ["Q1", "Q2", "Q3", "Q4"]  # Standardized quarter format

# Load stock list.
# The location can be overridden with the SP50_TECH_STOCKS_CSV environment
# variable so the notebook is not tied to one machine's directory layout; the
# original path is kept as the default.
stock_list_path = os.environ.get(
    "SP50_TECH_STOCKS_CSV",
    "/Users/LVal18/Documents/GitHub/nu_capstone/data/processed/sp50_tech_stocks.csv",
)
stock_list = pd.read_csv(stock_list_path)
companies = stock_list["Symbol"].tolist()

# Define CSV file path (output; rows are appended across runs)
csv_file_path = "earnings_call_transcripts.csv"

# Define API endpoint for checking individual tickers
DCF_API_URL_TEMPLATE = "https://discountingcashflows.com/api/profile/?ticker={ticker}&key={dcf_key}"

# Function to check if a ticker is available in DCF API
def is_ticker_available(ticker):
    """Check whether the DCF profile endpoint answers 200 for a ticker.

    Args:
        ticker (str): ticker symbol substituted into ``DCF_API_URL_TEMPLATE``

    Returns:
        bool: True when the response status code is 200; False for any other
        status or when the request itself fails (the failure is printed).
    """
    try:
        response = requests.get(
            DCF_API_URL_TEMPLATE.format(ticker=ticker, dcf_key=dcf_key),
            # Bound the wait so one stalled connection cannot hang the whole run;
            # a timeout is a RequestException and is handled below.
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"❌ API request failed for {ticker}: {e}")
        return False  # Treat failed requests as unavailable tickers
    return response.status_code == 200  # If the API returns 200, the ticker exists

# Load existing processed records (ticker, year, quarter) from the CSV so that
# re-running the cell only fetches what is still missing.
existing_records = set()
if os.path.exists(csv_file_path):
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            # The year is taken from the first four characters of date_time.
            existing_records.add((row["ticker"], row["date_time"][:4], row["quarter"]))  # Store as (ticker, year, quarter)

# Open CSV file and append new data
with open(csv_file_path, mode="a", newline="", encoding="utf-8") as file:
    fieldnames = ["ticker", "quarter", "date_time", "speakers_list", "transcript"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # If file was empty, write the header
    if os.stat(csv_file_path).st_size == 0:
        writer.writeheader()

    # Fetch earnings call transcripts
    for ticker in companies:
        if not is_ticker_available(ticker):  # Check ticker availability first
            print(f"⚠️ Skipping {ticker}: Not found in DCF API.")
            continue  # Skip unavailable tickers

        # A failure for any (year, quarter) abandons the rest of this ticker,
        # as announced by the error message below.
        try:
            for year in years_list:
                for quarter in quarters:
                    record_key = (ticker, str(year), quarter)
                    if record_key in existing_records:
                        print(f"⏭️ Skipping {ticker} {year} {quarter}: Already in CSV.")
                        continue  # Skip already processed records

                    loader = EarningsCallTranscriptDCF(year, ticker, quarter)
                    doc = loader.load_data()
                    row_data = {
                        "ticker": ticker,
                        "quarter": quarter,
                        "date_time": doc.metadata.get("date_time", "N/A"),
                        # The reader stores speakers under "speakers_list";
                        # reading "speakers" always produced an empty column.
                        "speakers_list": ", ".join(doc.metadata.get("speakers_list", [])),
                        "transcript": doc.text,  # Full transcript text (not truncated)
                    }
                    writer.writerow(row_data)
                    # Persist progress immediately and remember the record so a
                    # duplicated ticker in the stock list is not fetched twice.
                    file.flush()
                    existing_records.add(record_key)

                    print(f"✅ Successfully fetched data for {ticker} in {year} {quarter}")

        except Exception as e:
            print(f"❌ Error fetching data for {ticker}. Skipping to next ticker. Error: {e}")
            continue  # Move to the next ticker

print(f"\n✅ Data saved to {csv_file_path}")

⏭️ Skipping NVDA 2024 Q1: Already in CSV.
⏭️ Skipping NVDA 2024 Q2: Already in CSV.
⏭️ Skipping NVDA 2024 Q3: Already in CSV.
⏭️ Skipping NVDA 2024 Q4: Already in CSV.
⏭️ Skipping NVDA 2023 Q1: Already in CSV.
⏭️ Skipping NVDA 2023 Q2: Already in CSV.
⏭️ Skipping NVDA 2023 Q3: Already in CSV.
⏭️ Skipping NVDA 2023 Q4: Already in CSV.
⏭️ Skipping NVDA 2022 Q1: Already in CSV.
⏭️ Skipping NVDA 2022 Q2: Already in CSV.
⏭️ Skipping NVDA 2022 Q3: Already in CSV.
⏭️ Skipping NVDA 2022 Q4: Already in CSV.
⏭️ Skipping NVDA 2021 Q1: Already in CSV.
⏭️ Skipping NVDA 2021 Q2: Already in CSV.
⏭️ Skipping NVDA 2021 Q3: Already in CSV.
⏭️ Skipping NVDA 2021 Q4: Already in CSV.
⏭️ Skipping NVDA 2020 Q1: Already in CSV.
⏭️ Skipping NVDA 2020 Q2: Already in CSV.
⏭️ Skipping NVDA 2020 Q3: Already in CSV.
⏭️ Skipping NVDA 2020 Q4: Already in CSV.
⏭️ Skipping NVDA 2019 Q1: Already in CSV.
⏭️ Skipping NVDA 2019 Q2: Already in CSV.
⏭️ Skipping NVDA 2019 Q3: Already in CSV.
⏭️ Skipping NVDA 2019 Q4: Already 