<a href="https://colab.research.google.com/github/Rajeswari0410/ML_Projects/blob/main/Summary_Retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install groq

Collecting groq
  Downloading groq-0.30.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.30.0-py3-none-any.whl (131 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.30.0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
from pathlib import Path
from google.colab import userdata
from groq import Groq

folderpath = "/content/drive/MyDrive/Earnings2Insights/ECTsum"

key = userdata.get("Groq_Key")
client1 = Groq(api_key=key)

key = userdata.get("Groq2_key")
client2 = Groq(api_key=key)


def load_transcript(transcript_path):
    try:
        with open(transcript_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {transcript_path}: {e}")
        return None

In [8]:
import tiktoken

enc = tiktoken.get_encoding("cl100k_base") #tokenizer

def chunk_text(text, max_tokens=4000):
    tokens = enc.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = enc.decode(tokens[start:end])
        chunks.append(chunk)
        start = end
    return chunks

def generate_report(transcript_text, ecc_id, client):
    chunks = chunk_text(transcript_text)
    all_chunks_output = []

    for i, chunk in enumerate(chunks):
        prompt = f"""

Here is part {i+1} of the earnings call transcript for {ecc_id}.
Extract key financial metrics from earnings data. Output in structured format.
Output Format
Company: [Name]
Period: [Quarter/Year]
Metrics:

Revenue: [Amount, growth %]
EPS: [GAAP and/or Adjusted amounts]
Guidance: [Forward-looking statements]
Dividends: [Changes if mentioned]
Notable: [Key highlights]

Key Data Points to Extract

Revenue figures and growth percentages
Earnings per share (both GAAP and adjusted)
Profit/loss amounts
Forward guidance (ranges, percentages)
Dividend changes
Year-over-year comparisons

Processing Rules

Extract company names from any format
Convert descriptive terms: "high teens" = 15-19%, "low 30s" = 30-34%
Distinguish GAAP vs adjusted metrics
Note currency impacts when mentioned
Flag significant guidance changes

Example Output
Company: AMETEK
Period: Q1 2021
Metrics:

Revenue: $1.22B, +1% Q1; 2021 guidance: up 15-19% total, up high single digits organic
EPS: Q1 adjusted $1.07; 2021 guidance: $4.48-$4.56; Q2 2021 guidance: $1.08-$1.10
Guidance: Q2 sales expected up 30-34% vs Q2 2020
Notable: Strong forward guidance despite modest Q1 growth

Process the provided financial data using this format.

Transcript (Part {i+1}):
'''
{chunk}
'''
"""
        response = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[{"role": "user", "content": prompt}]
        )
        all_chunks_output.append(response.choices[0].message.content.strip())

    # Join chunk-level summaries
    combined_chunks = "\n\n".join(all_chunks_output)

    summary_prompt = f"""
Extract key financial metrics from earnings data. Output in structured format.
Output Format
Company: [Name]
Period: [Quarter/Year]
Metrics:

Revenue: [Amount, growth %]
EPS: [GAAP and/or Adjusted amounts]
Guidance: [Forward-looking statements]
Dividends: [Changes if mentioned]
Notable: [Key highlights]

Key Data Points to Extract

Revenue figures and growth percentages
Earnings per share (both GAAP and adjusted)
Profit/loss amounts
Forward guidance (ranges, percentages)
Dividend changes
Year-over-year comparisons

Processing Rules

Extract company names from any format
Convert descriptive terms: "high teens" = 15-19%, "low 30s" = 30-34%
Distinguish GAAP vs adjusted metrics
Note currency impacts when mentioned
Flag significant guidance changes

Example Output
Company: AMETEK
Period: Q1 2021
Metrics:

Revenue: $1.22B, +1% Q1; 2021 guidance: up 15-19% total, up high single digits organic
EPS: Q1 adjusted $1.07; 2021 guidance: $4.48-$4.56; Q2 2021 guidance: $1.08-$1.10
Guidance: Q2 sales expected up 30-34% vs Q2 2020
Notable: Strong forward guidance despite modest Q1 growth

Process the provided financial data using this format.

Partial insights:
'''
{combined_chunks}
'''
"""

    final_response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": summary_prompt}]
    )

    return final_response.choices[0].message.content.strip()


In [9]:
final_output = []
folder_list = sorted(os.listdir(folderpath))

for idx, folder_name in enumerate(folder_list):
    source_path = os.path.join(folderpath, folder_name, "source", "source.md")

    if os.path.isfile(source_path):
        print(f"Processing: {folder_name}")
        transcript = load_transcript(source_path)

        if transcript:
            try:
                current_client = client1 if idx % 2 == 0 else client2
                report = generate_report(transcript, folder_name, current_client)
                final_output.append({
                    "ECC": folder_name,
                    "Report": report
                })
            except Exception as e:
                print(f"Error generating report for {folder_name}: {e}")
        else:
            print(f"No transcript found for: {folder_name}")

Processing: ABM_q3_2021
Processing: AME_q1_2021
Processing: CFR_q3_2019
Processing: CPF_q4_2019
Processing: DNB_q2_2021
Processing: DOV_q2_2020
Processing: DX_q1_2021
Processing: FAF_q4_2020
Processing: FIS_q4_2020
Processing: FN_q2_2021
Processing: FSS_q2_2021
Processing: GCO_q1_2022
Processing: GD_q1_2021
Processing: GLW_q1_2021
Processing: GNW_q2_2021
Processing: HR_q2_2021
Processing: HTH_q4_2020
Processing: JBL_q4_2021
Processing: KMT_q1_2022
Processing: KW_q3_2021
Processing: LH_q3_2021
Processing: LNN_q3_2021
Processing: LYB_q3_2021
Processing: MDT_q2_2022
Processing: MKC_q2_2021
Processing: MSI_q3_2021
Processing: MYE_q2_2020
Processing: NEE_q3_2021
Processing: NPO_q2_2021
Processing: OHI_q4_2021
Processing: RPM_q3_2022
Processing: SF_q3_2020
Processing: SWN_q2_2021
Processing: SYY_q2_2021
Processing: TK_q1_2021
Processing: TT_q1_2021
Processing: UVE_q4_2020
Processing: VMI_q1_2021
Processing: VSH_q2_2021
Processing: WWW_q3_2021


In [10]:
final_output

[{'ECC': 'ABM_q3_2021',
  'Report': 'Here is the extracted key financial metrics in the specified format:\n\n**Company:** ABM Industries Incorporated\n**Period:** Q3 2021\n**Metrics:**\n\n**Revenue:** $1.54B, +10.7% year-over-year\n**EPS:**\n  - GAAP: -$0.20\n  - Adjusted: $0.90, +20% year-over-year\n**Guidance:**\n  - Full year fiscal 2021 adjusted EPS guidance: $3.45-$3.55, up from $3.30-$3.50 previously\n**Dividends:**\n  - Paid 221st consecutive quarterly dividend of $0.19 per common share\n  - Declared 222nd consecutive quarterly dividend, payable November 1, 2021\n**Notable:**\n  - Strong third quarter results with double-digit revenue growth, solid cash generation, and a 20% gain in adjusted EPS\n  - Virus protection services remained elevated, but eased slightly in the quarter\n  - Acquisition of Able Services expected to be accretive to adjusted EPS from day one, with estimated $30 million to $40 million in cost savings synergies'},
 {'ECC': 'AME_q1_2021',
  'Report': 'Here is

In [11]:
with open("/content/summaries.txt", "w", encoding="utf-8") as f:
    for entry in final_output:
        f.write(f"=== {entry['ECC']} ===\n{entry['Report']}\n\n")


In [14]:
final_professional_output = []
prof_folderpath = "/content/drive/MyDrive/Earnings2Insights/Professional"
for folder_name in os.listdir(prof_folderpath):
    prof_path = os.path.join(prof_folderpath, folder_name, "source", "source.md")

    if os.path.isfile(prof_path):
        print(f"Processing: {folder_name}")
        transcript = load_transcript(prof_path)

        if transcript:
            try:
                cur_client = client1 if idx % 2 == 0 else client2
                prof_report = generate_report(transcript, folder_name, cur_client)
                final_professional_output.append({
                    "ECC": folder_name,
                    "Report": prof_report
                })
            except Exception as e:
                print(f"Error generating report for {folder_name}: {e}")
        else:
            print(f"No transcript found for: {folder_name}")

Processing: CMI_q4_2015
Processing: DE_q2_2014
Processing: UNH_q2_2014
Processing: DE_q3_2014
Processing: PCAR_q1_2014
Processing: DE_q3_2013
Processing: ETN_q1_2014
Processing: CMI_q3_2014
Processing: PCAR_q1_2015
Processing: CMI_q4_2014
Processing: PCAR_q1_2016
Processing: CMI_q4_2013
Processing: PCAR_q2_2015
Processing: PCAR_q2_2014
Processing: PCAR_q4_2014
Processing: DE_q1_2014
Processing: DE_q4_2014
Processing: DE_q4_2012
Processing: PCAR_q3_2015
Processing: PCAR_q4_2015
Error generating report for PCAR_q4_2015: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-70b-8192` in organization `org_01jzgg6fh0enmrvm08t87bedbx` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499495, Requested 4673. Please try again in 12m0.1574s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Processing: WYNN_q2_2014
Error generating report for WYNN_q2_2014: Error 

In [15]:
with open("/content/prof_summaries.txt", "w", encoding="utf-8") as f:
    for entry in final_professional_output:
        f.write(f"=== {entry['ECC']} ===\n{entry['Report']}\n\n")