In [1]:
import json
import os
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
from time import sleep

from pydantic import BaseModel, ValidationError

In [2]:
# Load variables from the .env file one directory above the notebook into the
# process environment. The config cell reads RUNID via os.getenv — presumably
# this file is where it is defined; confirm against the project's .env.
load_dotenv('../.env')

True

In [3]:
# Name of the batch task whose outputs this notebook consolidates; used as a
# path component of INPUT_DATA_PATH and stamped on every parsed record.
TASK_NAME = "article_summarization_v0"

def get_run_id():
    """Return the value of the ``RUNID`` environment variable.

    Returns:
        The variable's string value, or ``None`` when ``RUNID`` is not set.
    """
    run_id = os.environ.get('RUNID')
    return run_id

# Identifier of the run being processed. get_run_id() yields None when the
# RUNID environment variable is unset, in which case the paths below contain
# the literal text "None".
RUNID = get_run_id()

# Directory that read_outputs() scans for this run's and task's .jsonl files.
INPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/OUTPUTS/"

# Directory the consolidated summaries are written to. The trailing slash is
# required: save_outputs() appends the file name by string concatenation.
OUTPUT_DATA_PATH = f"../local_tests_data/relevant_articles_summaries/{RUNID}/"
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

# Local wall-clock timestamp of this execution, second resolution.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_2 at 2025-06-02 16:04:23


In [4]:
def read_outputs(input_dir=None, run_id=None, task_name=None):
    """Parse batch-output ``.jsonl`` files into a flat list of summary records.

    Every file in ``input_dir`` whose name ends with ``.jsonl`` is read line by
    line; each non-blank line must be a JSON object with a ``custom_id`` of the
    form ``<a>--<b>--<article_id>`` and a chat-completion style payload under
    ``response.body``.

    Args:
        input_dir: Directory to scan. Defaults to the notebook-level
            ``INPUT_DATA_PATH``.
        run_id: Value stored under ``"run_id"`` in every record. Defaults to
            the notebook-level ``RUNID``.
        task_name: Value stored under ``"task_name"`` in every record.
            Defaults to the notebook-level ``TASK_NAME``.

    Returns:
        A list of dicts with keys ``model``, ``summary``, ``article_id``,
        ``run_id`` and ``task_name``, ordered by file name and then by line
        order within each file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a ``custom_id`` does not split into exactly three
            ``--``-separated parts.
        AttributeError, TypeError, IndexError: If a line lacks the expected
            ``response.body.choices[0].message.content`` structure (for
            example a failed request) — such lines are deliberately not
            skipped so that missing summaries are noticed.
    """
    # Resolve defaults at call time so a re-run of the config cell is honoured.
    if input_dir is None:
        input_dir = INPUT_DATA_PATH
    if run_id is None:
        run_id = RUNID
    if task_name is None:
        task_name = TASK_NAME

    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.jsonl'):
            continue
        # Explicit UTF-8: summaries contain non-ASCII text and the locale
        # default encoding is platform dependent.
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing empty line).
                if not line.strip():
                    continue
                output_dict = json.loads(line)
                body = output_dict.get("response").get("body")
                # Only the last of the three custom_id components is kept.
                _, _, article_id = output_dict.get("custom_id").split("--")
                summary = body.get("choices")[0].get("message").get("content")
                records.append({
                    "model": body.get("model"),
                    "summary": summary,
                    "article_id": article_id,
                    "run_id": run_id,
                    "task_name": task_name,
                })
    return records

# Parse every .jsonl batch output file for this run into a list of records.
outputs = read_outputs()
# Bare last expression so the notebook displays the parsed records.
outputs

[{'model': 'gpt-4o-2024-11-20',
  'summary': "Grammarly has secured a $1 billion commitment from General Catalyst's Customer Value Fund (CVF) to fuel sales, marketing, and potential acquisitions, marking a shift from traditional venture investment structures. This financing is nondilutive, allowing Grammarly to repay the capital with a capped percentage of revenue generated through the funding, preserving its existing valuation. The move aligns with CVF’s strategy to support late-stage startups with predictable revenue streams by providing alternative capital solutions tied to recurring revenue. \n\nThe funding comes as Grammarly transitions from a writing assistant to an AI productivity tool, following its acquisition of Coda and annual revenues exceeding $700 million. For business leaders, this underscores the growing attractiveness of predictable-revenue companies for innovative financing models, enabling acceleration without dilution or valuation disruption amid challenging market 

In [5]:
def save_outputs(data=None, output_dir=None, run_id=None):
    """Write the parsed summary records to a single indented JSON file.

    The file is named ``relevant_articles_summaries_<run_id>.json`` and is
    placed in ``output_dir``; its path is printed after writing.

    Args:
        data: JSON-serialisable object to write. Defaults to the
            notebook-level ``outputs`` list.
        output_dir: Destination directory, created if missing. Defaults to
            the notebook-level ``OUTPUT_DATA_PATH``.
        run_id: Run identifier embedded in the file name. Defaults to the
            notebook-level ``RUNID``.

    Returns:
        None. (Deliberately no return value, so calling this as the last
        expression of a cell does not add an ``Out[...]`` display.)
    """
    # Resolve defaults at call time rather than silently depending on
    # whichever globals happened to exist when the function was defined.
    if data is None:
        data = outputs
    if output_dir is None:
        output_dir = OUTPUT_DATA_PATH
    if run_id is None:
        run_id = RUNID

    # os.makedirs('') raises, so only create a non-empty directory path.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join works whether or not output_dir ends with a separator and
    # yields the same string as plain concatenation when it does.
    output_file = os.path.join(output_dir, f"relevant_articles_summaries_{run_id}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(f"Outputs saved to {output_file}")

# Persist the records held in the notebook-level `outputs` list to a JSON file
# under OUTPUT_DATA_PATH; the function prints the path it wrote.
save_outputs()


Outputs saved to ../local_tests_data/relevant_articles_summaries/RUNID_2/relevant_articles_summaries_RUNID_2.json
