In [30]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
from time import sleep

In [3]:
load_dotenv('../.env')

True

In [22]:
TASK_NAME = "source_parsing_v0"

INPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/"
run_n = len(os.listdir(INPUT_DATA_PATH))
RUNID = f"RUNID_{run_n}"
INPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/OUTPUTS/"

RUN_TIME = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')

OUTPUT_DATA_PATH = f"../local_tests_data/raw_articles_list/{RUNID}/"

os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_1 at 2025-05-29 17:16:04


In [42]:
def read_outputs():
    outputs = []
    for filename in os.listdir(INPUT_DATA_PATH):
        if filename.endswith('.jsonl'):
            with open(os.path.join(INPUT_DATA_PATH, filename), 'r') as f:
                    lines_raw = f.readlines()
                    for line in lines_raw:
                        output_dict = json.loads(line)
                        model = output_dict.get("response").get("body").get("model")
                        line_id = output_dict.get("custom_id")
                        content_json = output_dict.get("response").get("body").get("choices")[0].get("message").get("content")
                        content = json.loads(content_json)
                        outputs.append({
                            "model": model,
                            "line_id": line_id,
                            "content": content
                        })
    return outputs

import glob
import os

def get_previously_crawled_article_titles():

    root_dir = Path(OUTPUT_DATA_PATH).parent
    pattern = '*.json'               

    search_path = os.path.join(root_dir, '**', pattern)
    all_filenames = glob.glob(search_path, recursive=True)

    previously_crawled_article_titles = []
    for filepath in all_filenames:
        with open(filepath, 'r') as file:
            data = json.load(file)
            for item in data:
                if 'article_title' in item:
                    previously_crawled_article_titles.append(item['article_title'])

    return previously_crawled_article_titles

In [43]:
get_previously_crawled_article_titles()

['Bioprinted organs ‘10–15 years away,’ says startup regenerating dog skin',
 '‘Purest meat alternative’ to grow in Swedish mycoprotein factory',
 'Can geothermal startups drill Europe to clean energy independence?',
 'TNW Backstage tunes into the future of earbuds',
 'Elon Musk’s Grok chatbot banned by a quarter of European firms',
 'ENISA ya cuenta con más de medio millar de startups certificadas',
 '7 metas que deberán marcar la relación de las empresas con la tecnología en 2024',
 'Apple es la compañía tecnológica que más ganancias tiene por empleado',
 'Cómo mantener el impulso de tu startup durante la época navideña',
 'Google paga 5.000 millones de dólares para resolver una demanda colectiva',
 'Meta, la matriz de Facebook, planea abrir tiendas físicas para vender sus gafas inteligentes',
 'Palo sideral a Elon Musk: esto es lo que cuesta el megacohete Starship que ha explotado sobre el Índico',
 'El ecosistema tecnológico español crece un 22% y Madrid desplaza a Barcelona como c

In [34]:
previously_crawled_article_titles = get_previously_crawled_article_titles()

new_raw_articles_list = []
for output in read_outputs():
    model = output['model']
    line_id = output['line_id']
    run_id, task_name, source_name = line_id.split("--")
    content = output['content']

    article_links_list = content.get("article_links_list", [])
    for article_link in article_links_list:
        article_title = article_link.get("title", "")
        article_url = article_link.get("url", "")
        article_keywords = article_link.get("keywords", [])
        article_language = article_link.get("language", "")
        article_id = source_name + "--" + datetime.now().strftime('%Y%m%d%H%M%S%f')

        sleep(0.01)

        if article_title in previously_crawled_article_titles:
            print(f"Skipping {article_title} as it has already been crawled.")
            continue

        new_raw_articles_list.append({
            "model": model,
            "run_id": run_id,
            "task_name": task_name,
            "source_name": source_name,
            "article_id": article_id,
            "article_title": article_title,
            "article_url": article_url,
            "article_keywords": article_keywords,
            "article_language": article_language,
            "crawled_at": RUN_TIME
        })



In [35]:
new_raw_articles_list

[{'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_1',
  'task_name': 'source_parsing_v0',
  'source_name': 'the_next_web',
  'article_id': 'the_next_web--20250529174107305291',
  'article_title': 'Bioprinted organs ‘10–15 years away,’ says startup regenerating dog skin',
  'article_url': 'https://thenextweb.com/news/bioprinting-human-organs-in-10-years',
  'article_keywords': ['bioprinting', 'human organs', 'deep tech'],
  'article_language': 'en',
  'crawled_at': '2025-05-29 17:16:04'},
 {'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_1',
  'task_name': 'source_parsing_v0',
  'source_name': 'the_next_web',
  'article_id': 'the_next_web--20250529174107315426',
  'article_title': '‘Purest meat alternative’ to grow in Swedish mycoprotein factory',
  'article_url': 'https://thenextweb.com/news/purest-meat-alternative-to-grow-in-swedish-mycoprotein-factory',
  'article_keywords': ['meat alternative', 'sustainability', 'factory'],
  'article_language': 'en',
  'crawled_at': '2025-05-29 1

In [36]:
with open(os.path.join(OUTPUT_DATA_PATH, f"raw_articles_list_{RUNID}.json"), 'w') as f:
    json.dump(new_raw_articles_list, f, indent=4)
print(f"New raw articles list saved to {OUTPUT_DATA_PATH}raw_articles_list_{RUNID}.json")

New raw articles list saved to ../local_tests_data/raw_articles_list/RUNID_1/raw_articles_list_RUNID_1.json
