In [39]:
import requests
import json 
from dotenv import load_dotenv
import os
from openai import AzureOpenAI
from datetime import datetime
from pydantic import BaseModel
from typing import List, Optional

In [40]:
# Load variables from a .env file (python-dotenv) into the environment so the
# os.getenv() lookups for the Azure OpenAI settings in the next cell can find them.
load_dotenv()

True

In [41]:
# Azure OpenAI connection settings, looked up in the process environment.
# Either value is None when the corresponding variable is not set.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")

# Single shared client used for the chat-completion calls further down.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version="2024-12-01-preview",
)

In [42]:
# News sources to scan for articles. Reference ChatGPT conversation for this list:
# https://chatgpt.com/share/682f3278-f380-8011-8c1f-fefff4390074
sources = [
    {'source_name': name, 'url': url}
    for name, url in (
        ('TechCrunch', 'https://techcrunch.com/latest/'),
        ('ITEspresso', 'https://www.itespresso.es/'),
        ('Business Insider', 'https://www.businessinsider.es/tecnologia'),
        ('CNET', 'https://www.cnet.com/ai-atlas/'),
        ('The Next Web', 'https://thenextweb.com/'),
    )
]

In [43]:
# Schema for a single article extracted from a source page.
# NOTE(review): documented with `#` comments rather than a docstring because pydantic
# copies a model docstring into the generated JSON schema ("description"), which
# would change the response_format sent to the model — confirm before adding one.
class Article(BaseModel):
    title: str
    url: str
    keywords: List[str]
    # None is allowed as a value, but no default is declared for this field.
    author: Optional[List[str]]

# Wrapper model holding all articles extracted from one page. It is passed as the
# response_format of the structured-output call in get_source_articles().
class ArticleList(BaseModel):
    articles: List[Article]

def get_source_articles(
    source_url: str,
    timeout: float = 60.0,
    model: str = "gpt-4o-mini",
) -> List["Article"]:
    """
    Get the articles listed on the source URL.

    The page is fetched through the ``https://r.jina.ai/`` prefix URL and the
    response text is handed to the chat model, which is asked to extract the
    articles into the ``ArticleList`` schema.

    Args:
        source_url: URL of the page that lists the articles.
        timeout: Seconds to wait on the page request before giving up.
        model: Model name passed to the chat-completion call.

    Returns:
        The ``Article`` objects parsed from the model response. The prompt asks
        the model to return an empty list when the page contains no article list.

    Raises:
        requests.exceptions.RequestException: If the page request fails or
            times out.
        ValueError: If the model response carries no parsed ``ArticleList``
            (for example because the model refused to answer).
    """

    # A timeout keeps an unresponsive site from hanging the notebook forever.
    # HTTP error pages are not rejected here: the system prompt asks the model
    # to answer with an empty list for them.
    source_url_jina = 'https://r.jina.ai/' + source_url
    source_url_raw_content = requests.get(source_url_jina, timeout=timeout).text

    messages = [
            {
                "role": "system", 
                "content": "Extract an article list from the following page content. Do not make up any information that's not in the provided text. Current date is " + datetime.now().strftime("%Y-%m-%d") + " If the provided content text contains no articles list (for instance due to a 'page not found' error), return an empty list."
            },
            {
                "role": "user", 
                "content": source_url_raw_content
            }
            ]

    completion = client.beta.chat.completions.parse(
        model=model, 
        messages=messages, 
        temperature=0.2,
        response_format=ArticleList
    )
    message = completion.choices[0].message

    # `parsed` is None when the model produced no schema-conforming output;
    # fail with a clear message instead of an AttributeError on `.articles`.
    if message.parsed is None:
        raise ValueError(
            f"No parsed article list returned for {source_url!r} "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )

    return message.parsed.articles

In [None]:
# Extract the article list of every configured source and store each one as
# <source_name>.json (lower-cased, spaces replaced by underscores).
ARTICLES_DIR = 'local_tests_data/articles_list/'

# Create the output folder up front; without it open() raises FileNotFoundError
# on a fresh checkout, after the first (paid) extraction call has already run.
os.makedirs(ARTICLES_DIR, exist_ok=True)

for source in sources:
    source_name = source['source_name']
    source_url = source['url']
    print(f"Getting articles from {source_url}")
    # model_dump() turns each pydantic Article into a plain dict for json.dump.
    articles = [a.model_dump() for a in get_source_articles(source_url)]
    print(f"Found {len(articles)} articles.")
    save_name = source_name.replace(" ", "_").lower() + ".json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(os.path.join(ARTICLES_DIR, save_name), 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=4)

Getting articles from https://techcrunch.com/latest/
Found 7 articles.
Getting articles from https://www.itespresso.es/
Found 21 articles.
Getting articles from https://www.businessinsider.es/tecnologia
Found 20 articles.
Getting articles from https://www.cnet.com/ai-atlas/
Found 0 articles.
Getting articles from https://thenextweb.com/
Found 14 articles.


In [None]:
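# Scratch experiment (disabled): download and print the full text of one
# extracted article using the `newspaper` package.
# NOTE(review): in the cells shown here `response` is only assigned inside
# get_source_articles(), so it must be defined at notebook scope before this
# snippet is re-enabled.

# response.articles[0].url

# from newspaper import Article as NewspaperArticle

# narticle = NewspaperArticle(response.articles[0].url)
# narticle.download()
# narticle.parse()
# print(narticle.text)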

'https://thenextweb.com/news/tips-on-ai-agents'