In [None]:
from newspaper import Article
import json
import os
import time
import ssl
from datetime import datetime

In [None]:
PATH_NEWS_API = r'../data/raw/news_api'
PATH_NEWS_DATA = r'../data/raw/news_data'

PATH_SAVE_NEWS_API = r'../data/scraped/news_api'

In [None]:
# Optional: Handle SSL certificate issues
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
def load_article_json(file):
    with open(file, 'rb') as f:
        data = json.load(f)
    return data

def load_content_article(url, retries=2):
    """Load article content with simple retry for SSL errors"""
    for attempt in range(retries + 1):
        try:
            article = Article(url)
            article.download()
            article.parse()
            return article.text
        except Exception as e:
            if attempt < retries and "SSL" in str(e):
                print(f"SSL error, retrying in 10s... ({attempt + 1}/{retries + 1})")
                time.sleep(10)
                continue
            else:
                raise e

In [None]:
def pick_file():
    date = datetime.now().strftime("%Y%m%d")
    for file in os.listdir(PATH_NEWS_API):
        if date in file:
            return os.path.join(PATH_NEWS_API, file)
    return None

def main():
    file = pick_file()
    if not file:
        print("No file found for today.")
        return
    
    print(f"Using file: {file}")
    articles = load_article_json(file)
    print(f"Loaded {len(articles)} articles")
    
    output_file = os.path.join(PATH_SAVE_NEWS_API, f"content_{datetime.now().strftime('%Y%m%d')}.json")
    
    for i, article in enumerate(articles):
        if 'content' not in article or not article['content']:
            try:
                print(f"Processing {i+1}/{len(articles)}: {article['url']}")
                content = load_content_article(article['url'])
                article['content'] = content
                print("✓ Success")
            except Exception as e:
                print(f"✗ Failed: {e}")
                article['content'] = article.get('description', None)
        
        # Save every 10 articles to prevent data loss
        if (i + 1) % 10 == 0:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(articles, f, ensure_ascii=False, indent=4)
            print(f"Progress saved: {i + 1}/{len(articles)} processed")
        
        time.sleep(2)  # Be nice to servers
    
    # Final save
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=4)
    print(f"Completed! Saved to {output_file}")

In [67]:
main()

Using file: ../data/raw/news_api\20250923_144422_NVIDIA.json
Loaded 36 articles from ../data/raw/news_api\20250923_144422_NVIDIA.json
Processing article 1/36: https://www.sammobile.com/news/samsung-may-finally-have-scored-the-nvidia-win-it-desperately-needs/
✓ Successfully loaded content for article 1
✓ Successfully loaded content for article 1
Processing article 2/36: https://finance.yahoo.com/news/samsung-shares-rise-high-nvidia-022419754.html
Processing article 2/36: https://finance.yahoo.com/news/samsung-shares-rise-high-nvidia-022419754.html
✓ Successfully loaded content for article 2
✓ Successfully loaded content for article 2
Processing article 3/36: https://tech.yahoo.com/ai/articles/nvidia-abu-dhabi-institute-launch-050141205.html
Processing article 3/36: https://tech.yahoo.com/ai/articles/nvidia-abu-dhabi-institute-launch-050141205.html
✓ Successfully loaded content for article 3
✓ Successfully loaded content for article 3
Processing article 4/36: https://www.digitimes.com/ne

In [2]:
articles = [
    {
      "source": "The Verge",
      "author": "Hayden Field",
      "headline": "Nvidia is partnering up with OpenAI to offer compute and cash",
      "description": "OpenAI is teaming up with Nvidia via a “strategic partnership” that will get the ChatGPT-maker more compute and more cash to develop new models on the road to superintelligence. The partnership, announced Monday, will allow OpenAI to “build and deploy at leas…",
      "url": "https://www.theverge.com/ai-artificial-intelligence/782624/nvidia-is-partnering-up-with-openai-to-offer-compute-and-cash",
      "timestamp": "2025-09-22T16:33:35Z",
      "scraped_date": "2025-09-22"
    },
    {
      "source": "Slashdot.org",
      "author": "msmash",
      "headline": "Nvidia To Invest $100 Billion in OpenAI",
      "description": "Nvidia will invest up to $100 billion in OpenAI as the AI lab builds data centers requiring 10 gigawatts of power capacity. The 10-gigawatt deployment equals 4 to 5 million GPUs -- the same number Nvidia will ship globally this year. Building one gigawatt of …",
      "url": "https://slashdot.org/story/25/09/22/1637225/nvidia-to-invest-100-billion-in-openai",
      "timestamp": "2025-09-22T16:37:00Z",
      "scraped_date": "2025-09-22"
    },
    {
      "source": "Business Insider",
      "author": "Steven Tweedie",
      "headline": "Nvidia is investing up to $100 billion in OpenAI as part of an AI data center deal",
      "description": "The two tech titans announced a deal on Monday that will see OpenAI build out \"at least 10 gigawatts\" of AI data centers running Nvidia systems.",
      "url": "https://www.businessinsider.com/nvidia-investing-up-to-100-billion-openai-ai-deal-2025-9",
      "timestamp": "2025-09-22T16:28:45Z",
      "scraped_date": "2025-09-22"
    },
    {
      "source": "Business Insider",
      "author": "Katie Notopoulos",
      "headline": "Nvidia plans to invest $100 billion into OpenAI. That's uh, a lot of money.",
      "description": "$100 billion is worth something like 333 AI researchers with Meta salaries.",
      "url": "https://www.businessinsider.com/nvidia-100-billion-openai-data-centers-scale-2025-9",
      "timestamp": "2025-09-22T20:26:46Z",
      "scraped_date": "2025-09-22"
    },
    {
      "source": "Yahoo Entertainment",
      "author": "Yahoo Finance Video",
      "headline": "How to trade the AI boom beyond chips",
      "description": "When people talk about artificial intelligence stocks, they usually think of chip companies like Nvidia (NVDA) or companies working on large language models,...",
      "url": "https://finance.yahoo.com/video/trade-ai-boom-beyond-chips-213601170.html",
      "timestamp": "2025-09-22T21:36:01Z",
      "scraped_date": "2025-09-22"
    }
]

dataset = [
        {
            "source": "Wired",
            "author": "Jacob Roach",
            "title": "Nvidia GeForce Now RTX 5080 (Blackwell) Review: RTX 5080 in a MacBook",
            "url": "https://www.wired.com/review/nvidia-geforce-now-rtx-5080-blackwell/",
            "urlToImage": "https://media.wired.com/photos/68c223fb98347899cc465e7f/191:100/w_1280,c_limit/geforce-now-macbook-1.png",
            "publishedAt": "2025-09-11T15:30:00Z",
            "content": "It breaks open the doors of support. Instead of games that are installed and ready to go, you can install them on a GeForce Now instance (the computer you're connected to for game streaming). Even ma… [+2081 chars]"
        },
        {
            "source": {
                "id": "wired",
                "name": "Wired"
            },
            "author": "Louryn Strampe",
            "title": "41 Best Labor Day Sales on WIRED-Tested Gear (2025)",
            "description": "Summer is almost gone, and with it go these great Labor Day deals on WIRED-approved Bluetooth speakers, power banks, pizza ovens, and more.",
            "url": "https://www.wired.com/story/best-labor-day-sales-deals-2025-2/",
            "urlToImage": "https://media.wired.com/photos/68b1dd3b9bb6bcd03e004618/191:100/w_1280,c_limit/The%20Best%20Labor%20Day%20Deals%20and%20Sales.png",
            "publishedAt": "2025-09-01T14:45:42Z",
            "content": "Labor Day weekend is almost over, but there is still a pontoon boat load of deals to score while shopping with a cold beer and a hot dog. The unofficial end of summer brings with it bargains on WIRED… [+30140 chars]"
        },
        {
            "source": {
                "id": "the-verge",
                "name": "The Verge"
            },
            "author": "Sean Hollister",
            "title": "Hands-on: Nvidia’s GeForce Now RTX 5080 is better and worse than I hoped",
            "description": "Today, Nvidia is soft-launching its latest gaming GPUs in the cloud - upgrading its $20-a-month GeForce Now Ultimate cloud gaming service with RTX 5080 graphics for select games, with more to come down the road. At the same time, it's also adding thousands mo…",
            "url": "https://www.theverge.com/hands-on/775222/nvidia-geforce-now-rtx-5080-hands-on",
            "urlToImage": "https://platform.theverge.com/wp-content/uploads/sites/2/2025/08/GeForce_NOW_Blackwell_KV_.jpg?quality=90&strip=all&crop=0%2C3.4613147178592%2C100%2C93.077370564282&w=1200",
            "publishedAt": "2025-09-10T07:18:16Z",
            "content": "<ul><li></li><li></li><li></li></ul>\r\nIts a good thing it doesnt cost more.\r\nIts a good thing it doesnt cost more.\r\nToday, Nvidia is soft-launching its latest gaming GPUs in the cloud upgrading its $… [+9201 chars]"
        },
        {
            "source": {
                "id": "the-verge",
                "name": "The Verge"
            },
            "author": "Sean Hollister",
            "title": "Framework is working on a giant haptic touchpad, Trackpoint nub, and eGPU for its laptops",
            "description": "Today, Framework announced the second-gen Framework Laptop 16 with two industry firsts: the first Nvidia graphics card upgrade you can perform at home in just a couple minutes, and the first complete 240W laptop charging solution over a USB-C cable. But the c…",
            "url": "https://www.theverge.com/news/766161/framework-egpu-haptic-touchpad-trackpoint-nub",
            "urlToImage": "https://platform.theverge.com/wp-content/uploads/sites/2/2025/08/framework-16-prototypes-001.jpg?quality=90&strip=all&crop=0%2C10.732984293194%2C100%2C78.534031413613&w=1200",
            "publishedAt": "2025-08-26T18:46:19Z",
            "content": "<ul><li></li><li></li><li></li></ul>\r\nFramework shares prototypes and scrapped ideas.\r\nFramework shares prototypes and scrapped ideas.\r\nToday, Framework announced the second-gen Framework Laptop 16 w… [+3727 chars]"
        },
        {
            "source": {
                "id": "the-verge",
                "name": "The Verge"
            },
            "author": "Sean Hollister",
            "title": "Framework is now selling the first gaming laptop that lets you easily upgrade its GPU — with Nvidia’s blessing",
            "description": "Framework CEO Nirav Patel said he would deliver \"the holy grail for gamers\" with the Framework Laptop 16. In 2023, he suggested it'd be the first consumer notebook to fulfil the promise of modular, upgradable graphics cards like a desktop PC. We at The Verge …",
            "url": "https://www.theverge.com/laptops/765528/framework-is-now-selling-the-first-gaming-laptop-that-lets-you-easily-upgrade-its-gpu-with-nvidias-blessing",
            "urlToImage": "https://platform.theverge.com/wp-content/uploads/sites/2/2025/08/257914_laptop_change_bg_color_CVirginia.jpg?quality=90&strip=all&crop=0%2C10.732984293194%2C100%2C78.534031413613&w=1200",
            "publishedAt": "2025-08-26T14:27:31Z",
            "content": "<ul><li></li><li></li><li></li></ul>\r\nThe second-gen Framework Laptop 16 is coming in November but the bigger deal is an upgradable laptop GPU.\r\nThe second-gen Framework Laptop 16 is coming in Novemb… [+7761 chars]"
        }
]


temp_articles = set(dataset)
for article in articles:
    temp_articles.add(article)


print(f"Total unique articles: {len(temp_articles)}")
for article in temp_articles:
    print(dict(article))

TypeError: unhashable type: 'dict'