# Web Scraping from Investing.com
- Scrape article links from the trending (most-popular) news listing, then download and parse each linked article

In [1]:
import os
import re
import time
import random
import asyncio
import warnings
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor,  as_completed
import pandas as pd
import nest_asyncio
import cloudscraper
from htmldate import find_date
from bs4 import BeautifulSoup
from newspaper import Article

warnings.filterwarnings('ignore')

In [6]:
import cloudscraper
from bs4 import BeautifulSoup
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# CONFIGURATION
# MAX_WORKERS sizes the thread pool used to fetch listing pages concurrently.
# MAX_RETRIES bounds both the per-page attempts in fetch_page() and the number
# of re-fetch rounds in robust_scrape().
MAX_WORKERS = 10
MAX_RETRIES = 5

# Browser-like request headers sent with every listing-page request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    "Referer": "https://www.investing.com/",
}

# Shared HTTP client, created with a Chrome-on-Windows browser profile.
# NOTE(review): a later cell re-assigns the name `scraper` without this browser
# profile; functions look the name up at call time, so which client is used
# depends on which cells have been run.
scraper = cloudscraper.create_scraper(
    browser={'browser': 'chrome', 'platform': 'windows'}
)

def fetch_page(page: int):
    """Collect the article links shown on one page of the most-popular-news listing.

    Page 1 is served from the bare listing URL; any later page appends
    ``/<page>`` to it. Every exception raised while requesting or parsing the
    page (including an HTTP error status) counts as a failed attempt. Failed
    attempts are retried after an exponentially growing, jittered pause until
    ``MAX_RETRIES`` attempts have been made.

    Args:
        page: 1-based page number of the listing.

    Returns:
        The ``href`` values of the article-title anchors on the page, or an
        empty list when every attempt failed.
    """
    base_url = "https://www.investing.com/news/most-popular-news"
    url = base_url + f"/{page}" if page > 1 else base_url

    attempt = 1
    while attempt <= MAX_RETRIES:
        try:
            response = scraper.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            document = BeautifulSoup(response.text, "lxml")

            title_anchors = document.select(
                'ul[data-test="news-list"] '
                'li article a[data-test="article-title-link"]'
            )
            hrefs = []
            for anchor in title_anchors:
                if anchor.has_attr("href"):
                    hrefs.append(anchor["href"])
            return hrefs
        except Exception as e:
            if attempt == MAX_RETRIES:
                # Out of attempts: report the last error and fall through.
                print(f"Page {page} failed after {MAX_RETRIES}: {e}")
            else:
                # 1s, 2s, 4s, ... plus up to one second of random jitter.
                pause = 2 ** (attempt - 1) + random.random()
                time.sleep(pause)
        attempt += 1
    return []

def _fetch_pages(pool, pages):
    """Run fetch_page() for each page number on ``pool``; return ``{page: links}``."""
    futures = {pool.submit(fetch_page, p): p for p in pages}
    return {futures[fut]: fut.result() for fut in as_completed(futures)}


def robust_scrape(max_pages=10):
    """Collect the unique article URLs from the first ``max_pages`` listing pages.

    Page 1 is fetched first and its link count becomes the expected count for
    every page. The remaining pages are fetched concurrently. Any page whose
    link count differs from page 1 is fetched again, for at most
    ``MAX_RETRIES`` rounds.

    NOTE(review): a final listing page that legitimately holds fewer links than
    page 1 is indistinguishable from a partial fetch here, so it will be
    re-fetched in every retry round.

    Args:
        max_pages: Number of listing pages to scrape (pages 1..max_pages).
            Values below 1 are treated like 1.

    Returns:
        A list of unique URLs, ordered by page number and then by position on
        the page.

    Raises:
        RuntimeError: If page 1 yields no links.
    """
    first = fetch_page(1)
    per_page = len(first)
    if per_page == 0:
        raise RuntimeError("Failed to fetch the first page. Please check headers or cookies.")

    print(f"Fetched {per_page} links on page 1")

    # Nothing more to fetch: return page 1, de-duplicated like the multi-page path.
    if max_pages <= 1:
        print("Only one page detected or no pagination found.")
        return list(dict.fromkeys(first))

    results = {1: first}

    def incomplete_pages():
        return sorted(p for p, links in results.items() if len(links) != per_page)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        results.update(_fetch_pages(pool, range(2, max_pages + 1)))

        retries_done = 0
        bad = incomplete_pages()
        while bad and retries_done < MAX_RETRIES:
            retries_done += 1
            print(f"Retry round {retries_done} for pages: {bad}")
            results.update(_fetch_pages(pool, bad))
            # Re-check AFTER the round so a successful last round is reported
            # as success rather than as "retry limit reached".
            bad = incomplete_pages()

    if bad:
        print("Retry limit reached; some pages may still be incomplete.")
    else:
        print(f"All pages OK after {retries_done} retries")

    total_fetched = sum(len(links) for links in results.values())
    expected = per_page * max_pages
    print(f"Total links fetched (including duplicates): {total_fetched} (expected {expected})")

    # dict.fromkeys() removes duplicates while keeping first-seen order, so the
    # result is deterministic (a set would return the links in arbitrary order).
    in_page_order = (link for page in sorted(results) for link in results[page])
    all_links = list(dict.fromkeys(in_page_order))
    print(f"Final: got {len(all_links)} unique URLs")
    return all_links

# Collect the article URLs used by the later cells. Only page 1 of the listing
# is requested here.
if __name__ == "__main__":
    links = robust_scrape(max_pages=1)  # You can adjust this to the actual known number of pages


Fetched 20 links on page 1
Only one page detected or no pagination found.


In [7]:
# Spot-check: display the second URL in the scraped list.
links[1]

'https://www.investing.com/news/stock-market-news/25-tariff-not-enough-to-push-apple-to-reshore-iphone-production--morgan-stanley-4064984'

In [9]:
# Patch asyncio so run_until_complete() can be used from inside an already
# running event loop (main() below relies on this when run in a notebook).
nest_asyncio.apply()

# os.cpu_count() can return None when the CPU count cannot be determined, so
# both pool sizes use the same fallback instead of multiplying None by 4.
FETCH_WORKERS = min(32, (os.cpu_count() or 4) * 4)  # threads for downloading articles
PROCESS_WORKERS = os.cpu_count() or 4                # threads for parsing articles
MAX_FETCH_RETRIES = 3                                # attempts per article URL
RETRY_DELAY = 1                                      # seconds between attempts
# NOTE(review): this re-binds the module-level `scraper` (this time without a
# browser profile); every function that reads `scraper` will use this client
# once this cell has run.
scraper = cloudscraper.create_scraper()

def is_placeholder(html: str) -> bool:
    """Tell whether ``html`` looks like an interstitial page rather than an article.

    The check is a case-insensitive substring search for a few known marker
    phrases. Empty or missing input is never a placeholder.

    Args:
        html: Page source to inspect; may be empty or ``None``.

    Returns:
        ``True`` if any marker phrase occurs in the page, otherwise ``False``.
    """
    if not html:
        return False

    haystack = html.lower()
    markers = (
        'temporarily down for maintenance',
        'just a moment',
        "we're temporarily down",
    )
    return any(marker in haystack for marker in markers)

def safe_find_datetime(url, html_content=None):
    """Best-effort extraction of an article's publish date and time.

    Strategies are tried in order; the first one that succeeds wins:

    1. ``find_date(url)``. On success its result is returned as the date and
       the time is reported as ``"00:00"``.
    2. A ``M/D/YYYY, H:MM AM|PM`` stamp found in ``html_content`` (month first,
       12-hour clock; whitespace before AM/PM is optional).
    3. A ``NN/NN/YYYY, HH:MM`` 24-hour stamp found in ``html_content``, parsed
       day-first and, if that is not a valid date, month-first.
    4. The current local date and time.

    Args:
        url: Article URL handed to ``find_date``.
        html_content: Page source searched by strategies 2 and 3; skipped when
            empty or ``None``.

    Returns:
        A ``(date, time)`` tuple. For strategies 2-4 these are strings formatted
        as ``YYYY-MM-DD`` and ``HH:MM``; for strategy 1 the date is whatever
        ``find_date`` returned.
    """
    # Strategy 1: ask htmldate.
    # NOTE(review): find_date() is given the URL, not the HTML already in hand;
    # per the htmldate docs it presumably downloads the page itself - confirm
    # whether html_content should be passed instead.
    try:
        found = find_date(url)
    except Exception:
        # Best effort: any failure here just means "try the next strategy".
        found = None
    if found:
        return found, "00:00"

    if html_content:
        # Strategy 2: American date with a 12-hour clock. The meridiem is
        # captured separately and re-joined with exactly one space, because the
        # regex tolerates "02:15PM" while the strptime format requires
        # whitespace before %p.
        stamp = re.search(
            r"(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2})\s*(AM|PM)", html_content
        )
        if stamp:
            date_part, clock, meridiem = stamp.groups()
            try:
                parsed = datetime.strptime(
                    f"{date_part}, {clock} {meridiem}", "%m/%d/%Y, %I:%M %p"
                )
                return parsed.strftime("%Y-%m-%d"), parsed.strftime("%H:%M")
            except ValueError:
                pass  # matched the pattern but is not a real date/time

        # Strategy 3: numeric date with a 24-hour clock.
        stamp = re.search(r"(\d{2}/\d{2}/\d{4}),\s*(\d{2}:\d{2})", html_content)
        if stamp:
            date_part, clock = stamp.groups()
            # Day-first is tried before month-first, so an ambiguous stamp such
            # as 03/04/2025 is read as 3 April.
            for fmt in ("%d/%m/%Y, %H:%M", "%m/%d/%Y, %H:%M"):
                try:
                    parsed = datetime.strptime(f"{date_part}, {clock}", fmt)
                    return parsed.strftime("%Y-%m-%d"), parsed.strftime("%H:%M")
                except ValueError:
                    continue

    # Strategy 4: nothing usable was found; fall back to "now".
    now = datetime.now()
    return now.strftime("%Y-%m-%d"), now.strftime("%H:%M")

def fetch_html(url, idx, total):
    """Download one article page, retrying on failure.

    An attempt fails when the request raises, when the server answers with an
    HTTP error status, or when the body is recognised by is_placeholder().
    Failed attempts are retried after ``RETRY_DELAY`` seconds, up to
    ``MAX_FETCH_RETRIES`` attempts in total.

    Args:
        url: Article URL to download.
        idx: 1-based position of this URL in the batch (progress output only).
        total: Size of the batch (progress output only).

    Returns:
        ``(url, html)`` on success, or ``(url, None)`` when every attempt failed.
    """
    for attempt in range(1, MAX_FETCH_RETRIES + 1):
        try:
            resp = scraper.get(url, timeout=30)
            # Treat 4xx/5xx answers as failures (as fetch_page does) so an error
            # page is never handed on as if it were article HTML.
            resp.raise_for_status()
            html = resp.text
            if is_placeholder(html):
                raise RuntimeError('Placeholder')

            print(f"[Fetch][{idx}/{total}][ok]")
            return url, html

        except Exception:
            print(f"[Fetch][{idx}/{total}][retry {attempt}]")
            if attempt < MAX_FETCH_RETRIES:
                time.sleep(RETRY_DELAY)

    print(f"[Fetch error] {idx}/{total}: failed after {MAX_FETCH_RETRIES} retries")
    return url, None

def process_article(arg):
    """Turn one fetched page into a flat article record.

    Args:
        arg: A ``(url, html)`` pair as returned by fetch_html(); ``html`` may be
            ``None`` or empty when the download failed.

    Returns:
        A dict with the keys ``publish_date``, ``publish_time``, ``title``,
        ``body_text`` and ``url``, or ``None`` when there is no HTML or the
        article parser raises.
    """
    url, html = arg
    if not html:
        return None

    art = Article(url)
    art.set_html(html)

    try:
        art.parse()
    except Exception:
        # A page the parser cannot handle is skipped. Only Exception is caught
        # so KeyboardInterrupt / SystemExit still propagate.
        return None

    text = art.text or ""
    title = (art.title or "").strip() or "No title"

    date, tm = safe_find_datetime(url, html)

    # Assemble the record; the key order here is the column order of the
    # resulting DataFrame.
    return {'publish_date': date, 'publish_time': tm,
            'title': title, 'body_text': text, 'url': url}

async def scrape_all(urls):
    """Download and parse every URL, returning the results as a DataFrame.

    Phase 1 downloads all pages on a thread pool of ``FETCH_WORKERS`` threads
    and awaits them together. Phase 2 parses the successfully downloaded pages
    on a pool of ``PROCESS_WORKERS`` threads; this phase waits synchronously
    with ``as_completed`` and therefore blocks the event loop while it runs.

    Args:
        urls: Sequence of article URLs.

    Returns:
        A DataFrame with the columns ``publish_date``, ``publish_time``,
        ``title``, ``body_text`` and ``url``, one row per parsed article, in
        completion order. The columns are present even when no article could
        be parsed.
    """
    total = len(urls)
    # Inside a coroutine the running loop is the one to schedule on.
    loop = asyncio.get_running_loop()

    # Phase 1: fetch HTML content from all URLs in parallel.
    with ThreadPoolExecutor(max_workers=FETCH_WORKERS) as fetch_pool:
        fetch_tasks = [
            loop.run_in_executor(fetch_pool, fetch_html, u, i, total)
            for i, u in enumerate(urls, 1)
        ]
        fetched = await asyncio.gather(*fetch_tasks)

    # Phase 2: parse only the pages whose download succeeded.
    records = []
    with ThreadPoolExecutor(max_workers=PROCESS_WORKERS) as proc_pool:
        futures = {
            proc_pool.submit(process_article, fr): fr[0]
            for fr in fetched if fr[1]  # skip URLs where HTML is None/empty
        }
        # Progress is reported against the number of submitted jobs, not
        # against len(urls), so the counter reaches N/N when downloads failed.
        to_process = len(futures)

        for i, fut in enumerate(as_completed(futures), 1):
            res = fut.result()
            print(f"[Process][{i}/{to_process}] {futures[fut]}")
            if res:
                records.append(res)

    # Fixing the columns keeps the frame usable (e.g. sortable by
    # publish_date) even when `records` is empty.
    columns = ['publish_date', 'publish_time', 'title', 'body_text', 'url']
    return pd.DataFrame(records, columns=columns)

# ——— Main entry point function ———
def main(links):
    """Run scrape_all() to completion on the current event loop.

    Args:
        links: Article URLs to download and parse.

    Returns:
        The DataFrame produced by scrape_all().
    """
    event_loop = asyncio.get_event_loop()
    return event_loop.run_until_complete(scrape_all(links))

# Run the article scrape over the previously collected URLs when this code is
# executed as the main module. `links` must already be defined (it is assigned
# by the listing-scrape step earlier in the notebook).
if __name__ == '__main__':
    df = main(links)

[Fetch][11/20][retry 1]
[Fetch][19/20][retry 1]
[Fetch][14/20][retry 1]
[Fetch][2/20][ok]
[Fetch][3/20][ok]
[Fetch][10/20][ok]
[Fetch][1/20][ok]
[Fetch][12/20][ok]
[Fetch][15/20][ok]
[Fetch][4/20][ok]
[Fetch][16/20][ok]
[Fetch][17/20][ok]
[Fetch][9/20][ok]
[Fetch][13/20][ok]
[Fetch][5/20][ok]
[Fetch][7/20][ok]
[Fetch][20/20][ok]
[Fetch][8/20][ok]
[Fetch][18/20][ok]
[Fetch][6/20][ok]
[Fetch][11/20][ok]
[Fetch][14/20][ok]
[Fetch][19/20][ok]
[Process][1/20] https://www.investing.com/news/economy-news/what-could-be-the-impact-of-a-50tariff-on-eu-economy-4064349
[Process][2/20] https://www.investing.com/news/stock-market-news/next-300-points-likely-up-for-the-sp-500-says-jpmorgan-4064865
[Process][3/20] https://www.investing.com/news/stock-market-news/new-surveys-show-declining-interest-in-evs-and-the-tesla-brand-around-the-world-4064783
[Process][4/20] https://www.investing.com/news/stock-market-news/citi-sees-up-to-8-downside-for-stoxx-600-on-trump-tariff-risk-4064531
[Process][5/20] http

In [12]:
# Order the articles newest-first (date, then time) and renumber the rows.
df = (
    df
    .sort_values(by=['publish_date', 'publish_time'], ascending=[False, False])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,publish_date,publish_time,title,body_text,url
0,2025-05-27,10:33,US Consumer bounces back sharply in May By Inv...,Investing.com -- Consumer confidence rebounded...,https://www.investing.com/news/economy-news/us...
1,2025-05-27,10:29,Eli Lilly expands pain pipeline with acquisiti...,Investing.com -- Eli Lilly and Company (NYSE: ...,https://www.investing.com/news/stock-market-ne...
2,2025-05-27,09:41,"Lynx: Risk priced in, Nvidia could surprise to...",Investing.com -- Nvidia stock could break out ...,https://www.investing.com/news/stock-market-ne...
3,2025-05-27,09:16,Trump Media & Technology Group Announces $2.5B...,Trump Media and Technology Group Corp. (Nasdaq...,https://www.investing.com/news/cryptocurrency-...
4,2025-05-27,09:15,SoundHound shares jump as Piper Sandler starts...,Investing.com -- Piper Sandler has initiated c...,https://www.investing.com/news/stock-market-ne...


In [13]:
# Show every column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

In [14]:
# Tally the rows whose body_text is exactly the empty string.
empty_body_count = int((df['body_text'] == '').sum())
print(f"Number of articles with empty body_text: {empty_body_count}")

Number of articles with empty body_text: 0


In [15]:
# Save the scraped articles to a CSV named after the current local time,
# e.g. Data/Headlines/2025-05-27_10-45-00.csv.
now = datetime.now()
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

# Create the target folder on first use; to_csv() does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = "Data/Headlines"
os.makedirs(output_dir, exist_ok=True)
df.to_csv(f"{output_dir}/{date_time}.csv", index=False)