# Web scraping stock market news for Sentiment Analysis

## 1. Install/import libraries

In [94]:
%pip install pandas nltk cloudscraper beautifulsoup4 htmldate newspaper3k lxml_html_clean

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Standard library
import os
import re
import time
import random
import asyncio
import warnings
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor,  as_completed

# Third-party libraries
import pandas as pd
import nltk
import nest_asyncio
import cloudscraper
from htmldate import find_date
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newspaper import Article

warnings.filterwarnings('ignore')
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data collection



In [2]:
# ——— CONFIG ———
COMPANY = "us-spx-500"  # company/index identifier used in the investing.com news URL
MAX_PAGE = 1000         # highest listing page number to request (pagination)
MAX_WORKERS = 50        # thread count for the listing-page ThreadPoolExecutor
MAX_RETRIES = 5         # attempts per page, and number of whole-run retry rounds

# Browser-like request headers, sent so the requests resemble a normal visit
# that arrived from the site's own front page.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": _ACCEPT,
    "Referer": "https://www.investing.com/",
}

# cloudscraper session (used to get past Cloudflare protections), configured
# to present itself as Chrome on Windows.
scraper = cloudscraper.create_scraper(browser=dict(browser='chrome', platform='windows'))

def fetch_page(page: int):
    """Fetch news listing page ``page`` and return the list of article hrefs on it.

    Each attempt requests the page, raises on a 4XX/5XX status and parses the
    article-title links. A failed attempt is retried after an exponential
    backoff with random jitter; once MAX_RETRIES attempts have failed the error
    is printed and an empty list is returned.
    """
    url = f"https://www.investing.com/indices/{COMPANY}-news/{page}"
    # CSS selector for the title link of every article in the news list
    link_selector = (
        'ul[data-test="news-list"] '
        'li article a[data-test="article-title-link"]'
    )

    attempt = 1
    while attempt <= MAX_RETRIES:
        try:
            response = scraper.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            document = BeautifulSoup(response.text, "lxml")
            hrefs = []
            for anchor in document.select(link_selector):
                if anchor.has_attr("href"):
                    hrefs.append(anchor["href"])
            return hrefs
        except Exception as err:
            if attempt == MAX_RETRIES:
                # Out of attempts: report and fall through to the empty result
                print(f"Page {page} failed after {MAX_RETRIES}: {err}")
            else:
                # 1s, 2s, 4s, ... plus up to 1s of jitter
                time.sleep(2 ** (attempt - 1) + random.random())
        attempt += 1

    return []

def robust_scrape():
    """Collect article URLs from listing pages 1..MAX_PAGE and return them de-duplicated.

    Page 1 is fetched first to learn how many links a full page holds; the
    remaining pages are fetched on a thread pool. Pages whose link count
    differs from page 1 are re-fetched for up to MAX_RETRIES rounds.

    Returns:
        list of unique URLs, ordered by page number and then by position on
        the page (first occurrence wins).

    Raises:
        RuntimeError: if page 1 yields no links.
    """
    first = fetch_page(1)
    per_page = len(first)
    if per_page == 0:
        raise RuntimeError("Failed to fetch the first page. Please check headers or cookies and try again.")
    expected = per_page * MAX_PAGE
    print(f"Detected {per_page} links per page, expecting {expected} total")

    results = {1: first}

    def incomplete_pages():
        # NOTE(review): a genuinely shorter final page would be retried every
        # round under this rule; confirm whether that can happen for MAX_PAGE.
        return [p for p, page_links in results.items() if len(page_links) != per_page]

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:

        def fetch_into_results(pages):
            futures = {pool.submit(fetch_page, p): p for p in pages}
            for fut in as_completed(futures):
                page_no = futures[fut]
                page_links = fut.result()
                # fetch_page returns [] on failure; do not let a failed retry
                # throw away links an earlier attempt already collected.
                if page_links or not results.get(page_no):
                    results[page_no] = page_links

        fetch_into_results(range(2, MAX_PAGE + 1))

        # Re-check after every round (including the last one) so a successful
        # final retry is reported as success rather than "limit reached".
        bad = incomplete_pages()
        retries_done = 0
        while bad and retries_done < MAX_RETRIES:
            retries_done += 1
            print(f"Retry round {retries_done} for pages: {bad}")
            fetch_into_results(bad)
            bad = incomplete_pages()

        if bad:
            print("Retry limit reached; some pages may still be incomplete.")
        else:
            print(f"All pages OK after {retries_done} retries")

    total_fetched = sum(len(page_links) for page_links in results.values())
    print(f"Total links fetched (including duplicates): {total_fetched} (expected {expected})")

    # dict.fromkeys de-duplicates while keeping a deterministic page/position order
    all_links = list(dict.fromkeys(
        link for page_no in sorted(results) for link in results[page_no]
    ))
    print(f"Final: got {len(all_links)} unique URLs (expected {expected})")
    return all_links

# Entry point guard: runs the listing scrape and binds the resulting URL list
# to `links`, which the later cells read.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this guard only matters if the cell is exported to an importable module.
if __name__ == "__main__":
    links = robust_scrape()

Detected 10 links per page, expecting 10000 total
All pages OK after 0 retries
Total links fetched (including duplicates): 10000 (expected 10000)
Final: got 10000 unique URLs (expected 10000)


In [3]:
links

['https://www.investing.com/news/stock-market-news/us-stocks-mixed-at-close-of-trade-dow-jones-industrial-average-down-017-3483985',
 'https://www.investing.com/news/stock-market-news/citi-warns-positioning-so-onesided-that-cpi-surprise-may-break-the-rally-432SI-3333730',
 'https://www.investing.com/news/stock-market-news/wall-street-fear-gauge-jumps-to-8month-high-as-stocks-sell-off-3968382',
 'https://www.investing.com/news/politics/what-does-a-second-trump-term-mean-for-investors-3721133',
 'https://www.investing.com/news/economy-news/options-market-positioned-for-us-treasury-10year-yield-to-hit-5-in-near-term-3806178',
 'https://www.investing.com/news/stock-market-news/walgreens-shares-dip-as-market-sees-mixed-results-93CH-3237168',
 'https://www.investing.com/news/stock-market-news/dow-jones-nasdaq-sp-500-weekly-preview-better-than-feared-q2-results-worse-than-expected-reactions-432SI-3146880',
 'https://www.investing.com/news/stock-market-news/be-prepared-for-another-78-potential

In [4]:
# Persist the collected URLs, one per line
with open('SPX_urls.txt', 'w') as url_file:
    url_file.writelines(f"{link}\n" for link in links)

In [5]:
# Apply nest_asyncio to allow running asyncio within Jupyter notebook
# (Jupyter already runs an event loop, and main() calls run_until_complete()).
nest_asyncio.apply()

# Configuration parameters for parallel processing and network requests.
# os.cpu_count() may return None when the count cannot be determined, so both
# worker settings share one guarded value.
_CPU_COUNT = os.cpu_count() or 4
# Threads used to download article HTML (capped at 32)
FETCH_WORKERS = min(32, _CPU_COUNT * 4)
# Threads used to parse and score articles
PROCESS_WORKERS = _CPU_COUNT
# Maximum retry attempts for failed network requests
MAX_FETCH_RETRIES = 3
# Delay between retry attempts (seconds)
RETRY_DELAY = 1
# Stock ticker symbol used for labeling data
TICKER = 'SPX'
# Create cloudscraper instance to bypass anti-bot measures.
# NOTE: this rebinds the module-level `scraper` that fetch_page() also reads,
# so listing-page requests made after this cell use this default-configured
# instance instead of the chrome/windows one created in the config cell.
scraper = cloudscraper.create_scraper()

def is_placeholder(html: str) -> bool:
    """
    Tell whether ``html`` is a placeholder/error page rather than real content.

    The check is a case-insensitive search for a few known marker phrases
    (maintenance notices and the "Just a moment" interstitial). Empty or
    missing HTML is not considered a placeholder.
    """
    if not html:
        return False

    haystack = html.lower()
    markers = (
        'temporarily down for maintenance',
        'just a moment',
        "we're temporarily down",
    )
    return any(marker in haystack for marker in markers)

def safe_find_datetime(url, html_content=None):
    """
    Extract the publication date and time for an article.

    Strategies, in order:
    1. htmldate's find_date() on the URL (date only; time reported as "00:00")
    2. A US-style "MM/DD/YYYY, HH:MM AM/PM" stamp found in html_content
    3. A numeric "NN/NN/YYYY, HH:MM" 24-hour stamp, tried as day-first and
       then as month-first
    4. The current local date and time when nothing else matched

    Args:
        url: Article URL.
        html_content: Optional HTML text of the article to search.

    Returns:
        tuple of (date_string "YYYY-MM-DD", time_string "HH:MM")
    """
    try:
        # NOTE(review): find_date() is handed the URL rather than the HTML that
        # was already downloaded; per the htmldate docs a URL input is fetched
        # by the library, i.e. a second request per article - confirm intended.
        dt = find_date(url)
        if dt:
            return dt, "00:00"  # date found, but no time of day available
    except Exception:
        # Best effort: any failure here just moves on to the HTML strategies
        pass

    if html_content:
        # Strategy 2: 12-hour stamp. The clock and the AM/PM marker are
        # captured separately and re-joined with exactly one space, because
        # strptime's "%I:%M %p" needs whitespace there and the page text may
        # have none ("10:30AM") or several characters of it.
        m = re.search(r"(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2})\s*(AM|PM)", html_content)
        if m:
            ds, clock, meridiem = m.groups()
            try:
                dt = datetime.strptime(f"{ds}, {clock} {meridiem}", "%m/%d/%Y, %I:%M %p")
                return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
            except ValueError:
                pass

        # Strategy 3: 24-hour stamp with an ambiguous numeric date
        m = re.search(r"(\d{2}/\d{2}/\d{4}),\s*(\d{2}:\d{2})", html_content)
        if m:
            ds, ts = m.groups()
            # Day-first is tried before month-first, so a date valid in both
            # (e.g. 05/08/2025) is read as day/month.
            for fmt in ("%d/%m/%Y, %H:%M", "%m/%d/%Y, %H:%M"):
                try:
                    dt = datetime.strptime(f"{ds}, {ts}", fmt)
                    return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
                except ValueError:
                    continue

    # Fallback: current date and time. Callers cannot tell this apart from a
    # real publication timestamp.
    now = datetime.now()
    return now.strftime("%Y-%m-%d"), now.strftime("%H:%M")

# ——— Fetch HTML with minimal retry mechanism ———
def fetch_html(url, idx, total):
    """
    Download the HTML for one article URL, retrying on failure.

    An attempt fails when the request raises, the server answers with a
    4XX/5XX status, or the body is recognised as a placeholder page.

    Args:
        url: The URL to fetch
        idx: Current index (for progress reporting)
        total: Total number of URLs (for progress reporting)

    Returns:
        tuple of (url, html_content) or (url, None) if every attempt failed
    """
    for attempt in range(1, MAX_FETCH_RETRIES + 1):
        try:
            resp = scraper.get(url, timeout=30)
            # Reject HTTP error responses, as fetch_page() does; otherwise an
            # error page (e.g. a 404 body) is parsed and scored as an article.
            resp.raise_for_status()
            html = resp.text

            # Check if we got a placeholder page instead of content
            if is_placeholder(html):
                raise RuntimeError('Placeholder')

            print(f"[Fetch][{idx}/{total}][ok]")
            return url, html

        except Exception as exc:
            # Include the failure type so blocked, missing and timed-out
            # requests can be told apart in the log
            print(f"[Fetch][{idx}/{total}][retry {attempt}] {type(exc).__name__}")
            if attempt < MAX_FETCH_RETRIES:
                # Wait before retrying
                time.sleep(RETRY_DELAY)

    # Report complete failure after all retries
    print(f"[Fetch error] {idx}/{total}: failed after {MAX_FETCH_RETRIES} retries")
    return url, None

# ——— Initialize sentiment analyzer from NLTK ———
SID = SentimentIntensityAnalyzer()

def process_article(arg):
    """
    Turn one fetched page into a record with text, timestamp and sentiment.

    The HTML is parsed with newspaper's Article, the body text is scored with
    the module-level VADER analyzer (SID), and the publication date/time comes
    from safe_find_datetime().

    Args:
        arg: tuple of (url, html_content)

    Returns:
        dict with keys ticker, publish_date, publish_time, title, body_text,
        url plus the VADER scores, or None when there is no HTML or parsing
        fails
    """
    url, html = arg
    if not html:
        return None

    # Keep construction, HTML injection and parsing in one guarded step so a
    # single malformed page yields None instead of raising out of the worker.
    try:
        art = Article(url)
        art.set_html(html)
        art.parse()
    except Exception:
        return None

    text = art.text or ""
    title = (art.title or "").strip() or "No title"

    # VADER polarity scores for the body text
    pol = SID.polarity_scores(text)

    date, tm = safe_find_datetime(url, html)

    return {
        'ticker': TICKER,
        'publish_date': date,
        'publish_time': tm,
        'title': title,
        'body_text': text,
        'url': url,
        **pol,
    }

async def scrape_all(urls):
    """
    Fetch and process every URL, returning the results as a DataFrame.

    Two phases:
    1. Download all pages concurrently on a thread pool (awaited via the
       running event loop)
    2. Parse/score the successfully downloaded pages on a second thread pool

    Args:
        urls: List of URLs to process

    Returns:
        pandas DataFrame with one row per successfully processed article
    """
    total = len(urls)
    # Inside a coroutine the running loop is the one to schedule on
    loop = asyncio.get_running_loop()

    # Phase 1: fetch HTML for all URLs in parallel
    with ThreadPoolExecutor(max_workers=FETCH_WORKERS) as fetch_pool:
        fetch_tasks = [loop.run_in_executor(fetch_pool, fetch_html, u, i + 1, total)
                       for i, u in enumerate(urls)]
        fetched = await asyncio.gather(*fetch_tasks)

    # Only pages that actually came back with HTML are processed; progress is
    # reported against that count rather than the number of input URLs.
    to_process = [fr for fr in fetched if fr[1]]
    n_process = len(to_process)

    # Phase 2: process the fetched HTML in parallel
    records = []
    with ThreadPoolExecutor(max_workers=PROCESS_WORKERS) as proc_pool:
        futures = {proc_pool.submit(process_article, fr): fr[0] for fr in to_process}

        for i, fut in enumerate(as_completed(futures), 1):
            article_url = futures[fut]
            try:
                res = fut.result()
            except Exception as exc:
                # One failing article must not discard everything collected so far
                print(f"[Process][{i}/{n_process}][error] {article_url}: {exc}")
                continue
            print(f"[Process][{i}/{n_process}] {article_url}")
            if res:
                records.append(res)

    return pd.DataFrame(records)

# ——— Main entry point function ———
def main(links):
    """
    Run the full fetch-and-process pipeline synchronously.

    Args:
        links: List of URLs to process

    Returns:
        pandas DataFrame produced by scrape_all()
    """
    event_loop = asyncio.get_event_loop()
    pipeline = scrape_all(links)
    # Block until the coroutine has finished and hand back its DataFrame
    return event_loop.run_until_complete(pipeline)

# Run the pipeline on the collected links, then print the frame's shape and first rows
if __name__ == '__main__':
    df = main(links)
    print(df.shape, df.head(), sep="\n")

[Fetch][21/10000][retry 1]
[Fetch][23/10000][retry 1]
[Fetch][27/10000][retry 1]
[Fetch][26/10000][retry 1]
[Fetch][32/10000][retry 1]
[Fetch][8/10000][ok]
[Fetch][11/10000][ok]
[Fetch][6/10000][ok]
[Fetch][3/10000][ok]
[Fetch][1/10000][ok]
[Fetch][16/10000][ok]
[Fetch][5/10000][ok]
[Fetch][4/10000][ok]
[Fetch][13/10000][ok]
[Fetch][9/10000][ok]
[Fetch][10/10000][ok]
[Fetch][2/10000][ok]
[Fetch][19/10000][ok]
[Fetch][18/10000][ok]
[Fetch][14/10000][ok]
[Fetch][15/10000][ok]
[Fetch][30/10000][ok]
[Fetch][20/10000][ok]
[Fetch][7/10000][ok]
[Fetch][25/10000][ok]
[Fetch][31/10000][ok]
[Fetch][29/10000][ok]
[Fetch][35/10000][ok]
[Fetch][34/10000][ok]
[Fetch][37/10000][ok]
[Fetch][24/10000][ok]
[Fetch][38/10000][ok]
[Fetch][39/10000][ok]
[Fetch][36/10000][ok]
[Fetch][33/10000][ok]
[Fetch][40/10000][ok]
[Fetch][41/10000][ok]
[Fetch][22/10000][ok]
[Fetch][44/10000][ok]
[Fetch][45/10000][ok]
[Fetch][12/10000][ok]
[Fetch][46/10000][ok]
[Fetch][47/10000][ok]
[Fetch][50/10000][ok]
[Fetch][51/10000

In [6]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,SPX,2024-06-18,06:16,IQVIA Holdings receives Investment Bank Analys...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/quintis-trl...,0.076,0.893,0.032,-0.8437
1,SPX,2025-04-07,03:24,Be prepared for another 7-8% potential downsid...,Investing.com -- Morgan Stanley is warning inv...,https://www.investing.com/news/stock-market-ne...,0.128,0.824,0.047,-0.9875
2,SPX,2023-11-16,21:20,Walgreens shares dip as market sees mixed resu...,NEW YORK - Walgreens Boots Alliance Inc. (NASD...,https://www.investing.com/news/stock-market-ne...,0.031,0.865,0.104,0.9682
3,SPX,2025-01-31,08:04,Vanguard dials back diversity language for US ...,By Ross Kerber\n\n(Reuters) - Top mutual fund ...,https://www.investing.com/news/stock-market-ne...,0.036,0.915,0.049,0.4401
4,SPX,2025-01-23,16:48,'dovish hike' incoming? By Reuters,By Jamie McGeever\n\n(Reuters) - A look at the...,https://www.investing.com/news/economy-news/mo...,0.041,0.873,0.086,0.9730
...,...,...,...,...,...,...,...,...,...,...
9701,SPX,2025-02-05,05:42,"Barclays on investing in Trump era: ""Look thro...",Investing.com -- Equities have had a strong st...,https://www.investing.com/news/stock-market-ne...,0.084,0.733,0.182,0.9940
9702,SPX,2025-02-04,08:34,Evercore upgrades Marriott to outperform on st...,Investing.com -- Evercore ISI upgraded Marriot...,https://www.investing.com/news/stock-market-ne...,0.061,0.823,0.115,0.9382
9703,SPX,2023-07-06,06:10,Wall St logs sharp losses as labor market stre...,"By Lewis Krauskopf, Bansari Mayur Kamdar and J...",https://www.investing.com/news/economy/wall-st...,0.067,0.821,0.111,0.9795
9704,SPX,2025-04-07,09:04,TSX closes down amidst tariff chaos By Investi...,Investing.com -- Canada’s main stock index con...,https://www.investing.com/news/stock-market-ne...,0.069,0.877,0.054,-0.8248


In [7]:
# Newest first: order by publish_date, then by publish_time within a date
sort_keys = ['publish_date', 'publish_time']
df = df.sort_values(by=sort_keys, ascending=False).reset_index(drop=True)

In [8]:
# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None

In [9]:
# Count rows whose body_text is the empty string
empty_body_count = int(df['body_text'].eq('').sum())
print(f"Number of articles with empty body_text: {empty_body_count}")

Number of articles with empty body_text: 0


In [10]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,SPX,2025-05-08,20:28,US stock futures steady after Wall St jumps on...,Investing.com-- U.S. stock futures were largel...,https://www.investing.com/news/stock-market-ne...,0.059,0.844,0.097,0.8979
1,SPX,2025-05-08,17:08,"Trading Day: Tariff tensions cool, markets siz...","By Jamie McGeever\n\nORLANDO, Florida (Reuters...",https://www.investing.com/news/economy-news/tr...,0.096,0.798,0.107,0.9497
2,SPX,2025-05-08,16:30,U.S. stocks higher at close of trade; Dow Jone...,Investing.com – U.S. stocks were higher after ...,https://www.investing.com/news/stock-market-ne...,0.058,0.869,0.074,0.7003
3,SPX,2025-05-08,15:44,Trump Seeks Tax Hike On Wealthy Who Earn $2.5 ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/trump-seeks...,0.076,0.893,0.032,-0.8437
4,SPX,2025-05-08,15:23,US weighs plan to slash China tariffs to as lo...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/us-weighs-p...,0.076,0.893,0.032,-0.8437
...,...,...,...,...,...,...,...,...,...,...
9701,SPX,2023-06-30,09:34,Apple's market value ends above $3 trillion fo...,By Noel Randewich and Tiyashi Datta\n\n(Reuter...,https://www.investing.com/news/stock-market-ne...,0.017,0.834,0.149,0.9949
9702,SPX,2023-06-30,09:26,Airline stocks set for best month in two years...,By Medha Singh\n\n(Reuters) - U.S. airline sto...,https://www.investing.com/news/stock-market-ne...,0.023,0.841,0.136,0.9922
9703,SPX,2023-06-30,06:28,"Wall St rallies; Nasdaq hits 40-yr milestone, ...","By Sinéad Carew, Sruthi Shankar and Johann M C...",https://www.investing.com/news/economy/futures...,0.017,0.826,0.158,0.9981
9704,SPX,2023-06-29,23:06,Global mergers and acquisitions plunge in Q2 b...,By Anirban Sen and Andres Gonzalez\n\nNEW YORK...,https://www.investing.com/news/economy/global-...,0.049,0.849,0.102,0.9948


In [11]:
# Work on an independent copy so the cleaning steps below leave `df` untouched
article_sentiments = df.copy(deep=True)

In [12]:
article_sentiments

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,SPX,2025-05-08,20:28,US stock futures steady after Wall St jumps on...,Investing.com-- U.S. stock futures were largel...,https://www.investing.com/news/stock-market-ne...,0.059,0.844,0.097,0.8979
1,SPX,2025-05-08,17:08,"Trading Day: Tariff tensions cool, markets siz...","By Jamie McGeever\n\nORLANDO, Florida (Reuters...",https://www.investing.com/news/economy-news/tr...,0.096,0.798,0.107,0.9497
2,SPX,2025-05-08,16:30,U.S. stocks higher at close of trade; Dow Jone...,Investing.com – U.S. stocks were higher after ...,https://www.investing.com/news/stock-market-ne...,0.058,0.869,0.074,0.7003
3,SPX,2025-05-08,15:44,Trump Seeks Tax Hike On Wealthy Who Earn $2.5 ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/trump-seeks...,0.076,0.893,0.032,-0.8437
4,SPX,2025-05-08,15:23,US weighs plan to slash China tariffs to as lo...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/us-weighs-p...,0.076,0.893,0.032,-0.8437
...,...,...,...,...,...,...,...,...,...,...
9701,SPX,2023-06-30,09:34,Apple's market value ends above $3 trillion fo...,By Noel Randewich and Tiyashi Datta\n\n(Reuter...,https://www.investing.com/news/stock-market-ne...,0.017,0.834,0.149,0.9949
9702,SPX,2023-06-30,09:26,Airline stocks set for best month in two years...,By Medha Singh\n\n(Reuters) - U.S. airline sto...,https://www.investing.com/news/stock-market-ne...,0.023,0.841,0.136,0.9922
9703,SPX,2023-06-30,06:28,"Wall St rallies; Nasdaq hits 40-yr milestone, ...","By Sinéad Carew, Sruthi Shankar and Johann M C...",https://www.investing.com/news/economy/futures...,0.017,0.826,0.158,0.9981
9704,SPX,2023-06-29,23:06,Global mergers and acquisitions plunge in Q2 b...,By Anirban Sen and Andres Gonzalez\n\nNEW YORK...,https://www.investing.com/news/economy/global-...,0.049,0.849,0.102,0.9948


In [13]:
# 1. Convert the publish_date column to datetime (unparseable values become NaT)
parsed_dates = pd.to_datetime(article_sentiments['publish_date'], errors='coerce')
article_sentiments['publish_date'] = parsed_dates

# 2. Convert the publish_time column ("HH:MM") to datetime.time values
parsed_times = pd.to_datetime(
    article_sentiments['publish_time'], format='%H:%M', errors='coerce'
).dt.time
article_sentiments['publish_time'] = parsed_times

# 3. Join both parts as text and parse them into one publish_datetime column
combined_text = parsed_dates.astype(str) + ' ' + parsed_times.astype(str)
article_sentiments['publish_datetime'] = pd.to_datetime(combined_text, errors='coerce')


In [14]:


# Assume df is the DataFrame produced by the scraping step
# Example of loading a pickle file instead (adjust the path to match your file)
# df = pd.read_pickle('/mnt/data/your_file.pkl')


def remove_html(text):
    """Return ``text`` with HTML markup stripped; non-string values pass through unchanged."""
    if not isinstance(text, str):
        return text
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_ads(text):
    """Cut the recurring ad/disclosure span out of ``text``; non-strings pass through unchanged."""
    if not isinstance(text, str):
        return text
    # Removes text matching a commonly seen ad pattern (the regex can be
    # adjusted as needed); matching is case-insensitive and spans newlines.
    ad_span = re.compile(r"here or remove ads.*?disclosureor", re.IGNORECASE | re.DOTALL)
    return ad_span.sub("", text)

def clean_text(text):
    """Strip HTML and ad boilerplate from ``text`` and normalise its whitespace.

    Non-string values (e.g. None or NaN in a DataFrame column) are returned
    unchanged, matching remove_html() and remove_ads(), instead of failing on
    ``.strip()``.

    Returns:
        The cleaned string with leading/trailing whitespace removed and every
        run of whitespace (including line breaks) collapsed to one space.
    """
    if not isinstance(text, str):
        return text
    text = remove_html(text)      # drop HTML tags
    text = remove_ads(text)       # drop the ad/disclosure span
    # Line breaks are whitespace too, so one substitution covers both the
    # newline replacement and the multi-space collapse.
    return re.sub(r'\s+', ' ', text.strip())

# Clean both text columns in place
for text_col in ('title', 'body_text'):
    article_sentiments[text_col] = article_sentiments[text_col].map(clean_text)

# Drop fully identical rows, then the date/time parts that publish_datetime replaces
article_sentiments = (
    article_sentiments
    .drop_duplicates()
    .drop(columns=['publish_date', 'publish_time'])
)

In [15]:
article_sentiments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9706 entries, 0 to 9705
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ticker            9706 non-null   object        
 1   title             9706 non-null   object        
 2   body_text         9706 non-null   object        
 3   url               9706 non-null   object        
 4   neg               9706 non-null   float64       
 5   neu               9706 non-null   float64       
 6   pos               9706 non-null   float64       
 7   compound          9706 non-null   float64       
 8   publish_datetime  9706 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 682.6+ KB


In [16]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,SPX,US stock futures steady after Wall St jumps on...,Investing.com-- U.S. stock futures were largel...,https://www.investing.com/news/stock-market-ne...,0.059,0.844,0.097,0.8979,2025-05-08 20:28:00
1,SPX,"Trading Day: Tariff tensions cool, markets siz...","By Jamie McGeever ORLANDO, Florida (Reuters) -...",https://www.investing.com/news/economy-news/tr...,0.096,0.798,0.107,0.9497,2025-05-08 17:08:00
2,SPX,U.S. stocks higher at close of trade; Dow Jone...,Investing.com – U.S. stocks were higher after ...,https://www.investing.com/news/stock-market-ne...,0.058,0.869,0.074,0.7003,2025-05-08 16:30:00
3,SPX,Trump Seeks Tax Hike On Wealthy Who Earn $2.5 ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/trump-seeks...,0.076,0.893,0.032,-0.8437,2025-05-08 15:44:00
4,SPX,US weighs plan to slash China tariffs to as lo...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/us-weighs-p...,0.076,0.893,0.032,-0.8437,2025-05-08 15:23:00
...,...,...,...,...,...,...,...,...,...
9701,SPX,Apple's market value ends above $3 trillion fo...,By Noel Randewich and Tiyashi Datta (Reuters) ...,https://www.investing.com/news/stock-market-ne...,0.017,0.834,0.149,0.9949,2023-06-30 09:34:00
9702,SPX,Airline stocks set for best month in two years...,By Medha Singh (Reuters) - U.S. airline stocks...,https://www.investing.com/news/stock-market-ne...,0.023,0.841,0.136,0.9922,2023-06-30 09:26:00
9703,SPX,"Wall St rallies; Nasdaq hits 40-yr milestone, ...","By Sinéad Carew, Sruthi Shankar and Johann M C...",https://www.investing.com/news/economy/futures...,0.017,0.826,0.158,0.9981,2023-06-30 06:28:00
9704,SPX,Global mergers and acquisitions plunge in Q2 b...,By Anirban Sen and Andres Gonzalez NEW YORK/LO...,https://www.investing.com/news/economy/global-...,0.049,0.849,0.102,0.9948,2023-06-29 23:06:00


In [17]:
# Order articles from newest to oldest and renumber the index from 0
newest_first = article_sentiments.sort_values('publish_datetime', ascending=False)
article_sentiments = newest_first.reset_index(drop=True)

In [18]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,SPX,US stock futures steady after Wall St jumps on...,Investing.com-- U.S. stock futures were largel...,https://www.investing.com/news/stock-market-ne...,0.059,0.844,0.097,0.8979,2025-05-08 20:28:00
1,SPX,"Trading Day: Tariff tensions cool, markets siz...","By Jamie McGeever ORLANDO, Florida (Reuters) -...",https://www.investing.com/news/economy-news/tr...,0.096,0.798,0.107,0.9497,2025-05-08 17:08:00
2,SPX,U.S. stocks higher at close of trade; Dow Jone...,Investing.com – U.S. stocks were higher after ...,https://www.investing.com/news/stock-market-ne...,0.058,0.869,0.074,0.7003,2025-05-08 16:30:00
3,SPX,Trump Seeks Tax Hike On Wealthy Who Earn $2.5 ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/trump-seeks...,0.076,0.893,0.032,-0.8437,2025-05-08 15:44:00
4,SPX,US weighs plan to slash China tariffs to as lo...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/us-weighs-p...,0.076,0.893,0.032,-0.8437,2025-05-08 15:23:00
...,...,...,...,...,...,...,...,...,...
9701,SPX,Apple's market value ends above $3 trillion fo...,By Noel Randewich and Tiyashi Datta (Reuters) ...,https://www.investing.com/news/stock-market-ne...,0.017,0.834,0.149,0.9949,2023-06-30 09:34:00
9702,SPX,Airline stocks set for best month in two years...,By Medha Singh (Reuters) - U.S. airline stocks...,https://www.investing.com/news/stock-market-ne...,0.023,0.841,0.136,0.9922,2023-06-30 09:26:00
9703,SPX,"Wall St rallies; Nasdaq hits 40-yr milestone, ...","By Sinéad Carew, Sruthi Shankar and Johann M C...",https://www.investing.com/news/economy/futures...,0.017,0.826,0.158,0.9981,2023-06-30 06:28:00
9704,SPX,Global mergers and acquisitions plunge in Q2 b...,By Anirban Sen and Andres Gonzalez NEW YORK/LO...,https://www.investing.com/news/economy/global-...,0.049,0.849,0.102,0.9948,2023-06-29 23:06:00


In [19]:
# Flag rows whose whole (stripped) title is one of the known non-article titles
is_junk_title = (
    article_sentiments['title']
    .str.strip()
    .str.match(r'^(Cryptocurrency News|By Investing\.com|Error 404: Page Not Found)$')
)
# Keep everything else and renumber the index
article_sentiments = article_sentiments.loc[~is_junk_title].reset_index(drop=True)


In [20]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,SPX,US stock futures steady after Wall St jumps on...,Investing.com-- U.S. stock futures were largel...,https://www.investing.com/news/stock-market-ne...,0.059,0.844,0.097,0.8979,2025-05-08 20:28:00
1,SPX,"Trading Day: Tariff tensions cool, markets siz...","By Jamie McGeever ORLANDO, Florida (Reuters) -...",https://www.investing.com/news/economy-news/tr...,0.096,0.798,0.107,0.9497,2025-05-08 17:08:00
2,SPX,U.S. stocks higher at close of trade; Dow Jone...,Investing.com – U.S. stocks were higher after ...,https://www.investing.com/news/stock-market-ne...,0.058,0.869,0.074,0.7003,2025-05-08 16:30:00
3,SPX,Trump Seeks Tax Hike On Wealthy Who Earn $2.5 ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/trump-seeks...,0.076,0.893,0.032,-0.8437,2025-05-08 15:44:00
4,SPX,US weighs plan to slash China tariffs to as lo...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/us-weighs-p...,0.076,0.893,0.032,-0.8437,2025-05-08 15:23:00
...,...,...,...,...,...,...,...,...,...
9701,SPX,Apple's market value ends above $3 trillion fo...,By Noel Randewich and Tiyashi Datta (Reuters) ...,https://www.investing.com/news/stock-market-ne...,0.017,0.834,0.149,0.9949,2023-06-30 09:34:00
9702,SPX,Airline stocks set for best month in two years...,By Medha Singh (Reuters) - U.S. airline stocks...,https://www.investing.com/news/stock-market-ne...,0.023,0.841,0.136,0.9922,2023-06-30 09:26:00
9703,SPX,"Wall St rallies; Nasdaq hits 40-yr milestone, ...","By Sinéad Carew, Sruthi Shankar and Johann M C...",https://www.investing.com/news/economy/futures...,0.017,0.826,0.158,0.9981,2023-06-30 06:28:00
9704,SPX,Global mergers and acquisitions plunge in Q2 b...,By Anirban Sen and Andres Gonzalez NEW YORK/LO...,https://www.investing.com/news/economy/global-...,0.049,0.849,0.102,0.9948,2023-06-29 23:06:00


In [21]:
# Build the file names from TICKER (rows are labelled with it) so that
# switching the ticker cannot silently overwrite another ticker's export.
output_stem = f"{TICKER}_article_sentiments"

article_sentiments.to_pickle(f"{output_stem}.pkl")

article_sentiments.to_csv(f"{output_stem}.csv", index=False)

In [None]:
# Tickers to model, the price-history window, and the next-day return
# thresholds used for the up / down labels
TICKERS = ['NVDA', 'AAPL', 'AMZN', 'GOOGL', 'MSFT', 'META', 'TSLA']
START_DATE, END_DATE = '2009-07-01', '2025-05-03'
TH_UP = 0.005
TH_DOWN = -0.005

# Per-ticker article sentiment frames.
# NOTE(review): read_pickle executes whatever the pickle contains - only load
# files from a trusted source.
df_nvda = pd.read_pickle('data/NVDA_article_sentiments.pkl')
df_appl = pd.read_pickle('data/AAPL_article_sentiments.pkl')
df_amzn = pd.read_pickle('data/AMZN_article_sentiments.pkl')
df_googl = pd.read_pickle('data/GOOGL_article_sentiments.pkl')
df_msft = pd.read_pickle('data/MSFT_article_sentiments.pkl')
df_meta = pd.read_pickle('data/META_article_sentiments.pkl')
df_tsla = pd.read_pickle('data/TSLA_article_sentiments.pkl')

# Stack all tickers and add a calendar-date column taken from publish_datetime
per_ticker_news = [df_nvda, df_appl, df_amzn, df_googl, df_msft, df_meta, df_tsla]
df_news = pd.concat(per_ticker_news, axis=0, ignore_index=True).assign(
    date=lambda frame: pd.to_datetime(frame['publish_datetime']).dt.date
)

In [None]:
import yfinance as yf

# A price row is kept only if every one of these columns is non-NaN.
PRICE_REQUIRED_COLS = [
    'open_t','high_t','low_t','close_t','volume_t',
    'MA5_t','MA10_t','momentum5_t','ATR14_t','RSI14_t','MACD_hist_t',
    'future_return_1d','label',
]


def _add_price_features(prices, th_up, th_down):
    """Add technical indicators and the next-day label to one ticker's daily bars.

    Parameters
    ----------
    prices : pd.DataFrame
        Daily bars with single-level columns 'Open', 'High', 'Low', 'Close'
        and 'Volume', in chronological order.
    th_up, th_down : float
        Thresholds on the next-day return: label is 2 when the return is
        above ``th_up``, 0 when it is below ``th_down``, and 1 otherwise.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with the OHLCV columns renamed
        to ``*_t``, the indicator and target columns appended, and every row
        that has a NaN in any of ``PRICE_REQUIRED_COLS`` removed. This drops
        the indicator warm-up rows and the final row, which has no next close.
    """
    p = prices.rename(columns={
        'Open':'open_t','High':'high_t','Low':'low_t',
        'Close':'close_t','Volume':'volume_t'
    })
    close = p['close_t']
    prev_close = close.shift(1)

    # Moving averages and 5-row momentum of the close.
    p['MA5_t']       = close.rolling(5).mean()
    p['MA10_t']      = close.rolling(10).mean()
    p['momentum5_t'] = close - close.shift(5)

    # ATR(14): 14-row simple mean of the true range, where true range is the
    # largest of high-low, |high - previous close| and |low - previous close|.
    true_range = pd.concat(
        [
            p['high_t'] - p['low_t'],
            (p['high_t'] - prev_close).abs(),
            (p['low_t'] - prev_close).abs(),
        ],
        axis=1,
    ).max(axis=1)
    p['ATR14_t'] = true_range.rolling(14).mean()

    # RSI(14) built from simple 14-row means of gains and losses.
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(14).mean()
    avg_loss = (-delta.clip(upper=0)).rolling(14).mean()
    rel_strength = avg_gain / avg_loss
    p['RSI14_t'] = 100 - (100 / (1 + rel_strength))

    # MACD histogram: (EMA12 - EMA26) minus its 9-span EMA signal line.
    macd = (
        close.ewm(span=12, adjust=False).mean()
        - close.ewm(span=26, adjust=False).mean()
    )
    p['MACD_hist_t'] = macd - macd.ewm(span=9, adjust=False).mean()

    # Target: next row's close and the simple return from this close to it.
    p['close_t+1']        = close.shift(-1)
    p['future_return_1d'] = (p['close_t+1'] - close) / close

    # Vectorised 3-class label. Comparisons against NaN are False, so a row
    # with no next-day return gets 1 here and is then removed by the dropna.
    ret = p['future_return_1d']
    p['label'] = 1 + (ret > th_up).astype('int64') - (ret < th_down).astype('int64')

    return p.dropna(subset=PRICE_REQUIRED_COLS)


price_dfs = []
for tk in TICKERS:
    raw = yf.download(tk, start=START_DATE, end=END_DATE, auto_adjust=False)

    # Guard against an empty result (e.g. a download that produced no rows)
    # so a missing ticker is reported instead of silently vanishing.
    # print is used because warnings are globally filtered in this notebook.
    if raw.empty:
        print(f"[warn] no price data returned for {tk}; skipping")
        continue

    # Flatten columns if MultiIndex: keep only the first level (field names).
    if isinstance(raw.columns, pd.MultiIndex):
        raw.columns = raw.columns.get_level_values(0)

    feats = _add_price_features(raw, TH_UP, TH_DOWN)

    # Move the 'Date' index into a plain 'date' column of calendar dates so it
    # can be matched against the news frame's 'date' key, and tag the ticker.
    feats = feats.reset_index().rename(columns={'Date':'date'})
    feats['date']   = feats['date'].dt.date
    feats['ticker'] = tk
    price_dfs.append(feats)


# One long frame with the rows of every ticker and a fresh 0..n-1 index.
df_price = pd.concat(price_dfs, ignore_index=True)

In [None]:
# Pair every article with the price/feature row of the same ticker and
# calendar date. The inner join keeps only keys present on both sides, so
# articles without a matching price row (and price rows without news) are dropped.
df = df_news.merge(
    df_price,
    how='inner',
    on=['ticker', 'date'],
)

In [None]:
# Persist the merged news + price frame under data/ in two formats:
# a pickle of the DataFrame and a CSV written without the index column.
df.to_pickle("data/Stock_article_sentiments.pkl")
df.to_csv("data/Stock_article_sentiments.csv", index=False) 

In [157]:
# Reload the merged news + price frame from the pickle written above, replacing
# whatever `df` currently holds in the kernel.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
df = pd.read_pickle("data/Stock_article_sentiments.pkl")

In [None]:
# Display the merged frame (pandas shows a truncated head/tail view).
# NOTE(review): in the displayed rows, every article whose body_text starts with
# "Risk Disclosure:" carries the same neg/neu/pos/compound scores — presumably the
# scraper captured the site disclaimer instead of the article body; verify upstream.
df

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime,date,Adj Close,close_t,high_t,low_t,open_t,volume_t,MA5_t,MA10_t,momentum5_t,ATR14_t,RSI14_t,MACD_hist_t,close_t+1,future_return_1d,label
0,NVDA,Deepseek Releases New Math AI Model,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/deepseek-re...,0.076,0.893,0.032,-0.8437,2025-04-30 07:25:00,2025-04-30,108.919998,108.919998,108.919998,104.080002,104.470001,235044600,108.822000,104.860000,6.209999,5.839285,43.836858,1.197380,111.610001,0.024697,2
1,NVDA,Adv Micro Device receives Investment Bank Anal...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00,2025-04-30,108.919998,108.919998,108.919998,104.080002,104.470001,235044600,108.822000,104.860000,6.209999,5.839285,43.836858,1.197380,111.610001,0.024697,2
2,NVDA,Nvidia Corp receives Investment Bank Analyst R...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00,2025-04-30,108.919998,108.919998,108.919998,104.080002,104.470001,235044600,108.822000,104.860000,6.209999,5.839285,43.836858,1.197380,111.610001,0.024697,2
3,NVDA,Taiwan’s ASE: evaluating how it will support N...,By Wen-Yee Lee and Ben Blanchard TAIPEI (Reute...,https://www.investing.com/news/stock-market-ne...,0.032,0.883,0.084,0.9423,2025-04-30 06:56:00,2025-04-30,108.919998,108.919998,108.919998,104.080002,104.470001,235044600,108.822000,104.860000,6.209999,5.839285,43.836858,1.197380,111.610001,0.024697,2
4,NVDA,"Super Micro slumps on forecast cut, analysts d...",By Aditya Soni (Reuters) -Super Micro Computer...,https://www.investing.com/news/stock-market-ne...,0.099,0.779,0.122,0.9432,2025-04-30 06:47:00,2025-04-30,108.919998,108.919998,108.919998,104.080002,104.470001,235044600,108.822000,104.860000,6.209999,5.839285,43.836858,1.197380,111.610001,0.024697,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64647,TSLA,Exclusive: Tesla supplier Panasonic eyes 20% j...,By Norihiko Shirouzu (Reuters) - Panasonic (OT...,https://www.investing.com/news/stock-market-ne...,0.042,0.817,0.141,0.9957,2022-07-13 03:59:00,2022-07-13,237.039993,237.039993,242.059998,225.033340,225.500000,97954500,239.952002,234.478668,5.306656,12.683808,50.570807,1.759872,238.313339,0.005372,2
64648,TSLA,Dow Futures Tick Higher Ahead of Key Inflation...,By Oliver Gray Investing.com - U.S. stock futu...,https://www.investing.com/news/stock-market-ne...,0.046,0.895,0.059,0.5423,2022-07-12 19:12:00,2022-07-12,233.070007,233.070007,239.773331,228.369995,236.846664,87930900,238.890671,234.041002,0.003342,12.396666,47.535828,1.935274,237.039993,0.017033,2
64649,TSLA,Twitter sues Elon Musk to hold him to $44 bill...,"By Tom Hals WILMINGTON, Del. (Reuters) - Twitt...",https://www.investing.com/news/economy/twitter...,0.101,0.822,0.077,-0.9636,2022-07-12 17:04:00,2022-07-12,233.070007,233.070007,239.773331,228.369995,236.846664,87930900,238.890671,234.041002,0.003342,12.396666,47.535828,1.935274,237.039993,0.017033,2
64650,TSLA,"Trump steps up attacks on Musk, who said Trump...",(Reuters) - Donald Trump on Tuesday ratcheted ...,https://www.investing.com/news/stock-market-ne...,0.055,0.888,0.057,0.1403,2022-07-12 02:34:00,2022-07-12,233.070007,233.070007,239.773331,228.369995,236.846664,87930900,238.890671,234.041002,0.003342,12.396666,47.535828,1.935274,237.039993,0.017033,2
