# Web scraping stock market news for Sentiment Analysis

## 1. Install/import libraries

In [None]:
%pip install pandas nltk cloudscraper beautifulsoup4 htmldate newspaper3k lxml_html_clean

^C


In [85]:
# Standard library
import os
import re
import time
import random
import asyncio
import warnings
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor,  as_completed

# Third-party libraries
import pandas as pd
import nltk
import nest_asyncio
import cloudscraper
from htmldate import find_date
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newspaper import Article

warnings.filterwarnings('ignore')
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data collection



In [91]:
# ——— CONFIG ———
# Company ticker identifier for investing.com
COMPANY     = "apple-computer-inc"   
# Maximum number of pages to scrape (pagination)
MAX_PAGE    = 1000             
# Number of concurrent workers for ThreadPoolExecutor
MAX_WORKERS = 50              
# Maximum number of retry attempts for failed requests
MAX_RETRIES = 5              

# HTTP headers to mimic a legitimate web browser request
# This helps avoid detection as a bot/scraper
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    # Referrer to appear as if coming from the main site
    "Referer": "https://www.investing.com/",
}

# Create a cloudscraper instance to bypass Cloudflare protections
scraper = cloudscraper.create_scraper(
    browser={'browser': 'chrome', 'platform': 'windows'}
)

def fetch_page(page: int):
    """ดึงลิงก์ข่าวจากหน้า {page} แล้วคืน list ของ href"""
    # Construct URL for the specific page number
    url = f"https://www.investing.com/equities/{COMPANY}-news/{page}"
    # Implement retry logic for resilience
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Make HTTP request with timeout
            r = scraper.get(url, headers=HEADERS, timeout=30)
            r.raise_for_status()  # Raise exception for 4XX/5XX responses
            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(r.text, "lxml")
            # CSS selector to find news article links
            anchors = soup.select(
                'ul[data-test="news-list"] '
                'li article a[data-test="article-title-link"]'
            )
            # Extract href attributes from anchor tags
            return [a["href"] for a in anchors if a.has_attr("href")]
        except Exception as e:
            # Exponential backoff with jitter for retries
            if attempt < MAX_RETRIES:
                backoff = 2 ** (attempt - 1) + random.random()
                time.sleep(backoff)
            else:
                # Log failure after exhausting retries
                print(f"Page {page} failed after {MAX_RETRIES}: {e}")
    # Return empty list if all attempts fail
    return []

def robust_scrape():
    # First, fetch page 1 to determine how many links per page
    first = fetch_page(1)
    PER_PAGE = len(first)
    # Validate we got results from first page
    if PER_PAGE == 0:
        raise RuntimeError("Failed to fetch the first page. Please check headers or cookies and try again.")
    print(f"Detected {PER_PAGE} links per page, expecting {PER_PAGE * MAX_PAGE} total")

    # Initialize results dictionary with first page
    results = {1: first}
    # Create list of remaining pages to fetch
    pages = list(range(2, MAX_PAGE + 1))

    # Use thread pool for parallel execution
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Submit fetch jobs for all pages
        futures = {pool.submit(fetch_page, p): p for p in pages}
        # Process completed futures as they finish
        for fut in as_completed(futures):
            p = futures[fut]
            results[p] = fut.result()

        # Retry logic for pages with incomplete results
        for round in range(1, MAX_RETRIES + 1):
            # Identify pages with fewer links than expected
            bad = [p for p, links in results.items() if len(links) != PER_PAGE]
            if not bad:
                print(f"All pages OK after {round-1} retries")
                break

            # Retry only the problematic pages
            print(f"Retry round {round} for pages: {bad}")
            futures = {pool.submit(fetch_page, p): p for p in bad}
            for fut in as_completed(futures):
                p = futures[fut]
                results[p] = fut.result()
        else:
            # This executes if the for loop completes without breaking
            print("Retry limit reached; some pages may still be incomplete.")

    # Calculate statistics on results
    total_fetched = sum(len(links) for links in results.values())
    expected = PER_PAGE * MAX_PAGE
    print(f"Total links fetched (including duplicates): {total_fetched} (expected {expected})")

    # Deduplicate links using set conversion
    all_links = set(link for links in results.values() for link in links)
    print(f"Final: got {len(all_links)} unique URLs (expected {expected})")
    return list(all_links)

# Entry point guard - only executes when run directly
if __name__ == "__main__":
    links = robust_scrape()

Detected 10 links per page, expecting 10000 total
All pages OK after 0 retries
Total links fetched (including duplicates): 10000 (expected 10000)
Final: got 9998 unique URLs (expected 10000)


In [92]:
links

['https://www.investing.com/news/economy/apple-cloud-chief-abbott-to-step-down-in-april--bloomberg-news-3022039',
 'https://www.investing.com/news/stock-market-news/apple-working-on-foldable-iphone-could-be-released-in-2026--the-information-3531496',
 'https://www.investing.com/news/stock-market-news/spotify-raises-us-prices-of-its-premium-plans-in-margin-push-3467411',
 'https://www.investing.com/news/pro/qualcomm-inc-receives-investment-bank-analyst-rating-update-3715160',
 'https://www.investing.com/news/stock-market-news/muchneeded-shot-in-the-arm-analysts-weigh-in-on-apples-ipad-event-432SI-3425307',
 'https://www.investing.com/news/stock-market-news/exclusive-eu-mulls-new-unit-with-antitrust-veterans-to-enforce-tech-rules--sources-2856893',
 'https://www.investing.com/news/stock-market-news/apple-shares-in-sweet-spot-as-traders-rush-to-safety-2695849',
 'https://www.investing.com/news/stock-market-news/global-pc-market-seen-to-recover-late-2023--reports-2978334',
 'https://www.in

In [93]:
with open('aapl_urls.txt', 'w') as f:
    for link in links:
        f.write("%s\n" % link)

In [94]:
# Apply nest_asyncio to allow running asyncio within Jupyter notebook
# This is necessary because Jupyter already uses an event loop
nest_asyncio.apply()

# Configuration parameters for parallel processing and network requests
# Calculate optimal number of workers for fetching HTML (limited to 32)
FETCH_WORKERS = min(32, os.cpu_count() * 4)  
# Use available CPU cores for processing articles 
PROCESS_WORKERS = os.cpu_count() or 4       
# Maximum retry attempts for failed network requests
MAX_FETCH_RETRIES = 3                      
# Delay between retry attempts (seconds)
RETRY_DELAY = 1                             
# Stock ticker symbol used for labeling data
TICKER = 'AAPL'
# Create cloudscraper instance to bypass anti-bot measures
scraper = cloudscraper.create_scraper()

def is_placeholder(html: str) -> bool:
    """
    Check if the HTML content is a placeholder/error page instead of actual content.
    
    These placeholders are common when websites detect scraping or have technical issues.
    """
    lower = html.lower() if html else ""
    return (
        'temporarily down for maintenance' in lower
        or 'just a moment' in lower
        or "we're temporarily down" in lower
    )

def safe_find_datetime(url, html_content=None):
    """
    Extract publication date and time from the URL or HTML content.
    
    Uses multiple strategies with fallbacks:
    1. Try htmldate library on URL
    2. Look for American format dates (MM/DD/YYYY, HH:MM AM/PM) in content
    3. Try both European and American formats (DD/MM/YYYY or MM/DD/YYYY)
    4. Fallback to current date and time if all else fails
    
    Returns: tuple of (date_string, time_string)
    """
    try:
        # Strategy 1: Use htmldate library to extract date from URL
        dt = find_date(url)
        if dt:
            return dt, "00:00"  # Return with default time if date found
    except:
        pass
    
    if html_content:
        # Strategy 2: Look for American format with AM/PM
        m = re.search(r"(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2}\s*(?:AM|PM))", html_content)
        if m:
            ds, ts = m.groups()
            try:
                dt = datetime.strptime(f"{ds}, {ts}", "%m/%d/%Y, %I:%M %p")
                return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
            except:
                pass
        
        # Strategy 3: Look for numeric dates with 24-hour time format
        m = re.search(r"(\d{2}/\d{2}/\d{4}),\s*(\d{2}:\d{2})", html_content)
        if m:
            ds, ts = m.groups()
            # Try both European and American date formats
            for fmt in ("%d/%m/%Y, %H:%M", "%m/%d/%Y, %H:%M"):
                try:
                    dt = datetime.strptime(f"{ds}, {ts}", fmt)
                    return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
                except:
                    continue
    
    # Fallback: Use current date and time
    now = datetime.now()
    return now.strftime("%Y-%m-%d"), now.strftime("%H:%M")

# ——— Fetch HTML with minimal retry mechanism ———
def fetch_html(url, idx, total):
    """
    Fetch the HTML content for a given URL with retry logic.
    
    Args:
        url: The URL to fetch
        idx: Current index (for progress reporting)
        total: Total number of URLs (for progress reporting)
        
    Returns:
        tuple of (url, html_content) or (url, None) if failed
    """
    for attempt in range(1, MAX_FETCH_RETRIES + 1):
        try:
            # Make HTTP request with timeout
            resp = scraper.get(url, timeout=30)
            html = resp.text
            
            # Check if we got a placeholder page instead of content
            if is_placeholder(html):
                raise RuntimeError('Placeholder')
                
            # Report success
            print(f"[Fetch][{idx}/{total}][ok]")
            return url, html
            
        except Exception:
            # Report retry attempt
            print(f"[Fetch][{idx}/{total}][retry {attempt}]")
            if attempt < MAX_FETCH_RETRIES:
                # Wait before retrying
                time.sleep(RETRY_DELAY)
                
    # Report complete failure after all retries
    print(f"[Fetch error] {idx}/{total}: failed after {MAX_FETCH_RETRIES} retries")
    return url, None

# ——— Initialize sentiment analyzer from NLTK ———
SID = SentimentIntensityAnalyzer()

def process_article(arg):
    """
    Process an article's HTML to extract content and analyze sentiment.
    
    Uses newspaper3k library to parse article content and NLTK's VADER
    for sentiment analysis.
    
    Args:
        arg: tuple of (url, html_content)
        
    Returns:
        dictionary with extracted data and sentiment scores, or None if processing failed
    """
    url, html = arg
    if not html:
        return None
        
    # Initialize newspaper Article object
    art = Article(url)
    art.set_html(html)
    
    try:
        # Parse article content
        art.parse()
    except:
        return None
        
    # Extract text and title
    text = art.text or ""
    title = (art.title or "").strip() or "No title"
    
    # Calculate sentiment scores using VADER
    pol = SID.polarity_scores(text)
    
    # Extract publication date and time
    date, tm = safe_find_datetime(url, html)
    
    # Return combined data using dictionary unpacking
    return {**{'ticker': TICKER, 'publish_date': date, 'publish_time': tm,
             'title': title, 'body_text': text, 'url': url}, **pol}

async def scrape_all(urls):
    """
    Asynchronous pipeline to fetch and process multiple URLs in parallel.
    
    Implements a two-phase approach:
    1. Fetch all URLs concurrently using threads
    2. Process all fetched content concurrently using threads
    
    Args:
        urls: List of URLs to process
        
    Returns:
        pandas DataFrame containing all processed articles
    """
    total = len(urls)
    loop = asyncio.get_event_loop()
    
    # Phase 1: Fetch HTML content from all URLs in parallel
    with ThreadPoolExecutor(max_workers=FETCH_WORKERS) as fetch_pool:
        # Create fetch tasks and run them through the thread pool
        fetch_tasks = [loop.run_in_executor(fetch_pool, fetch_html, u, i+1, total)
                       for i, u in enumerate(urls)]
        # Wait for all fetch tasks to complete
        fetched = await asyncio.gather(*fetch_tasks)

    # Phase 2: Process all fetched HTML content in parallel
    records = []
    with ThreadPoolExecutor(max_workers=PROCESS_WORKERS) as proc_pool:
        # Submit processing tasks only for URLs with successful fetches
        futures = {
            proc_pool.submit(process_article, fr): fr[0]
            for fr in fetched if fr[1]  # Skip URLs where HTML is None
        }
        
        # Process results as they complete
        for i, fut in enumerate(as_completed(futures), 1):
            res = fut.result()
            print(f"[Process][{i}/{total}] {futures[fut]}")
            if res:
                records.append(res)
                
    # Convert results to DataFrame
    return pd.DataFrame(records)

# ——— Main entry point function ———
def main(links):
    """
    Main function that runs the entire scraping pipeline.
    
    Args:
        links: List of URLs to process
        
    Returns:
        pandas DataFrame with processed articles
    """
    df = asyncio.get_event_loop().run_until_complete(scrape_all(links))
    return df

# Execute the main function if this script is run directly
if __name__ == '__main__':
    df = main(links)
    print(df.shape)
    print(df.head())

[Fetch][1/9998][retry 1]
[Fetch][23/9998][retry 1]
[Fetch][24/9998][retry 1]
[Fetch][31/9998][retry 1]
[Fetch][11/9998][ok]
[Fetch][7/9998][ok]
[Fetch][12/9998][ok]
[Fetch][6/9998][ok]
[Fetch][10/9998][ok]
[Fetch][16/9998][ok]
[Fetch][18/9998][ok]
[Fetch][21/9998][ok]
[Fetch][4/9998][ok]
[Fetch][5/9998][ok]
[Fetch][2/9998][ok]
[Fetch][13/9998][ok]
[Fetch][17/9998][ok]
[Fetch][15/9998][ok]
[Fetch][22/9998][ok]
[Fetch][25/9998][ok]
[Fetch][26/9998][ok]
[Fetch][3/9998][ok]
[Fetch][19/9998][ok]
[Fetch][29/9998][ok]
[Fetch][27/9998][ok]
[Fetch][28/9998][ok]
[Fetch][8/9998][ok]
[Fetch][30/9998][ok]
[Fetch][20/9998][ok]
[Fetch][33/9998][ok]
[Fetch][34/9998][ok]
[Fetch][38/9998][ok]
[Fetch][37/9998][ok]
[Fetch][14/9998][ok]
[Fetch][43/9998][ok]
[Fetch][40/9998][ok]
[Fetch][41/9998][ok]
[Fetch][45/9998][ok]
[Fetch][48/9998][ok]
[Fetch][47/9998][ok]
[Fetch][42/9998][ok]
[Fetch][35/9998][ok]
[Fetch][49/9998][ok]
[Fetch][50/9998][ok]
[Fetch][44/9998][ok]
[Fetch][39/9998][ok]
[Fetch][51/9998][ok]
[

In [95]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,AAPL,2024-11-11,16:22,Qualcomm receives Investment Bank Analyst Rati...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/qualcomm-in...,0.076,0.893,0.032,-0.8437
1,AAPL,2024-07-23,11:35,"Apple working on foldable iPhone, could be rel...",According to a report by The Information on Tu...,https://www.investing.com/news/stock-market-ne...,0.020,0.918,0.062,0.8910
2,AAPL,2024-06-03,06:55,Spotify raises US prices of its premium plans ...,(Reuters) -Spotify raised prices for its premi...,https://www.investing.com/news/stock-market-ne...,0.023,0.878,0.099,0.9686
3,AAPL,2023-03-03,15:38,Apple cloud chief Abbott to step down in April,(Reuters) -Apple Inc's top executive Michael A...,https://www.investing.com/news/economy/apple-c...,0.016,0.928,0.056,0.5267
4,AAPL,2022-07-28,09:08,Exclusive: EU mulls new unit with antitrust ve...,By Foo Yun Chee\n\nBRUSSELS (Reuters) - The Eu...,https://www.investing.com/news/stock-market-ne...,0.016,0.905,0.079,0.9460
...,...,...,...,...,...,...,...,...,...,...
9989,AAPL,2023-05-30,02:46,"Elon Musk kicks off China visit, Tesla expansi...",BEIJING (Reuters) -Tesla Chief Executive Elon ...,https://www.investing.com/news/stock-market-ne...,0.038,0.899,0.063,0.9211
9990,AAPL,2025-01-15,17:02,"TSMC logs record quarterly profit, sees hefty ...","By Yimou Lee, Ben Blanchard and Faith Hung\n\n...",https://www.investing.com/news/stock-market-ne...,0.029,0.869,0.102,0.9832
9991,AAPL,2024-06-03,13:51,US financial watchdog announces registry of no...,(Reuters) - The U.S. consumer financial watchd...,https://www.investing.com/news/economy/us-fina...,0.053,0.905,0.042,-0.6369
9992,AAPL,2025-01-31,05:01,Apple PT receives Investment Bank Analyst Rati...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/apple-compu...,0.076,0.893,0.032,-0.8437


In [96]:
#sort by publish_date and publish_time
df = df.sort_values(by=['publish_date', 'publish_time'], ascending=[False,False]).reset_index(drop=True)

In [97]:
pd.set_option('display.max_columns', None)

In [98]:
# Count empty body_text entries
empty_body_count = df[df['body_text'] == ''].shape[0]
print(f"Number of articles with empty body_text: {empty_body_count}")

Number of articles with empty body_text: 0


In [99]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,AAPL,2025-05-01,06:46,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
1,AAPL,2025-05-01,06:42,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
2,AAPL,2025-05-01,06:31,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/pro/here&amp;#3...,0.140,0.860,0.000,-0.2755
3,AAPL,2025-04-30,18:24,Exclusive-Ford kills project to develop Tesla-...,By Nora Eckert\n\nDETROIT (Reuters) -Ford Moto...,https://www.investing.com/news/world-news/excl...,0.031,0.909,0.060,0.9542
4,AAPL,2025-04-30,17:07,Trading Day: Trade? It’s a drag By Reuters,"ORLANDO, Florida (Reuters) - TRADING DAY\n\nMa...",https://www.investing.com/news/economy-news/tr...,0.091,0.785,0.123,0.9960
...,...,...,...,...,...,...,...,...,...,...
9989,AAPL,2021-10-29,08:24,"Move over Apple, Microsoft now the world's mos...",By Subrat Patnaik\n\n(Reuters) -Apple Inc lost...,https://www.investing.com/news/stock-market-ne...,0.064,0.829,0.107,0.8469
9990,AAPL,2021-10-29,07:08,"Wall Street shakes off Amazon, Apple weakness ...",By Chuck Mikolajczak\n\nNEW YORK (Reuters) - U...,https://www.investing.com/news/stock-market-ne...,0.058,0.825,0.118,0.9915
9991,AAPL,2021-10-29,06:13,"European shares end flat, but add nearly 5% in...",By Anisha Sircar and Ambar Warrick\n\n(Reuters...,https://www.investing.com/news/stock-market-ne...,0.058,0.780,0.162,0.9940
9992,AAPL,2021-10-29,02:38,Thousands gear up for tech fest in Lisbon in t...,By Clara-Laeila Laudette and Supantha Mukherje...,https://www.investing.com/news/stock-market-ne...,0.033,0.901,0.065,0.9423


In [100]:
article_sentiments = df.copy()

In [101]:
article_sentiments

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,AAPL,2025-05-01,06:46,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
1,AAPL,2025-05-01,06:42,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
2,AAPL,2025-05-01,06:31,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/pro/here&amp;#3...,0.140,0.860,0.000,-0.2755
3,AAPL,2025-04-30,18:24,Exclusive-Ford kills project to develop Tesla-...,By Nora Eckert\n\nDETROIT (Reuters) -Ford Moto...,https://www.investing.com/news/world-news/excl...,0.031,0.909,0.060,0.9542
4,AAPL,2025-04-30,17:07,Trading Day: Trade? It’s a drag By Reuters,"ORLANDO, Florida (Reuters) - TRADING DAY\n\nMa...",https://www.investing.com/news/economy-news/tr...,0.091,0.785,0.123,0.9960
...,...,...,...,...,...,...,...,...,...,...
9989,AAPL,2021-10-29,08:24,"Move over Apple, Microsoft now the world's mos...",By Subrat Patnaik\n\n(Reuters) -Apple Inc lost...,https://www.investing.com/news/stock-market-ne...,0.064,0.829,0.107,0.8469
9990,AAPL,2021-10-29,07:08,"Wall Street shakes off Amazon, Apple weakness ...",By Chuck Mikolajczak\n\nNEW YORK (Reuters) - U...,https://www.investing.com/news/stock-market-ne...,0.058,0.825,0.118,0.9915
9991,AAPL,2021-10-29,06:13,"European shares end flat, but add nearly 5% in...",By Anisha Sircar and Ambar Warrick\n\n(Reuters...,https://www.investing.com/news/stock-market-ne...,0.058,0.780,0.162,0.9940
9992,AAPL,2021-10-29,02:38,Thousands gear up for tech fest in Lisbon in t...,By Clara-Laeila Laudette and Supantha Mukherje...,https://www.investing.com/news/stock-market-ne...,0.033,0.901,0.065,0.9423


In [102]:
# 1. แปลงคอลัมน์ publish_date เป็น datetime
article_sentiments['publish_date'] = pd.to_datetime(article_sentiments['publish_date'], errors='coerce')

# 2. แปลงคอลัมน์ publish_time เป็นเวลา (datetime.time)
article_sentiments['publish_time'] = pd.to_datetime(article_sentiments['publish_time'], format='%H:%M', errors='coerce').dt.time

article_sentiments['publish_datetime'] = pd.to_datetime(
    article_sentiments['publish_date'].astype(str) + ' ' + article_sentiments['publish_time'].astype(str),
    errors='coerce'
)


In [103]:


# สมมติว่า df คือ DataFrame ที่ได้จากการ scrape
# ตัวอย่างการโหลดไฟล์ pickle (ปรับ path ให้ตรงกับไฟล์ของคุณ)
# df = pd.read_pickle('/mnt/data/your_file.pkl')


def remove_html(text):
    if isinstance(text, str):
        return BeautifulSoup(text, "html.parser").get_text()
    return text

def remove_ads(text):
    if isinstance(text, str):
        # ลบข้อความที่มีรูปแบบโฆษณาที่พบได้บ่อย (สามารถปรับ regex ได้ตามความเหมาะสม)
        text = re.sub(r"here or remove ads.*?disclosureor", "", text, flags=re.IGNORECASE|re.DOTALL)
    return text

def clean_text(text):
    text = remove_html(text)      # ลบ HTML tags
    text = remove_ads(text)       # ลบข้อความโฆษณา
    text = text.strip()           # ตัดช่องว่างหัวและท้าย
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # แทนที่ช่องว่างหลายตัวด้วยช่องว่างเดียว
    return text

for col in ['title', 'body_text']:
    article_sentiments[col] = article_sentiments[col].apply(clean_text)

article_sentiments = article_sentiments.drop_duplicates()

article_sentiments = article_sentiments.drop(columns=['publish_date', 'publish_time'])

In [104]:
article_sentiments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ticker            9994 non-null   object        
 1   title             9994 non-null   object        
 2   body_text         9994 non-null   object        
 3   url               9994 non-null   object        
 4   neg               9994 non-null   float64       
 5   neu               9994 non-null   float64       
 6   pos               9994 non-null   float64       
 7   compound          9994 non-null   float64       
 8   publish_datetime  9994 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 702.8+ KB


In [105]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-01 06:46:00
1,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-01 06:42:00
2,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/pro/here&amp;#3...,0.140,0.860,0.000,-0.2755,2025-05-01 06:31:00
3,AAPL,Exclusive-Ford kills project to develop Tesla-...,By Nora Eckert DETROIT (Reuters) -Ford Motor h...,https://www.investing.com/news/world-news/excl...,0.031,0.909,0.060,0.9542,2025-04-30 18:24:00
4,AAPL,Trading Day: Trade? It’s a drag By Reuters,"ORLANDO, Florida (Reuters) - TRADING DAY Makin...",https://www.investing.com/news/economy-news/tr...,0.091,0.785,0.123,0.9960,2025-04-30 17:07:00
...,...,...,...,...,...,...,...,...,...
9989,AAPL,"Move over Apple, Microsoft now the world's mos...",By Subrat Patnaik (Reuters) -Apple Inc lost it...,https://www.investing.com/news/stock-market-ne...,0.064,0.829,0.107,0.8469,2021-10-29 08:24:00
9990,AAPL,"Wall Street shakes off Amazon, Apple weakness ...",By Chuck Mikolajczak NEW YORK (Reuters) - U.S....,https://www.investing.com/news/stock-market-ne...,0.058,0.825,0.118,0.9915,2021-10-29 07:08:00
9991,AAPL,"European shares end flat, but add nearly 5% in...",By Anisha Sircar and Ambar Warrick (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.058,0.780,0.162,0.9940,2021-10-29 06:13:00
9992,AAPL,Thousands gear up for tech fest in Lisbon in t...,By Clara-Laeila Laudette and Supantha Mukherje...,https://www.investing.com/news/stock-market-ne...,0.033,0.901,0.065,0.9423,2021-10-29 02:38:00


In [106]:
article_sentiments = article_sentiments.sort_values(
    by='publish_datetime', 
    ascending=False
).reset_index(drop=True)

In [107]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-01 06:46:00
1,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-01 06:42:00
2,AAPL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/pro/here&amp;#3...,0.140,0.860,0.000,-0.2755,2025-05-01 06:31:00
3,AAPL,Exclusive-Ford kills project to develop Tesla-...,By Nora Eckert DETROIT (Reuters) -Ford Motor h...,https://www.investing.com/news/world-news/excl...,0.031,0.909,0.060,0.9542,2025-04-30 18:24:00
4,AAPL,Trading Day: Trade? It’s a drag By Reuters,"ORLANDO, Florida (Reuters) - TRADING DAY Makin...",https://www.investing.com/news/economy-news/tr...,0.091,0.785,0.123,0.9960,2025-04-30 17:07:00
...,...,...,...,...,...,...,...,...,...
9989,AAPL,"Move over Apple, Microsoft now the world's mos...",By Subrat Patnaik (Reuters) -Apple Inc lost it...,https://www.investing.com/news/stock-market-ne...,0.064,0.829,0.107,0.8469,2021-10-29 08:24:00
9990,AAPL,"Wall Street shakes off Amazon, Apple weakness ...",By Chuck Mikolajczak NEW YORK (Reuters) - U.S....,https://www.investing.com/news/stock-market-ne...,0.058,0.825,0.118,0.9915,2021-10-29 07:08:00
9991,AAPL,"European shares end flat, but add nearly 5% in...",By Anisha Sircar and Ambar Warrick (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.058,0.780,0.162,0.9940,2021-10-29 06:13:00
9992,AAPL,Thousands gear up for tech fest in Lisbon in t...,By Clara-Laeila Laudette and Supantha Mukherje...,https://www.investing.com/news/stock-market-ne...,0.033,0.901,0.065,0.9423,2021-10-29 02:38:00


In [114]:
article_sentiments = article_sentiments[
    ~article_sentiments['title']
        .str.strip()
        .str.match(r'^(Cryptocurrency News|By Investing\.com|Error 404: Page Not Found)$')
].reset_index(drop=True)


In [115]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,AAPL,Exclusive-Ford kills project to develop Tesla-...,By Nora Eckert DETROIT (Reuters) -Ford Motor h...,https://www.investing.com/news/world-news/excl...,0.031,0.909,0.060,0.9542,2025-04-30 18:24:00
1,AAPL,Trading Day: Trade? It’s a drag By Reuters,"ORLANDO, Florida (Reuters) - TRADING DAY Makin...",https://www.investing.com/news/economy-news/tr...,0.091,0.785,0.123,0.9960,2025-04-30 17:07:00
2,AAPL,Stocks log worst 100 day start since Nixon as ...,By Saqib Iqbal Ahmed NEW YORK (Reuters) -Presi...,https://www.investing.com/news/stock-market-ne...,0.085,0.851,0.064,-0.9723,2025-04-30 16:25:00
3,AAPL,Qualcomm forecasts Trump tariffs will dent rev...,By Arsheeya Bajwa and Max A. Cherney (Reuters)...,https://www.investing.com/news/stock-market-ne...,0.071,0.856,0.073,0.4872,2025-04-30 16:09:00
4,AAPL,DeepSeek available to download again in South ...,SEOUL (Reuters) -Chinese artificial intelligen...,https://www.investing.com/news/economy-news/de...,0.033,0.881,0.086,0.8873,2025-04-30 15:05:00
...,...,...,...,...,...,...,...,...,...
9986,AAPL,"Move over Apple, Microsoft now the world's mos...",By Subrat Patnaik (Reuters) -Apple Inc lost it...,https://www.investing.com/news/stock-market-ne...,0.064,0.829,0.107,0.8469,2021-10-29 08:24:00
9987,AAPL,"Wall Street shakes off Amazon, Apple weakness ...",By Chuck Mikolajczak NEW YORK (Reuters) - U.S....,https://www.investing.com/news/stock-market-ne...,0.058,0.825,0.118,0.9915,2021-10-29 07:08:00
9988,AAPL,"European shares end flat, but add nearly 5% in...",By Anisha Sircar and Ambar Warrick (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.058,0.780,0.162,0.9940,2021-10-29 06:13:00
9989,AAPL,Thousands gear up for tech fest in Lisbon in t...,By Clara-Laeila Laudette and Supantha Mukherje...,https://www.investing.com/news/stock-market-ne...,0.033,0.901,0.065,0.9423,2021-10-29 02:38:00


In [116]:
article_sentiments.to_pickle("AAPL_article_sentiments.pkl")

article_sentiments.to_csv("AAPL_article_sentiments.csv", index=False) 

In [121]:
nvda_df = pd.read_pickle("nvda_article_sentiments.pkl")

In [122]:
nvda_df

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,NVDA,Deepseek Releases New Math AI Model,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/deepseek-re...,0.076,0.893,0.032,-0.8437,2025-04-30 07:25:00
1,NVDA,Adv Micro Device receives Investment Bank Anal...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00
2,NVDA,Nvidia Corp receives Investment Bank Analyst R...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/seaport-glo...,0.076,0.893,0.032,-0.8437,2025-04-30 07:01:00
3,NVDA,Taiwan’s ASE: evaluating how it will support N...,By Wen-Yee Lee and Ben Blanchard TAIPEI (Reute...,https://www.investing.com/news/stock-market-ne...,0.032,0.883,0.084,0.9423,2025-04-30 06:56:00
4,NVDA,"Super Micro slumps on forecast cut, analysts d...",By Aditya Soni (Reuters) -Super Micro Computer...,https://www.investing.com/news/stock-market-ne...,0.099,0.779,0.122,0.9432,2025-04-30 06:47:00
...,...,...,...,...,...,...,...,...,...
8918,NVDA,NVIDIA Rises 3.12% By Investing.com,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/stock-market-ne...,0.076,0.893,0.032,-0.8437,1969-12-31 19:00:00
8919,NVDA,NVIDIA Falls 3.02% By Investing.com,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/stock-market-ne...,0.076,0.893,0.032,-0.8437,1969-12-31 19:00:00
8920,NVDA,"Adobe Earnings, Revenue Beat in Q3 By Investin...",Investing.com - Adobe (NASDAQ: ) reported thir...,https://www.investing.com/news/stock-market-ne...,0.000,0.947,0.053,0.8020,1969-12-31 19:00:00
8921,NVDA,"NVIDIA Earnings, Revenue Beat in Q2 By Investi...",Investing.com - NVIDIA (NASDAQ: ) reported sec...,https://www.investing.com/news/stock-market-ne...,0.000,0.948,0.052,0.8020,1969-12-31 19:00:00
