# Web scraping stock market news for Sentiment Analysis

## 1. Install/import libraries

In [72]:
%pip install pandas nltk cloudscraper beautifulsoup4 htmldate newspaper3k lxml_html_clean




In [73]:
# Standard library
import os
import re
import time
import random
import asyncio
import warnings
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor,  as_completed

# Third-party libraries
import pandas as pd
import nltk
import nest_asyncio
import cloudscraper
from htmldate import find_date
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newspaper import Article

warnings.filterwarnings('ignore')
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data collection



In [74]:
# ——— CONFIG ———
# Company ticker identifier for investing.com
COMPANY     = "facebook-inc"   
# Maximum number of pages to scrape (pagination)
MAX_PAGE    = 1000             
# Number of concurrent workers for ThreadPoolExecutor
MAX_WORKERS = 50              
# Maximum number of retry attempts for failed requests
MAX_RETRIES = 5              

# HTTP headers to mimic a legitimate web browser request
# This helps avoid detection as a bot/scraper
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    # Referrer to appear as if coming from the main site
    "Referer": "https://www.investing.com/",
}

# Create a cloudscraper instance to bypass Cloudflare protections
scraper = cloudscraper.create_scraper(
    browser={'browser': 'chrome', 'platform': 'windows'}
)

def fetch_page(page: int):
    """ดึงลิงก์ข่าวจากหน้า {page} แล้วคืน list ของ href"""
    # Construct URL for the specific page number
    url = f"https://www.investing.com/equities/{COMPANY}-news/{page}"
    # Implement retry logic for resilience
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Make HTTP request with timeout
            r = scraper.get(url, headers=HEADERS, timeout=30)
            r.raise_for_status()  # Raise exception for 4XX/5XX responses
            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(r.text, "lxml")
            # CSS selector to find news article links
            anchors = soup.select(
                'ul[data-test="news-list"] '
                'li article a[data-test="article-title-link"]'
            )
            # Extract href attributes from anchor tags
            return [a["href"] for a in anchors if a.has_attr("href")]
        except Exception as e:
            # Exponential backoff with jitter for retries
            if attempt < MAX_RETRIES:
                backoff = 2 ** (attempt - 1) + random.random()
                time.sleep(backoff)
            else:
                # Log failure after exhausting retries
                print(f"Page {page} failed after {MAX_RETRIES}: {e}")
    # Return empty list if all attempts fail
    return []

def robust_scrape():
    # First, fetch page 1 to determine how many links per page
    first = fetch_page(1)
    PER_PAGE = len(first)
    # Validate we got results from first page
    if PER_PAGE == 0:
        raise RuntimeError("Failed to fetch the first page. Please check headers or cookies and try again.")
    print(f"Detected {PER_PAGE} links per page, expecting {PER_PAGE * MAX_PAGE} total")

    # Initialize results dictionary with first page
    results = {1: first}
    # Create list of remaining pages to fetch
    pages = list(range(2, MAX_PAGE + 1))

    # Use thread pool for parallel execution
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Submit fetch jobs for all pages
        futures = {pool.submit(fetch_page, p): p for p in pages}
        # Process completed futures as they finish
        for fut in as_completed(futures):
            p = futures[fut]
            results[p] = fut.result()

        # Retry logic for pages with incomplete results
        for round in range(1, MAX_RETRIES + 1):
            # Identify pages with fewer links than expected
            bad = [p for p, links in results.items() if len(links) != PER_PAGE]
            if not bad:
                print(f"All pages OK after {round-1} retries")
                break

            # Retry only the problematic pages
            print(f"Retry round {round} for pages: {bad}")
            futures = {pool.submit(fetch_page, p): p for p in bad}
            for fut in as_completed(futures):
                p = futures[fut]
                results[p] = fut.result()
        else:
            # This executes if the for loop completes without breaking
            print("Retry limit reached; some pages may still be incomplete.")

    # Calculate statistics on results
    total_fetched = sum(len(links) for links in results.values())
    expected = PER_PAGE * MAX_PAGE
    print(f"Total links fetched (including duplicates): {total_fetched} (expected {expected})")

    # Deduplicate links using set conversion
    all_links = set(link for links in results.values() for link in links)
    print(f"Final: got {len(all_links)} unique URLs (expected {expected})")
    return list(all_links)

# Entry point guard - only executes when run directly
if __name__ == "__main__":
    links = robust_scrape()

Detected 10 links per page, expecting 10000 total
All pages OK after 0 retries
Total links fetched (including duplicates): 10000 (expected 10000)
Final: got 9999 unique URLs (expected 10000)


In [75]:
links

['https://www.investing.com/news/stock-market-news/eu-fines-meta-797-million-over-abusive-practices-benefiting-facebook-marketplace-3723142',
 'https://www.investing.com/news/cryptocurrency-news/the-metaverse-mark-zuckerbergs-brave-new-world-2587228',
 'https://www.investing.com/news/stock-market-news/ubs-downgrades-alphabet-stock-sees-better-riskreward-in-meta-and-amazon-432SI-3112943',
 'https://www.investing.com/news/cryptocurrency-news/bitcoiner-claims-to-have-found-long-lost-satoshi-bitcoin-code-with-personal-notations-2907648',
 'https://www.investing.com/news/stock-market-news/google-discussed-dropping-broadcom-as-ai-chips-supplier-the-information-3179195',
 'https://www.investing.com/news/cryptocurrency-news/house-in-portugal-sold-for-3-bitcoins-without-euro-conversion-2819603',
 'https://www.investing.com/news/world-news/soccerrepeat-racism-offenders-should-be-kicked-out-of-international-football-says-bale-2608074',
 'https://www.investing.com/news/pro/facebook-inc-receives-in

In [76]:
with open('META_urls.txt', 'w') as f:
    for link in links:
        f.write("%s\n" % link)

In [None]:
# Apply nest_asyncio to allow running asyncio within Jupyter notebook
# This is necessary because Jupyter already uses an event loop
nest_asyncio.apply()

# Configuration parameters for parallel processing and network requests
# Calculate optimal number of workers for fetching HTML (limited to 32)
FETCH_WORKERS = min(32, os.cpu_count() * 4)  
# Use available CPU cores for processing articles 
PROCESS_WORKERS = os.cpu_count() or 4       
# Maximum retry attempts for failed network requests
MAX_FETCH_RETRIES = 3                      
# Delay between retry attempts (seconds)
RETRY_DELAY = 1                             
# Stock ticker symbol used for labeling data
TICKER = 'META'
# Create cloudscraper instance to bypass anti-bot measures
scraper = cloudscraper.create_scraper()

def is_placeholder(html: str) -> bool:
    """
    Check if the HTML content is a placeholder/error page instead of actual content.
    
    These placeholders are common when websites detect scraping or have technical issues.
    """
    lower = html.lower() if html else ""
    return (
        'temporarily down for maintenance' in lower
        or 'just a moment' in lower
        or "we're temporarily down" in lower
    )

def safe_find_datetime(url, html_content=None):
    """
    Extract publication date and time from the URL or HTML content.
    
    Uses multiple strategies with fallbacks:
    1. Try htmldate library on URL
    2. Look for American format dates (MM/DD/YYYY, HH:MM AM/PM) in content
    3. Try both European and American formats (DD/MM/YYYY or MM/DD/YYYY)
    4. Fallback to current date and time if all else fails
    
    Returns: tuple of (date_string, time_string)
    """
    try:
        # Strategy 1: Use htmldate library to extract date from URL
        dt = find_date(url)
        if dt:
            return dt, "00:00"  # Return with default time if date found
    except:
        pass
    
    if html_content:
        # Strategy 2: Look for American format with AM/PM
        m = re.search(r"(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2}\s*(?:AM|PM))", html_content)
        if m:
            ds, ts = m.groups()
            try:
                dt = datetime.strptime(f"{ds}, {ts}", "%m/%d/%Y, %I:%M %p")
                return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
            except:
                pass
        
        # Strategy 3: Look for numeric dates with 24-hour time format
        m = re.search(r"(\d{2}/\d{2}/\d{4}),\s*(\d{2}:\d{2})", html_content)
        if m:
            ds, ts = m.groups()
            # Try both European and American date formats
            for fmt in ("%d/%m/%Y, %H:%M", "%m/%d/%Y, %H:%M"):
                try:
                    dt = datetime.strptime(f"{ds}, {ts}", fmt)
                    return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M")
                except:
                    continue
    
    # Fallback: Use current date and time
    now = datetime.now()
    return now.strftime("%Y-%m-%d"), now.strftime("%H:%M")

# ——— Fetch HTML with minimal retry mechanism ———
def fetch_html(url, idx, total):
    """
    Fetch the HTML content for a given URL with retry logic.
    
    Args:
        url: The URL to fetch
        idx: Current index (for progress reporting)
        total: Total number of URLs (for progress reporting)
        
    Returns:
        tuple of (url, html_content) or (url, None) if failed
    """
    for attempt in range(1, MAX_FETCH_RETRIES + 1):
        try:
            # Make HTTP request with timeout
            resp = scraper.get(url, timeout=30)
            html = resp.text
            
            # Check if we got a placeholder page instead of content
            if is_placeholder(html):
                raise RuntimeError('Placeholder')
                
            # Report success
            print(f"[Fetch][{idx}/{total}][ok]")
            return url, html
            
        except Exception:
            # Report retry attempt
            print(f"[Fetch][{idx}/{total}][retry {attempt}]")
            if attempt < MAX_FETCH_RETRIES:
                # Wait before retrying
                time.sleep(RETRY_DELAY)
                
    # Report complete failure after all retries
    print(f"[Fetch error] {idx}/{total}: failed after {MAX_FETCH_RETRIES} retries")
    return url, None

# ——— Initialize sentiment analyzer from NLTK ———
SID = SentimentIntensityAnalyzer()

def process_article(arg):
    """
    Process an article's HTML to extract content and analyze sentiment.
    
    Uses newspaper3k library to parse article content and NLTK's VADER
    for sentiment analysis.
    
    Args:
        arg: tuple of (url, html_content)
        
    Returns:
        dictionary with extracted data and sentiment scores, or None if processing failed
    """
    url, html = arg
    if not html:
        return None
        
    # Initialize newspaper Article object
    art = Article(url)
    art.set_html(html)
    
    try:
        # Parse article content
        art.parse()
    except:
        return None
        
    # Extract text and title
    text = art.text or ""
    title = (art.title or "").strip() or "No title"
    
    # Calculate sentiment scores using VADER
    pol = SID.polarity_scores(text)
    
    # Extract publication date and time
    date, tm = safe_find_datetime(url, html)
    
    # Return combined data using dictionary unpacking
    return {**{'ticker': TICKER, 'publish_date': date, 'publish_time': tm,
             'title': title, 'body_text': text, 'url': url}, **pol}

async def scrape_all(urls):
    """
    Asynchronous pipeline to fetch and process multiple URLs in parallel.
    
    Implements a two-phase approach:
    1. Fetch all URLs concurrently using threads
    2. Process all fetched content concurrently using threads
    
    Args:
        urls: List of URLs to process
        
    Returns:
        pandas DataFrame containing all processed articles
    """
    total = len(urls)
    loop = asyncio.get_event_loop()
    
    # Phase 1: Fetch HTML content from all URLs in parallel
    with ThreadPoolExecutor(max_workers=FETCH_WORKERS) as fetch_pool:
        # Create fetch tasks and run them through the thread pool
        fetch_tasks = [loop.run_in_executor(fetch_pool, fetch_html, u, i+1, total)
                       for i, u in enumerate(urls)]
        # Wait for all fetch tasks to complete
        fetched = await asyncio.gather(*fetch_tasks)

    # Phase 2: Process all fetched HTML content in parallel
    records = []
    with ThreadPoolExecutor(max_workers=PROCESS_WORKERS) as proc_pool:
        # Submit processing tasks only for URLs with successful fetches
        futures = {
            proc_pool.submit(process_article, fr): fr[0]
            for fr in fetched if fr[1]  # Skip URLs where HTML is None
        }
        
        # Process results as they complete
        for i, fut in enumerate(as_completed(futures), 1):
            res = fut.result()
            print(f"[Process][{i}/{total}] {futures[fut]}")
            if res:
                records.append(res)
                
    # Convert results to DataFrame
    return pd.DataFrame(records)

# ——— Main entry point function ———
def main(links):
    """
    Main function that runs the entire scraping pipeline.
    
    Args:
        links: List of URLs to process
        
    Returns:
        pandas DataFrame with processed articles
    """
    df = asyncio.get_event_loop().run_until_complete(scrape_all(links))
    return df

# Execute the main function if this script is run directly
if __name__ == '__main__':
    df = main(links)
    print(df.shape)
    print(df.head())

[Fetch][20/9999][retry 1]
[Fetch][23/9999][retry 1]
[Fetch][25/9999][retry 1]
[Fetch][22/9999][retry 1]
[Fetch][31/9999][retry 1]
[Fetch][32/9999][retry 1]
[Fetch][12/9999][ok]
[Fetch][19/9999][ok]
[Fetch][17/9999][ok]
[Fetch][14/9999][ok]
[Fetch][7/9999][ok]
[Fetch][9/9999][ok]
[Fetch][2/9999][ok]
[Fetch][21/9999][ok]
[Fetch][13/9999][ok]
[Fetch][16/9999][ok]
[Fetch][8/9999][ok]
[Fetch][11/9999][ok]
[Fetch][4/9999][ok]
[Fetch][1/9999][ok]
[Fetch][6/9999][ok]
[Fetch][10/9999][ok]
[Fetch][5/9999][ok]
[Fetch][18/9999][ok]
[Fetch][3/9999][ok]
[Fetch][15/9999][ok]
[Fetch][29/9999][ok]
[Fetch][24/9999][ok]
[Fetch][30/9999][ok]
[Fetch][26/9999][ok]
[Fetch][28/9999][ok]
[Fetch][27/9999][ok]
[Fetch][33/9999][ok]
[Fetch][34/9999][ok]
[Fetch][47/9999][ok]
[Fetch][40/9999][ok]
[Fetch][50/9999][ok]
[Fetch][36/9999][ok]
[Fetch][49/9999][ok]
[Fetch][39/9999][ok]
[Fetch][23/9999][ok]
[Fetch][38/9999][ok]
[Fetch][52/9999][ok]
[Fetch][55/9999][ok]
[Fetch][53/9999][ok]
[Fetch][46/9999][ok]
[Fetch][41/99

In [None]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,GOOGL,2023-06-26,05:38,"UBS downgrades Alphabet stock, sees better ris...",UBS analysts downgraded Alphabet (NASDAQ: ) sh...,https://www.investing.com/news/stock-market-ne...,0.075,0.793,0.132,0.8236
1,GOOGL,2024-03-12,09:30,Alphabet Inc Class A receives Investment Bank ...,Risk Disclosure: Trading in financial instrume...,https://www.investing.com/news/pro/tiktok-ban-...,0.076,0.893,0.032,-0.8437
2,GOOGL,2021-09-20,10:12,President of Turkey declares war against crypt...,In a Sept 18 meeting that took place in the ci...,https://www.investing.com/news/cryptocurrency-...,0.106,0.835,0.059,-0.9398
3,GOOGL,2025-02-05,09:17,Flare Hosts AI Hackathon at UC Berkeley with G...,"Dubai, United Arab Emirates, February 5th, 202...",https://www.investing.com/news/cryptocurrency-...,0.016,0.867,0.118,0.9961
4,GOOGL,2023-06-29,13:07,Google to block news in Canada over law on pay...,By Ismail Shakil\n\nOTTAWA (Reuters) -Google s...,https://www.investing.com/news/stock-market-ne...,0.064,0.899,0.038,-0.9273
...,...,...,...,...,...,...,...,...,...,...
9988,GOOGL,2023-07-05,07:41,Canada's Quebecor to pull its ads from Faceboo...,(Reuters) -Quebecor said on Wednesday it will ...,https://www.investing.com/news/stock-market-ne...,0.060,0.895,0.045,-0.4939
9989,GOOGL,2025-04-01,01:14,Google’s DeepMind slows research publications ...,Investing.com-- Google’s (NASDAQ: ) artificial...,https://www.investing.com/news/stock-market-ne...,0.053,0.768,0.179,0.9716
9990,GOOGL,2025-02-20,08:19,Jefferies: 'Better days ahead' for Roku By Inv...,"Investing.com -- In a note Thursday, Jefferies...",https://www.investing.com/news/stock-market-ne...,0.014,0.829,0.157,0.9886
9991,GOOGL,2025-02-21,11:32,"5 big analyst AI moves: Bullish on Marvell, me...",Investing.com -- Here are the biggest analyst ...,https://www.investing.com/news/stock-market-ne...,0.026,0.836,0.138,0.9993


In [None]:
#sort by publish_date and publish_time
df = df.sort_values(by=['publish_date', 'publish_time'], ascending=[False,False]).reset_index(drop=True)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Count empty body_text entries
empty_body_count = df[df['body_text'] == ''].shape[0]
print(f"Number of articles with empty body_text: {empty_body_count}")

Number of articles with empty body_text: 0


In [None]:
df

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,GOOGL,2025-05-04,00:20,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
1,GOOGL,2025-05-03,23:25,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
2,GOOGL,2025-05-03,04:30,Here’s how Goldman Sachs expects companies to ...,Investing.com - Investors have been focused du...,https://www.investing.com/news/stock-market-ne...,0.070,0.821,0.108,0.9672
3,GOOGL,2025-05-03,03:02,Strategy’s Saylor Reveals Asset That Leaves Bi...,"U.Today - Michael Saylor, a bull and co-founde...",https://www.investing.com/news/cryptocurrency-...,0.000,0.911,0.089,0.9806
4,GOOGL,2025-05-02,11:27,Google faces September trial on ad tech antitr...,By David Shepardson and Jody Godoy\n\nALEXANDR...,https://www.investing.com/news/stock-market-ne...,0.062,0.878,0.060,-0.3907
...,...,...,...,...,...,...,...,...,...,...
9988,GOOGL,2021-05-04,15:57,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim\n\nInvesting.com – The S&P 50...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750
9989,GOOGL,2021-05-04,15:54,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813
9990,GOOGL,2021-05-04,14:03,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim\n\nInvesting.com – The S&P 50...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480
9991,GOOGL,2021-05-04,07:13,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal\n\n(Reuters...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820


In [None]:
article_sentiments = df.copy()

In [None]:
article_sentiments

Unnamed: 0,ticker,publish_date,publish_time,title,body_text,url,neg,neu,pos,compound
0,GOOGL,2025-05-04,00:20,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
1,GOOGL,2025-05-03,23:25,Error 404: Page Not Found,404\n\nPage Not Found\n\nSeems like the page y...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755
2,GOOGL,2025-05-03,04:30,Here’s how Goldman Sachs expects companies to ...,Investing.com - Investors have been focused du...,https://www.investing.com/news/stock-market-ne...,0.070,0.821,0.108,0.9672
3,GOOGL,2025-05-03,03:02,Strategy’s Saylor Reveals Asset That Leaves Bi...,"U.Today - Michael Saylor, a bull and co-founde...",https://www.investing.com/news/cryptocurrency-...,0.000,0.911,0.089,0.9806
4,GOOGL,2025-05-02,11:27,Google faces September trial on ad tech antitr...,By David Shepardson and Jody Godoy\n\nALEXANDR...,https://www.investing.com/news/stock-market-ne...,0.062,0.878,0.060,-0.3907
...,...,...,...,...,...,...,...,...,...,...
9988,GOOGL,2021-05-04,15:57,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim\n\nInvesting.com – The S&P 50...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750
9989,GOOGL,2021-05-04,15:54,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813
9990,GOOGL,2021-05-04,14:03,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim\n\nInvesting.com – The S&P 50...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480
9991,GOOGL,2021-05-04,07:13,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal\n\n(Reuters...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820


In [None]:
# 1. แปลงคอลัมน์ publish_date เป็น datetime
article_sentiments['publish_date'] = pd.to_datetime(article_sentiments['publish_date'], errors='coerce')

# 2. แปลงคอลัมน์ publish_time เป็นเวลา (datetime.time)
article_sentiments['publish_time'] = pd.to_datetime(article_sentiments['publish_time'], format='%H:%M', errors='coerce').dt.time

article_sentiments['publish_datetime'] = pd.to_datetime(
    article_sentiments['publish_date'].astype(str) + ' ' + article_sentiments['publish_time'].astype(str),
    errors='coerce'
)


In [None]:


# สมมติว่า df คือ DataFrame ที่ได้จากการ scrape
# ตัวอย่างการโหลดไฟล์ pickle (ปรับ path ให้ตรงกับไฟล์ของคุณ)
# df = pd.read_pickle('/mnt/data/your_file.pkl')


def remove_html(text):
    if isinstance(text, str):
        return BeautifulSoup(text, "html.parser").get_text()
    return text

def remove_ads(text):
    if isinstance(text, str):
        # ลบข้อความที่มีรูปแบบโฆษณาที่พบได้บ่อย (สามารถปรับ regex ได้ตามความเหมาะสม)
        text = re.sub(r"here or remove ads.*?disclosureor", "", text, flags=re.IGNORECASE|re.DOTALL)
    return text

def clean_text(text):
    text = remove_html(text)      # ลบ HTML tags
    text = remove_ads(text)       # ลบข้อความโฆษณา
    text = text.strip()           # ตัดช่องว่างหัวและท้าย
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # แทนที่ช่องว่างหลายตัวด้วยช่องว่างเดียว
    return text

for col in ['title', 'body_text']:
    article_sentiments[col] = article_sentiments[col].apply(clean_text)

article_sentiments = article_sentiments.drop_duplicates()

article_sentiments = article_sentiments.drop(columns=['publish_date', 'publish_time'])

In [None]:
article_sentiments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9993 entries, 0 to 9992
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ticker            9993 non-null   object        
 1   title             9993 non-null   object        
 2   body_text         9993 non-null   object        
 3   url               9993 non-null   object        
 4   neg               9993 non-null   float64       
 5   neu               9993 non-null   float64       
 6   pos               9993 non-null   float64       
 7   compound          9993 non-null   float64       
 8   publish_datetime  9993 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 702.8+ KB


In [None]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,GOOGL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-04 00:20:00
1,GOOGL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-03 23:25:00
2,GOOGL,Here’s how Goldman Sachs expects companies to ...,Investing.com - Investors have been focused du...,https://www.investing.com/news/stock-market-ne...,0.070,0.821,0.108,0.9672,2025-05-03 04:30:00
3,GOOGL,Strategy’s Saylor Reveals Asset That Leaves Bi...,"U.Today - Michael Saylor, a bull and co-founde...",https://www.investing.com/news/cryptocurrency-...,0.000,0.911,0.089,0.9806,2025-05-03 03:02:00
4,GOOGL,Google faces September trial on ad tech antitr...,"By David Shepardson and Jody Godoy ALEXANDRIA,...",https://www.investing.com/news/stock-market-ne...,0.062,0.878,0.060,-0.3907,2025-05-02 11:27:00
...,...,...,...,...,...,...,...,...,...
9988,GOOGL,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim Investing.com – The S&P 500 f...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750,2021-05-04 15:57:00
9989,GOOGL,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813,2021-05-04 15:54:00
9990,GOOGL,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim Investing.com – The S&P 500 t...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480,2021-05-04 14:03:00
9991,GOOGL,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820,2021-05-04 07:13:00


In [None]:
article_sentiments = article_sentiments.sort_values(
    by='publish_datetime', 
    ascending=False
).reset_index(drop=True)

In [None]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,GOOGL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-04 00:20:00
1,GOOGL,Error 404: Page Not Found,404 Page Not Found Seems like the page you wer...,https://www.investing.com/news/cryptocurrency-...,0.140,0.860,0.000,-0.2755,2025-05-03 23:25:00
2,GOOGL,Here’s how Goldman Sachs expects companies to ...,Investing.com - Investors have been focused du...,https://www.investing.com/news/stock-market-ne...,0.070,0.821,0.108,0.9672,2025-05-03 04:30:00
3,GOOGL,Strategy’s Saylor Reveals Asset That Leaves Bi...,"U.Today - Michael Saylor, a bull and co-founde...",https://www.investing.com/news/cryptocurrency-...,0.000,0.911,0.089,0.9806,2025-05-03 03:02:00
4,GOOGL,Google faces September trial on ad tech antitr...,"By David Shepardson and Jody Godoy ALEXANDRIA,...",https://www.investing.com/news/stock-market-ne...,0.062,0.878,0.060,-0.3907,2025-05-02 11:27:00
...,...,...,...,...,...,...,...,...,...
9988,GOOGL,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim Investing.com – The S&P 500 f...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750,2021-05-04 15:57:00
9989,GOOGL,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813,2021-05-04 15:54:00
9990,GOOGL,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim Investing.com – The S&P 500 t...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480,2021-05-04 14:03:00
9991,GOOGL,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820,2021-05-04 07:13:00


In [None]:
article_sentiments = article_sentiments[
    ~article_sentiments['title']
        .str.strip()
        .str.match(r'^(Cryptocurrency News|By Investing\.com|Error 404: Page Not Found)$')
].reset_index(drop=True)


In [None]:
article_sentiments

Unnamed: 0,ticker,title,body_text,url,neg,neu,pos,compound,publish_datetime
0,GOOGL,Here’s how Goldman Sachs expects companies to ...,Investing.com - Investors have been focused du...,https://www.investing.com/news/stock-market-ne...,0.070,0.821,0.108,0.9672,2025-05-03 04:30:00
1,GOOGL,Strategy’s Saylor Reveals Asset That Leaves Bi...,"U.Today - Michael Saylor, a bull and co-founde...",https://www.investing.com/news/cryptocurrency-...,0.000,0.911,0.089,0.9806,2025-05-03 03:02:00
2,GOOGL,Google faces September trial on ad tech antitr...,"By David Shepardson and Jody Godoy ALEXANDRIA,...",https://www.investing.com/news/stock-market-ne...,0.062,0.878,0.060,-0.3907,2025-05-02 11:27:00
3,GOOGL,"Bernstein upgrades Reddit, says it’s ’executin...",Investing.com -- Bernstein upgraded Reddit to ...,https://www.investing.com/news/stock-market-ne...,0.037,0.835,0.128,0.9772,2025-05-02 09:57:00
4,GOOGL,Analyst lowers Apple rating on lack of ‘AI-dri...,Investing.com -- Rosenblatt Securities downgra...,https://www.investing.com/news/stock-market-ne...,0.066,0.841,0.093,0.9185,2025-05-02 09:05:00
...,...,...,...,...,...,...,...,...,...
9986,GOOGL,S&P 500 Falls After Yellen Triggers Rate Hike ...,By Yasin Ebrahim Investing.com – The S&P 500 f...,https://www.investing.com/news/stock-market-ne...,0.054,0.845,0.101,0.9750,2021-05-04 15:57:00
9987,GOOGL,Apple vs. Google: Which FAANG Stock is a Bette...,The COVID-19 pandemic has treated the FAANG st...,https://www.investing.com/news/stock-market-ne...,0.019,0.878,0.104,0.9813,2021-05-04 15:54:00
9988,GOOGL,S&P 500 Slips on Tech Wreck as Yellen Stokes I...,By Yasin Ebrahim Investing.com – The S&P 500 t...,https://www.investing.com/news/stock-market-ne...,0.047,0.865,0.088,0.9480,2021-05-04 14:03:00
9989,GOOGL,Nasdaq ends sharply lower in tech sell-off By ...,By Krystal Hu and Shreyashi Sanyal (Reuters) -...,https://www.investing.com/news/stock-market-ne...,0.046,0.849,0.104,0.9820,2021-05-04 07:13:00


In [None]:
article_sentiments.to_pickle("META_article_sentiments.pkl")

article_sentiments.to_csv("META_article_sentiments.csv", index=False) 