In [2]:
import nltk
# Fetch the 'vader_lexicon' resource through NLTK's downloader; the captured
# output shows this is a no-op when the package is already up to date.
# Presumably required by the VADER SentimentIntensityAnalyzer used later in
# this notebook — TODO confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\saysa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
import datetime, time, random, urllib.parse, requests, feedparser, pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from email.utils import parsedate_to_datetime

# Google News RSS search endpoint; query parameters are appended by build_url().
BASE = "https://news.google.com/rss/search"

# Default request headers installed on every session: a desktop-browser
# User-Agent plus an Accept header that prefers RSS over generic XML.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6_1)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/125.0.0.0 Safari/537.36",
    )),
    "Accept": ", ".join(("application/rss+xml", "text/xml;q=0.9")),
}

def month_slices(start, end):
    """Split the inclusive range [start, end] into calendar-month windows.

    Args:
        start: First day of the range (``datetime.date``; a ``datetime`` is
            accepted and truncated to its date).
        end: Last day of the range, inclusive (same types as ``start``).

    Yields:
        ``(first_day, last_day)`` tuples of ``datetime.date``, one per calendar
        month touched by the range. The first window begins at ``start`` and
        the last window ends at ``end``; nothing is yielded when
        ``start > end``.
    """
    # Normalise to plain dates so date/datetime inputs can be compared safely.
    start = datetime.date(start.year, start.month, start.day)
    end = datetime.date(end.year, end.month, end.day)
    if start > end:
        return

    d = start.replace(day=1)
    while d <= end:
        # Day 28 exists in every month; adding 4 days always lands in the next
        # month, which is then snapped back to its first day.
        nxt = (d.replace(day=28) + datetime.timedelta(days=4)).replace(day=1)
        # Clamp both edges so no window reaches outside [start, end].
        yield max(d, start), min(nxt - datetime.timedelta(days=1), end)
        d = nxt

def build_url(keywords, since, until):
    """Return the Google News RSS search URL for ``keywords`` in a date window.

    The query carries ``after:<since>`` and ``before:<until + 1 day>``
    operators (the upper bound is pushed one day past ``until``, presumably so
    that ``until`` itself is included — verify against the search syntax).
    Locale parameters are fixed to US English.
    """
    day_after_until = until + datetime.timedelta(days=1)
    query = f'{keywords} after:{since} before:{day_after_until}'
    query_string = urllib.parse.urlencode(
        {"q": query, "hl": "en-US", "gl": "US", "ceid": "US:en"},
        quote_via=urllib.parse.quote_plus,
    )
    return f"{BASE}?{query_string}"

def session(max_retries=5):
    """Create a ``requests.Session`` preloaded with HEADERS and HTTPS retries.

    Args:
        max_retries: Total retry budget handed to urllib3's ``Retry``.

    Returns:
        A session whose ``https://`` adapter retries responses with status
        429, 500, 502, 503 or 504 using a back-off factor of 1.
    """
    retry_policy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    http = requests.Session()
    http.headers.update(HEADERS)
    # Only the https:// prefix gets the retrying adapter.
    http.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return http

def fetch_month(url, sess, attempts=2, backoff=60):
    """Fetch one RSS search URL and return its parsed feed entries.

    Args:
        url: Fully built feed URL.
        sess: Session-like object exposing ``get(url, timeout=...)``.
        attempts: Maximum number of requests made while the server keeps
            answering 503 (default 2, i.e. one retry).
        backoff: Seconds to sleep between 503 attempts (default 60).

    Returns:
        The ``entries`` list produced by ``feedparser.parse`` on the body.

    Raises:
        RuntimeError: If the server still answers 503 after all attempts, or
            immediately for any other non-200 status (retrying those is
            pointless and the status is reported as-is).
    """
    total = max(attempts, 1)
    for attempt in range(1, total + 1):
        r = sess.get(url, timeout=15)
        if r.status_code == 200:
            return feedparser.parse(r.text).entries
        if r.status_code != 503:
            # Not a transient overload: fail fast with the real status code.
            raise RuntimeError(f"HTTP {r.status_code} → {url}")
        if attempt < total:
            time.sleep(backoff)                 # hard back-off, skipped after the final try
    raise RuntimeError(f"Still 503 → {url}")

def _published_date(entry):
    """Return the entry's publication date as 'YYYY-MM-DD', or None if absent/unparseable."""
    try:
        return parsedate_to_datetime(entry.published).strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        return None


def scrape(keywords, start, end, outfile):
    """Scrape headlines month by month, score them with VADER, and save a CSV.

    Args:
        keywords: Search terms passed to ``build_url``.
        start: First date of the scrape window.
        end: Last date of the scrape window.
        outfile: Path of the CSV to write.

    The CSV has one row per feed entry with ``date``, ``headline`` and the
    keys returned by ``SentimentIntensityAnalyzer.polarity_scores``. Entries
    without a title are skipped; entries whose publication date is missing or
    malformed are kept with an empty ``date`` so one bad item cannot abort the
    whole run.
    """
    sia, rows = SentimentIntensityAnalyzer(), []
    # The context manager closes pooled connections even if a fetch raises.
    with session() as sess:
        for i, (since, until) in enumerate(month_slices(start, end)):
            if i:
                time.sleep(random.uniform(6, 12))   # polite spacing between requests only
            url = build_url(keywords, since, until)
            for e in fetch_month(url, sess):
                title = getattr(e, "title", None)
                if not title:
                    continue
                rows.append({
                    "date": _published_date(e),
                    "headline": title,
                    **sia.polarity_scores(title),
                })

    if rows:
        frame = pd.DataFrame(rows)
    else:
        # Keep a header row so downstream read_csv calls still find the columns.
        frame = pd.DataFrame(
            columns=["date", "headline", "neg", "neu", "pos", "compound"]
        )
    frame.to_csv(outfile, index=False)
    print(f"Saved {len(rows)} rows → {outfile}")

# -------- run --------
# Scrape from the fixed start date up to the day this cell is executed.
today = datetime.date.today()
ipo = datetime.date(2025, 6, 23)
scrape(
    keywords="RBLX stock",
    start=ipo,
    end=today,
    outfile="../data/roblox_stock_sentiment.csv",
)


Saved 62 rows → ../data/roblox_stock_sentiment.csv


In [11]:


def flip_csv_by_date(input_csv, output_csv, date_column='date'):
    """Write a copy of a CSV sorted by date, newest first.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the sorted CSV to write.
        date_column: Name of the column holding the dates.

    Rows whose date is missing or cannot be parsed are dropped, and the number
    dropped is reported. Rows sharing the same date keep their original
    relative order (stable sort), so repeated runs give identical files.
    """
    df = pd.read_csv(input_csv)

    # Unparseable values become NaT instead of raising.
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    dropped = int(df[date_column].isna().sum())
    if dropped:
        print(f"Dropped {dropped} row(s) with missing or unparseable {date_column!r} values")

    sorted_df = (
        df.dropna(subset=[date_column])
          .sort_values(by=date_column, ascending=False, kind="stable")
    )

    sorted_df.to_csv(output_csv, index=False)
    print(f"Flipped CSV saved to: {output_csv}")

# Example usage: write a newest-first copy of the scraped sentiment CSV
flip_csv_by_date(
    input_csv="../data/roblox_stock_sentiment.csv",
    output_csv="../data/Sorted_Roblox_Stock_Sentiment.csv",
)

Flipped CSV saved to: ../data/Sorted_Roblox_Stock_Sentiment.csv


In [3]:
#!/usr/bin/env python3
"""
dedupe_headlines_fixed.py

Just edit the INPUT_CSV and OUTPUT_CSV variables below,
then run this script. It will remove duplicate rows based
on the 'headline' column (keeping the earliest 'date').
"""

import pandas as pd

# ─── EDIT THESE ───────────────────────────────────────────
INPUT_CSV    = '../data/Merged_Roblox_Sentiment_Data.csv'   # ← change this to your source file
OUTPUT_CSV   = 'clean_news_sentiment.csv'  # ← change this to your destination file
DATE_COL     = 'date'               # ← name of your date column
HEADLINE_COL = 'headline'           # ← name of your headline column
# ─────────────────────────────────────────────────────────

def remove_duplicates(input_file, output_file,
                      date_col=DATE_COL, headline_col=HEADLINE_COL):
    """Drop rows with repeated headlines, keeping the earliest-dated one.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the de-duplicated CSV to write.
        date_col: Name of the date column (parsed as dates on load).
        headline_col: Name of the column that identifies duplicates.

    The frame is sorted by date ascending with a stable sort, so when several
    copies of a headline share the same date the one appearing first in the
    input file is the one kept. The result is written in date order.
    """
    # Read CSV, parsing the date column
    df = pd.read_csv(input_file, parse_dates=[date_col])

    # Earliest dates first; kind="stable" makes tie-breaking deterministic.
    df_sorted = df.sort_values(by=date_col, ascending=True, kind="stable")

    # Drop later duplicates of the same headline
    df_deduped = df_sorted.drop_duplicates(subset=[headline_col], keep='first')

    # Save result
    df_deduped.to_csv(output_file, index=False)
    print(f"Wrote {len(df_deduped)} unique rows to '{output_file}'")

if __name__ == "__main__":
    # Entry point: de-duplicate the configured input file into the configured output file.
    remove_duplicates(input_file=INPUT_CSV, output_file=OUTPUT_CSV)



Wrote 1930 unique rows to 'deduped_output.csv'
