In [None]:
!nvidia-smi

Sun Nov 16 22:33:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ====== 0. IMPORTS & PATHS =====================================
from pathlib import Path
from datetime import datetime, timedelta, timezone
from google.colab import drive # <--- ADDED THIS

import time
import random
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. MOUNT GOOGLE DRIVE
# This connects your notebook to your permanent storage
drive.mount('/content/drive')

# 2. DEFINE THE PATHS
# OLD LINE (This was the problem):
# ROOT = Path.cwd()

# NEW LINE (This fixes it):
# We create a specific folder in your Drive so files don't get lost in the main folder
ROOT = Path("/content/drive/MyDrive/Colab_Project_Data")

DATA_RAW_NEWS   = ROOT / "data_raw" / "news"
DATA_CLEAN      = ROOT / "data_clean"

# 3. CREATE DIRECTORIES
# This will now create these folders inside your Google Drive
DATA_RAW_NEWS.mkdir(parents=True, exist_ok=True)
DATA_CLEAN.mkdir(parents=True, exist_ok=True)

print(f"✅ Success! Files will now be saved to: {DATA_RAW_NEWS}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Success! Files will now be saved to: /content/drive/MyDrive/Colab_Project_Data/data_raw/news


In [None]:
# ====== 1. GDELT CONFIG ========================================

GDELT_DOC_ENDPOINT = "https://api.gdeltproject.org/api/v2/doc/doc"

# GDELT query strings per ticker
# (can tweak later if one ticker looks too noisy / too sparse)
COMPANY_QUERIES = {
    "VLO":  '("Valero" OR "Valero Energy")',
    "XOM":  '("Exxon" OR "ExxonMobil" OR "Exxon Mobil")',
    "SHEL": '("Shell" OR "Royal Dutch Shell" OR "Shell plc")',
    # BP was causing trouble before; here we avoid the bare token "BP"
    "BP":   '("BP plc" OR "BP p.l.c." OR "British Petroleum")',
}

TICKERS = list(COMPANY_QUERIES.keys())

# --- date window (adjust if you like) ---
DATE_START = "2017-01-01"  # lower bound
DATE_END   = "2025-10-31"  # or any date you want

dt_start = datetime.strptime(DATE_START, "%Y-%m-%d")
dt_end   = datetime.strptime(DATE_END,   "%Y-%m-%d")

# ---- API + sampling limits ----
MAX_ARTICLES_PER_DAY  = 20   # **target**: 20 per ticker per day
MAXRECORDS_PER_CALL   = 40   # safety: GDELT will never return > this per call
SLEEP_SECONDS         = 0.5  # politeness between requests
USER_AGENT            = "thesis-sentiment-bot/1.0 (academic, non-commercial)"

print("Tickers:", TICKERS)
print("Date range:", dt_start.date(), "→", dt_end.date())


Tickers: ['VLO', 'XOM', 'SHEL', 'BP']
Date range: 2017-01-01 → 2025-10-31


In [None]:
# ====== 2. GDELT FETCH FUNCTION =================================

def gdelt_params(query, day):
    """
    Build GDELT /doc API parameters for a single day window.
    """
    # GDELT wants UTC timestamps like YYYYMMDDHHMMSS
    start_str = day.strftime("%Y%m%d000000")
    end_str   = day.strftime("%Y%m%d235959")
    return {
        "query":        query,
        "mode":         "ArtList",
        "maxrecords":   str(MAXRECORDS_PER_CALL),
        "format":       "json",
        "startdatetime": start_str,
        "enddatetime":   end_str,
    }


def fetch_gdelt_for_ticker(ticker, query, dt_start, dt_end):
    """
    Fetch up to MAX_ARTICLES_PER_DAY articles per day for a given ticker,
    between dt_start and dt_end (inclusive).
    Returns a DataFrame.
    """
    headers = {"User-Agent": USER_AGENT}
    all_rows = []

    # inclusive date range
    n_days = (dt_end - dt_start).days + 1
    dates = [dt_start + timedelta(days=i) for i in range(n_days)]

    for day in tqdm(dates, desc=f"[{ticker}] days"):
        params = gdelt_params(query, day)

        try:
            r = requests.get(GDELT_DOC_ENDPOINT, params=params, headers=headers, timeout=30)
        except Exception as e:
            print(f"[{ticker}] {day.date()} request error: {e}")
            continue

        if r.status_code != 200:
            print(f"[{ticker}] {day.date()} HTTP {r.status_code}; skipping")
            continue

        try:
            data = r.json()
        except Exception as e:
            print(f"[{ticker}] {day.date()} JSON error: {e}")
            continue

        articles = data.get("articles", [])
        if not articles:
            # nothing for this day
            time.sleep(SLEEP_SECONDS)
            continue

        # basic rows
        rows = []
        for a in articles:
            rows.append({
                "ticker":   ticker,
                "seendate": a.get("seendate"),
                "title":    a.get("title"),
                "url":      a.get("url"),
                "domain":   a.get("domain"),
                "language": a.get("language"),
                "sourcecountry": a.get("sourcecountry"),
            })

        day_df = pd.DataFrame(rows)

        # keep only English articles (you can relax this if needed)
        day_df = day_df[day_df["language"].str.lower() == "english"]

        if day_df.empty:
            time.sleep(SLEEP_SECONDS)
            continue

        # parse dates & enforce at most MAX_ARTICLES_PER_DAY
        day_df["seendate"] = pd.to_datetime(day_df["seendate"], errors="coerce")
        day_df["date"] = day_df["seendate"].dt.date

        # safety check: date should equal this 'day' (but we keep all anyway)
        # cap per day
        if len(day_df) > MAX_ARTICLES_PER_DAY:
            day_df = day_df.sample(MAX_ARTICLES_PER_DAY, random_state=42)

        all_rows.append(day_df)

        time.sleep(SLEEP_SECONDS)

    if not all_rows:
        print(f"[{ticker}] No articles found in entire window.")
        return pd.DataFrame(columns=[
            "ticker", "seendate", "title", "url",
            "domain", "language", "sourcecountry", "date"
        ])

    out = pd.concat(all_rows, ignore_index=True)
    out["date"] = pd.to_datetime(out["date"])
    out = out.sort_values(["date", "seendate"]).reset_index(drop=True)
    print(f"[{ticker}] final articles:", len(out),
          "| date range:", out["date"].min().date(), "→", out["date"].max().date())
    return out


In [None]:
# ====== 3. RUN FETCH FOR ALL TICKERS ============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

all_tickers_df = []

for tic in TICKERS:
    q = COMPANY_QUERIES[tic]
    print(f"\n========== Fetching {tic} ==========")
    df_t = fetch_gdelt_for_ticker(tic, q, dt_start, dt_end)

    # save per-ticker raw
    out_path = DATA_RAW_NEWS / f"gdelt_{tic}_raw.csv"
    df_t.to_csv(out_path, index=False)
    print(f"[{tic}] saved {len(df_t)} rows to {out_path}")

    all_tickers_df.append(df_t)

# combined raw file
if all_tickers_df:
    gdelt_all = pd.concat(all_tickers_df, ignore_index=True)
    combined_path = DATA_RAW_NEWS / "gdelt_articles_raw.csv"
    gdelt_all.to_csv(combined_path, index=False)
    print("\nCombined articles:", len(gdelt_all), "| Saved:", combined_path)
    display(gdelt_all.head())
else:
    print("No articles fetched for any ticker – check queries or date range.")


Using device: cpu



[VLO] days:   0%|          | 0/3226 [00:00<?, ?it/s]

[VLO] 2018-07-20 JSON error: Invalid control character '\n' at: line 1 column 10765 (char 10764)
[VLO] 2019-01-27 HTTP 429; skipping
[VLO] 2020-12-21 JSON error: Invalid \X escape sequence '\\': line 1 column 16161 (char 16160)
[VLO] 2021-11-17 JSON error: Invalid control character '\x07' at: line 1 column 13747 (char 13746)
[VLO] final articles: 40376 | date range: 2017-01-01 → 2025-10-31
[VLO] saved 40376 rows to /content/drive/MyDrive/Colab_Project_Data/data_raw/news/gdelt_VLO_raw.csv



[XOM] days:   0%|          | 0/3226 [00:00<?, ?it/s]

[XOM] 2020-05-15 JSON error: Invalid \X escape sequence '\\': line 1 column 8354 (char 8353)
[XOM] 2023-06-06 JSON error: Invalid \X escape sequence '\\': line 1 column 288 (char 287)
[XOM] 2024-10-24 JSON error: Invalid \X escape sequence '\\': line 1 column 10357 (char 10356)
[XOM] 2025-08-13 HTTP 429; skipping
[XOM] final articles: 62737 | date range: 2017-01-01 → 2025-10-31
[XOM] saved 62737 rows to /content/drive/MyDrive/Colab_Project_Data/data_raw/news/gdelt_XOM_raw.csv



[SHEL] days:   0%|          | 0/3226 [00:00<?, ?it/s]

[SHEL] 2017-09-21 JSON error: Invalid \X escape sequence '\\': line 1 column 2089 (char 2088)
[SHEL] 2018-11-10 JSON error: Invalid control character '\n' at: line 1 column 16512 (char 16511)
[SHEL] 2024-06-16 HTTP 429; skipping
[SHEL] final articles: 56197 | date range: 2017-01-01 → 2025-10-31
[SHEL] saved 56197 rows to /content/drive/MyDrive/Colab_Project_Data/data_raw/news/gdelt_SHEL_raw.csv



[BP] days:   0%|          | 0/3226 [00:00<?, ?it/s]

[BP] 2017-08-11 JSON error: Invalid \X escape sequence '\\': line 1 column 2204 (char 2203)
[BP] 2019-03-27 JSON error: Invalid control character '\n' at: line 1 column 807 (char 806)
[BP] 2020-12-23 JSON error: Invalid \X escape sequence '\\': line 1 column 5128 (char 5127)
[BP] final articles: 39411 | date range: 2017-01-01 → 2025-10-31
[BP] saved 39411 rows to /content/drive/MyDrive/Colab_Project_Data/data_raw/news/gdelt_BP_raw.csv

Combined articles: 198721 | Saved: /content/drive/MyDrive/Colab_Project_Data/data_raw/news/gdelt_articles_raw.csv


Unnamed: 0,ticker,seendate,title,url,domain,language,sourcecountry,date
0,VLO,2017-01-01 02:00:00+00:00,Valero Energy Corporation ( VLO ) Stake Reduce...,http://www.mideasttime.com/valero-energy-corpo...,mideasttime.com,English,United States,2017-01-01
1,VLO,2017-01-01 03:45:00+00:00,TheStreet downgraded Valero Energy Partners LP...,http://breakingfinancenews.com/investing/thest...,breakingfinancenews.com,English,,2017-01-01
2,VLO,2017-01-01 05:30:00+00:00,Delaware 105 . 9,http://delaware1059.com/story/29428-smyrna-man...,delaware1059.com,English,United States,2017-01-01
3,VLO,2017-01-01 07:30:00+00:00,Off,http://www.barcablaugranes.com/2017/1/1/141379...,barcablaugranes.com,English,Spain,2017-01-01
4,VLO,2017-01-01 08:30:00+00:00,Southern Petroleum Rebrands 26 Ky . Stations t...,http://www.cspdailynews.com/fuels-news-prices-...,cspdailynews.com,English,,2017-01-01


In [None]:
# ====== 4. FINBERT SCORING =====================================

ARTICLES_CSV = DATA_RAW_NEWS / "gdelt_articles_raw.csv"
SCORED_CSV   = DATA_RAW_NEWS / "finbert_articles_scored.csv"
DAILY_CSV    = DATA_CLEAN      / "finbert_daily_avg.csv"

MAX_PER_DAY_FOR_SCORING = 20   # final safety cap
BATCH_SIZE = 128                # adjust if you have more/less memory

articles = pd.read_csv(ARTICLES_CSV)

# parse dates
articles["date"] = pd.to_datetime(articles["date"], errors="coerce")
articles["seendate"] = pd.to_datetime(articles["seendate"], errors="coerce")

# drop missing essential fields
articles = articles.dropna(subset=["ticker", "date", "title"]).copy()
articles["ticker"] = articles["ticker"].astype(str)

articles = articles.sort_values(["ticker", "date", "seendate"]).reset_index(drop=True)

print("Total raw articles:", len(articles))
print("Date range:", articles["date"].min().date(), "→", articles["date"].max().date())
print(articles["ticker"].value_counts())

# ---- enforce <= MAX_PER_DAY_FOR_SCORING per ticker / day ----
sampled = []
for (tic, day), grp in articles.groupby(["ticker", "date"], sort=False):
    if len(grp) > MAX_PER_DAY_FOR_SCORING:
        grp = grp.sample(MAX_PER_DAY_FOR_SCORING, random_state=42)
    sampled.append(grp)

to_score = pd.concat(sampled, ignore_index=True)
to_score = to_score.sort_values(["ticker", "date", "seendate"]).reset_index(drop=True)

print("\nAfter daily cap for scoring:")
print("Articles to score:", len(to_score))
print("Date range:", to_score["date"].min().date(), "→", to_score["date"].max().date())
print(to_score["ticker"].value_counts())


Total raw articles: 198653
Date range: 2017-01-01 → 2025-10-31
ticker
XOM     62714
SHEL    56172
VLO     40365
BP      39402
Name: count, dtype: int64

After daily cap for scoring:
Articles to score: 197333
Date range: 2017-01-01 → 2025-10-31
ticker
XOM     62167
SHEL    55845
VLO     40101
BP      39220
Name: count, dtype: int64


In [None]:
# ====== 5. LOAD FINBERT & SCORE ================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()

texts = to_score["title"].astype(str).tolist()

all_pos, all_neu, all_neg = [], [], []

for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Scoring with FinBERT"):
    batch_texts = texts[i:i + BATCH_SIZE]
    enc = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()

    # FinBERT order: [negative, neutral, positive]
    neg = probs[:, 0]
    neu = probs[:, 1]
    pos = probs[:, 2]

    all_pos.append(pos)
    all_neu.append(neu)
    all_neg.append(neg)

pos = np.concatenate(all_pos)
neu = np.concatenate(all_neu)
neg = np.concatenate(all_neg)

to_score["positive"] = pos
to_score["neutral"]  = neu
to_score["negative"] = neg
to_score["sentiment_score"] = to_score["positive"] - to_score["negative"]

to_score.to_csv(SCORED_CSV, index=False)
print("Saved article-level scores:", SCORED_CSV)
display(to_score.head())


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Scoring with FinBERT:   0%|          | 0/1542 [00:00<?, ?it/s]

Saved article-level scores: /content/drive/MyDrive/Colab_Project_Data/data_raw/news/finbert_articles_scored.csv


Unnamed: 0,ticker,seendate,title,url,domain,language,sourcecountry,date,positive,neutral,negative,sentiment_score
0,BP,2017-01-01 02:15:00+00:00,BP p . l . c . ( NYSE : BP ) paid $1 . 3bn for...,http://www.benchmarkmonitor.com/2016/12/29/bp-...,benchmarkmonitor.com,English,United States,2017-01-01,0.886564,0.012732,0.100704,0.785859
1,BP,2017-01-01 10:00:00+00:00,SK Innovation to invest up to $2 . 5 bn in 2017,http://pulsenews.co.kr/view.php?sc=30800021&ye...,pulsenews.co.kr,English,,2017-01-01,0.763762,0.008067,0.228171,0.535592
2,BP,2017-01-01 10:15:00+00:00,Lord Lamont of Lerwick,https://wn.com/Lord_Lamont_of_Lerwick,wn.com,English,United States,2017-01-01,0.894848,0.077126,0.028026,0.866822
3,BP,2017-01-01 12:00:00+00:00,USM save Mississippi oyster industry with new ...,http://www.sunherald.com/news/local/article123...,sunherald.com,English,United States,2017-01-01,0.386003,0.008327,0.60567,-0.219666
4,BP,2017-01-01 15:00:00+00:00,Energy Voice,https://www.energyvoice.com/oilandgas/127903/e...,energyvoice.com,English,,2017-01-01,0.920554,0.030687,0.048759,0.871794


In [None]:
# ====== 6. DAILY AVERAGES ======================================

daily = (
    to_score
    .groupby(["ticker", "date"], as_index=False)
    .agg(
        sentiment_avg=("sentiment_score", "mean"),
        n_articles=("sentiment_score", "size"),
        pos_avg=("positive", "mean"),
        neg_avg=("negative", "mean"),
        neu_avg=("neutral", "mean"),
    )
    .sort_values(["ticker", "date"])
)

daily.to_csv(DAILY_CSV, index=False)
print("Saved daily averages:", DAILY_CSV)
display(daily.head())


Saved daily averages: /content/drive/MyDrive/Colab_Project_Data/data_clean/finbert_daily_avg.csv


Unnamed: 0,ticker,date,sentiment_avg,n_articles,pos_avg,neg_avg,neu_avg
0,BP,2017-01-01,0.521038,9,0.708479,0.187441,0.10408
1,BP,2017-01-02,0.423605,15,0.663194,0.239589,0.097217
2,BP,2017-01-03,0.276786,19,0.628054,0.351268,0.020678
3,BP,2017-01-04,0.357096,20,0.64133,0.284234,0.074435
4,BP,2017-01-05,0.683861,20,0.750827,0.066965,0.182208
