In [None]:
# 0) Tooling and NumPy ABI repair (prevents "numpy.dtype size changed" error)
%pip install --upgrade pip wheel setuptools
%pip install --upgrade numpy==1.24.2  # Use a compatible version of NumPy

# 1) Core data and utilties
%pip install pandas pyarrow tqdm

# 2) Modeling, evalaution, and plots
%pip install scikit-learn matplotlib seaborn

# 4) Sentiment baselines (VADER)
%pip install nltk

# Verify environment (safe guards)
import sys
print("Python:", sys.version)

try:
    import torch
    print("PyTorch:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
except Exception as e:
    torch = None
    print("PyTorch not available or failed to import:", repr(e))

# Download VADER lexicon
import nltk
try:
    nltk.download('vader_lexicon', quiet=True)
    print("VADER lexicon ready")
except Exception as e:
    print("NLTK download error:", repr(e))

# Deterministic seeds
import random, numpy as np
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
if 'torch' in globals() and torch is not None:
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)

print("Environment ready. Seed set to", SEED)


Collecting numpy==1.24.2
  Using cached numpy-1.24.2.tar.gz (10.9 MB)
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m No available output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[31mERROR: Failed to build 'numpy' when getting requirements to build wheel[0m[31m
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch: 2.9.0+cu126
CUDA available: True
VADER lexicon ready
Environment ready. Seed set to 42


In [None]:
!pip install datasets
!pip install huggingface_hub

!pip install --upgrade transformers



In [None]:
# Cell 1 — Mont Drive, set paths, verify FSNPID layout, and seed env

from google.colab import drive
drive.mount('/content/drive')

import os, glob, sys, random
import numpy as np

# Base directory in Drive
BASE = "/content/drive/MyDrive/NLP_Project/FSNPID"

NEWS_DIR   = f"{BASE}/Stock_news"
PRICES_DIR = f"{BASE}/Stock_price/full_history"

NEWS_ALL   = f"{NEWS_DIR}/All_external.csv"
NEWS_NAS   = f"{NEWS_DIR}/nasdaq_exteral_data.csv"

# Verifying existence
print("Base exists:         ", os.path.exists(BASE))
print("News dir exists:     ", os.path.exists(NEWS_DIR))
print("Prices dir exists:   ", os.path.exists(PRICES_DIR))
print("All_external.csv:    ", os.path.exists(NEWS_ALL))
print("nasdaq_external_data:", os.path.exists(NEWS_NAS))

# Quick file size report to check if it is present
def size_gb(p):
    try:
        return f"{os.path.getsize(p)/(1024**3):.2f} GB"
    except Exception:
        return "N/A"

if os.path.exists(NEWS_ALL):
    print("All_external.csv size:    ", size_gb(NEWS_ALL))
if os.path.exists(NEWS_NAS):
    print("nasdaq_external_data size:", size_gb(NEWS_NAS))

# Count price CSVs
price_csvs = glob.glob(os.path.join(PRICES_DIR, "*.csv"))
print("Per-ticker price CSVs found:", len(price_csvs))

# Environment info
print("Python:", sys.version)
try:
    import torch
    print("PyTorch:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
except Exception as e:
    torch = None
    print("PyTorch not available or failed to import:", repr(e))

# Deterministic seeds
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
if 'torch' in globals() and torch is not None:
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
print("Environment ready. Seed set to", SEED)


Mounted at /content/drive
Base exists:          True
News dir exists:      True
Prices dir exists:    True
All_external.csv:     True
nasdaq_external_data: True
All_external.csv size:     5.34 GB
nasdaq_external_data size: 21.64 GB
Per-ticker price CSVs found: 7693
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch: 2.9.0+cu126
CUDA available: True
Environment ready. Seed set to 42


In [None]:
# Peek 5 rows from thee Drive-hosted FNSPID news CSV to inspect schema

import os
import pandas as pd

# Reusing paths from Cell 1
# NEWS_NAS preferred; fallback to NEWS_ALL
NEWS_CSV = NEWS_NAS if os.path.exists(NEWS_NAS) else NEWS_ALL
assert os.path.exists(NEWS_CSV), "News CSV not found; check NEWS_NAS/NEWS_ALL paths."

# Reading a small head with all columns as strings to avoid type coercion
df_head = pd.read_csv(
    NEWS_CSV,
    nrows=5,
    dtype="string",
    low_memory=False,
    on_bad_lines="skip"
)

print("Detected columns:", list(df_head.columns))
print("\nSample rows:")
print(df_head.head(5).T)  # transpose for compact per-column view


Detected columns: ['Unnamed: 0', 'Date', 'Article_title', 'Stock_symbol', 'Url', 'Publisher', 'Author', 'Article', 'Lsa_summary', 'Luhn_summary', 'Textrank_summary', 'Lexrank_summary']

Sample rows:
                                                                  0  \
Unnamed: 0                                                      0.0   
Date                                        2023-12-16 23:00:00 UTC   
Article_title     Interesting A Put And Call Options For August ...   
Stock_symbol                                                      A   
Url               https://www.nasdaq.com/articles/interesting-a-...   
Publisher                                                      <NA>   
Author                                                         <NA>   
Article           Investors in Agilent Technologies, Inc. (Symbo...   
Lsa_summary       Because the $125.00 strike represents an appro...   
Luhn_summary      The current analytical data (including greeks ...   
Textrank_summary  Be

In [None]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Reuse paths from Cell 1: NEWS_NAS, NEWS_ALL
NEWS_CSV = NEWS_NAS if os.path.exists(NEWS_NAS) else NEWS_ALL
assert os.path.exists(NEWS_CSV), "News CSV not found; check NEWS_NAS/NEWS_ALL."

# Choosing initial stickers
TARGET_TICKERS = {"AAPL","MSFT","AMZN","GOOGL","META","NVDA","TSLA","JPM","BAC","XOM"}

OUT_PARQUET = "/content/drive/MyDrive/NLP_Project/fnspid_news_2018_2024_subset.parquet"

def norm_date(s: str) -> str:
    s = str(s)
    if " " in s: s = s.split(" ")[0]
    if "T" in s: s = s.split("T")[0]
    return s

if os.path.exists(OUT_PARQUET):
    print(f"Parquet file already exists at {OUT_PARQUET}, loading directly.")
    df_news = pd.read_parquet(OUT_PARQUET)
    print("Subset shape:", df_news.shape)
    print(df_news.head(5)[["publish_date","ticker","headline"]])
else:
    print("Parquet file does not exist, processing CSV chunks and creating parquet.")
    # Confirmed columns in the file:
    # ['Unnamed: 0','Date','Article_title','Stock_symbol','Url','Publisher','Author','Article', ...]
    usecols = ["Date","Article_title","Stock_symbol","Article","Url","Publisher","Author"]

    reader = pd.read_csv(
        NEWS_CSV,
        usecols=usecols,
        dtype="string",           # keep all strings to avoid casting issues
        chunksize=250_000,
        low_memory=False,
        on_bad_lines="skip"
    )

    writer = None
    accum = 0

    for i, chunk in enumerate(reader, start=1):
        # Rename to canonical names used downstream
        chunk = chunk.rename(columns={
            "Date": "publish_date",
            "Article_title": "headline",
            "Stock_symbol": "ticker",
            "Article": "body",
        })

        # Normalize date and filter
        chunk["publish_date"] = chunk["publish_date"].map(norm_date)
        sub = chunk.loc[
            chunk["publish_date"].between("2018-01-01","2024-12-31") &
            chunk["ticker"].isin(TARGET_TICKERS),
            ["publish_date","ticker","headline","body","Url","Publisher","Author"]
        ].copy()

        if not sub.empty:
            table = pa.Table.from_pandas(sub, preserve_index=False)
            if writer is None:
                writer = pq.ParquetWriter(OUT_PARQUET, table.schema, compression="snappy")
            writer.write_table(table)
            accum += len(sub)

        if i % 10 == 0:
            print(f"Chunks processed: {i}, subset rows so far: {accum:,}")

    if writer is not None:
        writer.close()
    print("Saved subset:", OUT_PARQUET, "rows:", accum)

    # Sanity check
    if os.path.exists(OUT_PARQUET) and accum > 0:
        df_news = pd.read_parquet(OUT_PARQUET)
        print("Subset shape:", df_news.shape)
        print(df_news.head(5)[["publish_date","ticker","headline"]])
    else:
        print("No rows matched filters. Consider expanding tickers or years.")


Parquet file already exists at /content/drive/MyDrive/NLP_Project/fnspid_news_2018_2024_subset.parquet, loading directly.
Subset shape: (56506, 7)
  publish_date ticker                                           headline
0   2023-12-16   AAPL  My 6 Largest Portfolio Holdings Heading Into 2...
1   2023-12-16   AAPL  Brokers Suggest Investing in Apple (AAPL): Rea...
2   2023-12-16   AAPL                      Company News for Dec 19, 2023
3   2023-12-16   AAPL  NVIDIA (NVDA) Up 243% YTD: Will It Carry Momen...
4   2023-12-16   AAPL  Pre-Market Most Active for Dec 19, 2023 : BMY,...


In [None]:
print(df_news.sample(100))


      publish_date ticker                                           headline  \
29618   2022-04-05   NVDA           Don’t Let TWLO Stock Become Your Problem   
21722   2022-06-24   MSFT  Got $2,500? 2 Top Stocks That You Can Buy and ...   
32318   2023-10-01   TSLA  Tesla’s about to report Q3 deliveries; Don’t g...   
43607   2022-01-27    XOM  U.S. judge annuls Gulf of Mexico oil lease sal...   
26364   2023-06-07   NVDA  Momo's (MOMO) Q1 Earnings Beat Estimates, Reve...   
...            ...    ...                                                ...   
16055   2023-07-28   MSFT  Oppenheimer Reiterates Microsoft (MSFT) Outper...   
34667   2023-05-13   TSLA  New Twitter CEO says she is excited to help to...   
55050   2019-07-15    JPM  FANG Stocks Set For D.C. Mauling; More Bank Ea...   
42214   2022-10-05    XOM           Is the Oil Price Going to Keep Going Up?   
45413   2020-08-18    XOM  ANALYSIS-U.S. activists complain that virtual ...   

                                       

In [None]:
# Cell 3 — Loading & normalizing OHLCV prices (2018–2024) with ticker from filename

import os, glob, re
import pandas as pd

assert os.path.exists(PRICES_DIR), f"Prices dir not found: {PRICES_DIR}"

price_files = glob.glob(os.path.join(PRICES_DIR, "*.csv"))
print("Price CSVs found:", len(price_files))

def norm_date(s: str) -> str:
    s = str(s)
    if " " in s: s = s.split(" ")[0]
    if "T" in s: s = s.split("T")[0]
    return s

def infer_ticker(path: str) -> str:
    # e.g., "/.../AAPL.csv" -> "AAPL"; handles dots or suffixes like ".US.csv" -> "US" variant
    base = os.path.basename(path)
    name = re.sub(r"\.csv$", "", base, flags=re.IGNORECASE)
    # If names like "AAPL.US", take left part
    return name.split(".")[0].upper()

prices = []
for fp in price_files:
    try:
        dfp = pd.read_csv(fp, low_memory=False)
        # Add ticker from filename
        dfp["ticker"] = infer_ticker(fp)

        # Normalize date column
        if "date" in dfp.columns:
            dfp["date"] = dfp["date"].map(norm_date)
        elif "Date" in dfp.columns:
            dfp["date"] = dfp["Date"].map(norm_date)
        else:
            # Skip files without a date column
            # print("Skipping (no date):", fp)
            continue

        # Standardize OHLCV names
        rename_map = {}
        if "Close" in dfp.columns and "close" not in dfp.columns: rename_map["Close"] = "close"
        if "Open"  in dfp.columns and "open"  not in dfp.columns: rename_map["Open"]  = "open"
        if "High"  in dfp.columns and "high"  not in dfp.columns: rename_map["High"]  = "high"
        if "Low"   in dfp.columns and "low"   not in dfp.columns: rename_map["Low"]   = "low"
        if "Volume"in dfp.columns and "volume"not in dfp.columns: rename_map["Volume"]= "volume"
        if rename_map:
            dfp = dfp.rename(columns=rename_map)

        # Minimal columns present
        keep_cols = [c for c in ["ticker","date","open","high","low","close","volume"] if c in dfp.columns]
        dfp = dfp[keep_cols]

        # Year filter
        dfp = dfp[dfp["date"].between("2018-01-01","2024-12-31")]

        if not dfp.empty:
            prices.append(dfp)
    except Exception as e:
        print("Read error:", fp, repr(e))

# Concatenate all tickers
price_df = pd.concat(prices, ignore_index=True)
price_df = price_df.sort_values(["ticker","date"]).reset_index(drop=True)

print("Normalized prices shape:", price_df.shape)
print("Columns:", list(price_df.columns))
print(price_df.head(3))


Price CSVs found: 7693
Normalized prices shape: (8171139, 7)
Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
  ticker        date       open       high        low      close     volume
0      A  2018-01-02  67.419998  67.889999  67.339996  67.599998  1047800.0
1      A  2018-01-03  67.620003  69.489998  67.599998  69.320000  1698900.0
2      A  2018-01-04  69.540001  69.820000  68.779999  68.800003  2230700.0


In [None]:
# Cell 4 — Filter price_df by target tickers and date range to align with news dataset

import pandas as pd

TARGET_TICKERS = {"AAPL","MSFT","AMZN","GOOGL","META","NVDA","TSLA","JPM","BAC","XOM"}

# Ensure date column is datetime
price_df['date'] = pd.to_datetime(price_df['date'], errors='coerce')

# Filter for target tickers
price_df = price_df[price_df['ticker'].isin(TARGET_TICKERS)]

# Filter for date range: 2018-01-01 to 2024-12-31
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2024-12-31')
price_df = price_df[(price_df['date'] >= start_date) & (price_df['date'] <= end_date)]

# Optional: convert date back to string YYYY-MM-DD if needed to match news dataset format
price_df['date'] = price_df['date'].dt.strftime('%Y-%m-%d')

print("Filtered prices shape:", price_df.shape)
print(price_df["ticker"].value_counts())


Filtered prices shape: (11752, 7)
ticker
AAPL     1508
AMZN     1508
JPM      1508
NVDA     1508
MSFT     1508
TSLA     1508
XOM      1508
BAC       630
GOOGL     566
Name: count, dtype: int64


In [None]:
# Cell 4 — Labeling next-day move (close-to-close) and creating time-aware splits

import pandas as pd
import numpy as np

#NEWS_PARQUET = "/content/fnspid_news_2018_2024_subset.parquet"
NEWS_PARQUET = "/content/drive/MyDrive/NLP_Project/fnspid_news_2018_2024_subset.parquet"

# Load the news subset created in Cell 2
news_df = pd.read_parquet(NEWS_PARQUET)
print("News subset:", news_df.shape)

# Build close_t and close_{t+1} per ticker/date from price_df (already in memory from Cell 3)
close_df = price_df[["ticker","date","close"]].copy().sort_values(["ticker","date"])
close_df["close_next"] = close_df.groupby("ticker")["close"].shift(-1)

# Merge close_t at publish_date = t
news_df = news_df.merge(
    close_df[["ticker","date","close"]].rename(columns={"date":"publish_date","close":"close_t"}),
    on=["ticker","publish_date"], how="left"
)

# Merge close_{t+1} using the same publish_date key
news_df = news_df.merge(
    close_df[["ticker","date","close_next"]].rename(columns={"date":"publish_date"}),
    on=["ticker","publish_date"], how="left"
)

# Compute next-day return and label with deadband
news_df["ret_next"] = (news_df["close_next"] - news_df["close_t"]) / news_df["close_t"]

tau = 0.002  # 0.2% deadband
def label_ret(r):
    if pd.isna(r): return np.nan
    if r >= tau: return "Up"
    if r <= -tau: return "Down"
    return "NoChange"

news_df["label"] = news_df["ret_next"].map(label_ret)

# Drop rows without labels (e.g., missing price_t or price_{t+1})
labeled_df = news_df.dropna(subset=["label"]).copy()
print("Labeled rows:", labeled_df.shape)

# Time-aware splits
train = labeled_df[labeled_df["publish_date"].between("2018-01-01","2021-12-31")]
val   = labeled_df[labeled_df["publish_date"].between("2022-01-01","2022-12-31")]
test  = labeled_df[labeled_df["publish_date"].between("2023-01-01","2024-12-31")]

# Persist splits for Experiment 1
train.to_parquet("/content/fnspid_train.parquet", index=False)
val.to_parquet("/content/fnspid_val.parquet", index=False)
test.to_parquet("/content/fnspid_test.parquet", index=False)

print("Split sizes:", len(train), len(val), len(test))
for name, part in [("train",train),("val",val),("test",test)]:
    print(f"{name} label share:\n", part["label"].value_counts(normalize=True).round(3))


News subset: (56506, 7)
Labeled rows: (51342, 11)
Split sizes: 12323 12186 26833
train label share:
 label
Up          0.478
Down        0.439
NoChange    0.082
Name: proportion, dtype: float64
val label share:
 label
Down        0.490
Up          0.445
NoChange    0.066
Name: proportion, dtype: float64
test label share:
 label
Up          0.494
Down        0.414
NoChange    0.092
Name: proportion, dtype: float64


In [None]:
import random

# Sampling some ticker-date pairs from labeled news data
sample_size = 5
sampled_rows = labeled_df[['ticker', 'publish_date']].drop_duplicates().sample(sample_size, random_state=42)

print("Spot check samples:")

for _, row in sampled_rows.iterrows():
    ticker = row['ticker']
    date = row['publish_date']

    # Price from merged news dataset (close_t)
    price_news = labeled_df[(labeled_df['ticker'] == ticker) & (labeled_df['publish_date'] == date)]['close_t'].iloc[0]

    # Price from price_df (raw price dataset)
    # Ensuring date column in price_df is string formatted as 'YYYY-MM-DD'
    price_df_date = price_df[
        (price_df['ticker'] == ticker) &
        (price_df['date'] == date)
    ]['close']

    price_raw = price_df_date.iloc[0] if not price_df_date.empty else None

    print(f"Ticker: {ticker}, Date: {date}")
    print(f"  Merged close_t in news data: {price_news}")
    print(f"  Raw close price in price_df: {price_raw}")
    print("-" * 50)


Spot check samples:
Ticker: XOM, Date: 2020-03-30
  Merged close_t in news data: 37.5
  Raw close price in price_df: 37.5
--------------------------------------------------
Ticker: XOM, Date: 2021-11-17
  Merged close_t in news data: 64.30999755859375
  Raw close price in price_df: 64.30999755859375
--------------------------------------------------
Ticker: TSLA, Date: 2022-10-21
  Merged close_t in news data: 214.44000244140625
  Raw close price in price_df: 214.44000244140625
--------------------------------------------------
Ticker: TSLA, Date: 2020-03-24
  Merged close_t in news data: 505.0
  Raw close price in price_df: 505.0
--------------------------------------------------
Ticker: GOOGL, Date: 2018-09-11
  Merged close_t in news data: 1189.989990234375
  Raw close price in price_df: 1189.989990234375
--------------------------------------------------


In [None]:
# Cell 6 — Compute VADER sentiment features for headline/body

import pandas as pd
from tqdm import tqdm
import numpy as np
import nltk

# Confirming lexicon
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def add_vader(df, text_col, prefix):
    scores = []
    # Using fillna("") to avoid None issues
    for txt in tqdm(df[text_col].fillna(""), total=len(df), desc=f"VADER {prefix}"):
        s = sia.polarity_scores(txt)
        scores.append((s["compound"], s["pos"], s["neu"], s["neg"]))
    out = pd.DataFrame(scores, columns=[f"{prefix}_compound", f"{prefix}_pos", f"{prefix}_neu", f"{prefix}_neg"])
    return pd.concat([df.reset_index(drop=True), out], axis=1)

paths = {
    "train": "/content/fnspid_train.parquet",
    "val":   "/content/fnspid_val.parquet",
    "test":  "/content/fnspid_test.parquet",
}

aug_paths = {}

for split, path in paths.items():
    df = pd.read_parquet(path)
    # Headline VADER
    df = add_vader(df, "headline", "vader_headline")

    if "body" in df.columns:
        # Only compute for non-empty to save time; fill missing scores with zeros
        mask = df["body"].notna() & (df["body"].str.len() > 0)
        df_body = df.loc[mask].copy()
        if len(df_body):
            df_body = add_vader(df_body, "body", "vader_body")
            df.loc[mask, ["vader_body_compound","vader_body_pos","vader_body_neu","vader_body_neg"]] = \
                df_body[["vader_body_compound","vader_body_pos","vader_body_neu","vader_body_neg"]].values
        # Filling remaining NaNs for body scores with 0
        for c in ["vader_body_compound","vader_body_pos","vader_body_neu","vader_body_neg"]:
            if c in df.columns:
                df[c] = df[c].fillna(0.0)

    out = f"/content/fnspid_{split}_vader.parquet"
    df.to_parquet(out, index=False)
    aug_paths[split] = out
    print(split, "->", out, "rows:", len(df))


VADER vader_headline: 100%|██████████| 12323/12323 [00:01<00:00, 7725.71it/s]
VADER vader_body: 100%|██████████| 3290/3290 [00:27<00:00, 119.80it/s]


train -> /content/fnspid_train_vader.parquet rows: 12323


VADER vader_headline: 100%|██████████| 12186/12186 [00:01<00:00, 8995.99it/s]
VADER vader_body: 100%|██████████| 12186/12186 [01:39<00:00, 122.56it/s]


val -> /content/fnspid_val_vader.parquet rows: 12186


VADER vader_headline: 100%|██████████| 26833/26833 [00:02<00:00, 9080.71it/s]
VADER vader_body: 100%|██████████| 26833/26833 [03:21<00:00, 132.91it/s]


test -> /content/fnspid_test_vader.parquet rows: 26833


In [None]:
# Cell 7 — Assemble feature matrices and encoded labels for baselines

import pandas as pd
import numpy as np

paths = {
    "train": "/content/fnspid_train_vader.parquet",
    "val":   "/content/fnspid_val_vader.parquet",
    "test":  "/content/fnspid_test_vader.parquet",
}

def load_feats(path, include_body=False):
    df = pd.read_parquet(path)
    # Label mapping
    label_map = {"Down": 0, "NoChange": 1, "Up": 2}
    y = df["label"].map(label_map).astype(int).values

    # Headline VADER features
    cols = ["vader_headline_compound","vader_headline_pos","vader_headline_neu","vader_headline_neg"]

    # Optionally add body VADER
    if include_body:
        body_cols = ["vader_body_compound","vader_body_pos","vader_body_neu","vader_body_neg"]
        # If body columns don't exist (or are all NaN), fill with zeros
        for c in body_cols:
            if c not in df.columns:
                df[c] = 0.0
            else:
                df[c] = df[c].fillna(0.0)
        cols += body_cols

    X = df[cols].astype(float).values
    return X, y, df

# Prepare two settings: headline-only and headline+body
Xtr_h, ytr, dftr = load_feats(paths["train"], include_body=False)
Xva_h, yva, dfva = load_feats(paths["val"],   include_body=False)
Xte_h, yte, dfte = load_feats(paths["test"],  include_body=False)

Xtr_hb, _, _ = load_feats(paths["train"], include_body=True)
Xva_hb, _, _ = load_feats(paths["val"],   include_body=True)
Xte_hb, _, _ = load_feats(paths["test"],  include_body=True)

print("Headline-only shapes:", Xtr_h.shape, Xva_h.shape, Xte_h.shape)
print("Headline+body shapes:", Xtr_hb.shape, Xva_hb.shape, Xte_hb.shape)


Headline-only shapes: (12323, 4) (12186, 4) (26833, 4)
Headline+body shapes: (12323, 8) (12186, 8) (26833, 8)


In [None]:
# Cell 8 — Baseline models with VADER features

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np

def eval_model(clf, Xtr, ytr, Xva, yva, Xte, yte, name):
    clf.fit(Xtr, ytr)
    pva = clf.predict(Xva)
    pte = clf.predict(Xte)
    val_acc = accuracy_score(yva, pva)
    val_f1  = f1_score(yva, pva, average="macro")
    test_acc = accuracy_score(yte, pte)
    test_f1  = f1_score(yte, pte, average="macro")
    print(f"\n=== {name} ===")
    print(f"Val  Acc: {val_acc:.4f}  Macro-F1: {val_f1:.4f}")
    print(f"Test Acc: {test_acc:.4f}  Macro-F1: {test_f1:.4f}")
    print("Val report:\n", classification_report(yva, pva, digits=3))
    print("Test report:\n", classification_report(yte, pte, digits=3))
    print("Test CM:\n", confusion_matrix(yte, pte))


# Headline-only
lr_h = LogisticRegression(max_iter=200, n_jobs=-1, class_weight=None, solver="lbfgs")
rf_h = RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, random_state=42)

eval_model(lr_h, Xtr_h, ytr, Xva_h, yva, Xte_h, yte, "LogReg (headline VADER)")   # [web:4]
eval_model(rf_h, Xtr_h, ytr, Xva_h, yva, Xte_h, yte, "RF (headline VADER)")       # [web:4]

# Headline+body
lr_hb = LogisticRegression(max_iter=200, n_jobs=-1, class_weight=None, solver="lbfgs")
rf_hb = RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1, random_state=42)

eval_model(lr_hb, Xtr_hb, ytr, Xva_hb, yva, Xte_hb, yte, "LogReg (headline+body VADER)")  # [web:4]
eval_model(rf_hb,  Xtr_hb, ytr, Xva_hb, yva, Xte_hb, yte, "RF (headline+body VADER)")      # [web:4]



=== LogReg (headline VADER) ===
Val  Acc: 0.4443  Macro-F1: 0.2057
Test Acc: 0.4939  Macro-F1: 0.2208
Val report:
               precision    recall  f1-score   support

           0      0.333     0.001     0.002      5967
           1      0.000     0.000     0.000       799
           2      0.444     0.998     0.615      5420

    accuracy                          0.444     12186
   macro avg      0.259     0.333     0.206     12186
weighted avg      0.361     0.444     0.275     12186

Test report:
               precision    recall  f1-score   support

           0      0.375     0.001     0.001     11114
           1      0.000     0.000     0.000      2465
           2      0.494     1.000     0.661     13254

    accuracy                          0.494     26833
   macro avg      0.290     0.333     0.221     26833
weighted avg      0.399     0.494     0.327     26833

Test CM:
 [[    6     0 11108]
 [    4     0  2461]
 [    6     0 13248]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== RF (headline VADER) ===
Val  Acc: 0.4498  Macro-F1: 0.2995
Test Acc: 0.4679  Macro-F1: 0.3012
Val report:
               precision    recall  f1-score   support

           0      0.486     0.246     0.327      5967
           1      0.055     0.010     0.017       799
           2      0.444     0.739     0.555      5420

    accuracy                          0.450     12186
   macro avg      0.328     0.332     0.299     12186
weighted avg      0.439     0.450     0.408     12186

Test report:
               precision    recall  f1-score   support

           0      0.406     0.225     0.289     11114
           1      0.079     0.010     0.017      2465
           2      0.492     0.757     0.597     13254

    accuracy                          0.468     26833
   macro avg      0.326     0.331     0.301     26833
weighted avg      0.419     0.468     0.416     26833

Test CM:
 [[ 2499   137  8478]
 [  577    24  1864]
 [ 3077   144 10033]]

=== LogReg (headline+body VADER) ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== RF (headline+body VADER) ===
Val  Acc: 0.4774  Macro-F1: 0.3372
Test Acc: 0.4510  Macro-F1: 0.3179
Val report:
               precision    recall  f1-score   support

           0      0.505     0.492     0.499      5967
           1      0.116     0.013     0.023       799
           2      0.457     0.529     0.490      5420

    accuracy                          0.477     12186
   macro avg      0.359     0.345     0.337     12186
weighted avg      0.458     0.477     0.464     12186

Test report:
               precision    recall  f1-score   support

           0      0.411     0.479     0.443     11114
           1      0.090     0.005     0.010      2465
           2      0.492     0.510     0.501     13254

    accuracy                          0.451     26833
   macro avg      0.331     0.332     0.318     26833
weighted avg      0.422     0.451     0.432     26833

Test CM:
 [[5329   60 5725]
 [1209   13 1243]
 [6421   72 6761]]


In [None]:
# Cell — FinBERT sentiment probabilities on headline

%pip -q install transformers

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "ProsusAI/finbert"  # 3-class financial sentiment
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()

def finbert_scores(texts, batch_size=64, max_len=128):
    probs_all = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch, padding=True, truncation=True, max_length=max_len,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model(**enc)
            p = torch.softmax(out.logits, dim=-1).cpu().numpy()
            probs_all.append(p)
    import numpy as np
    return np.vstack(probs_all)

paths = {
    "train": "/content/fnspid_train.parquet",
    "val":   "/content/fnspid_val.parquet",
    "test":  "/content/fnspid_test.parquet",
}

aug = {}
for split, path in paths.items():
    df = pd.read_parquet(path)
    texts = df["headline"].fillna("").tolist()
    P = finbert_scores(texts, batch_size=128, max_len=64)  # headlines are short
    # FinBERT label order is typically: positive, negative, neutral
    df["fb_pos"] = P[:, 0]
    df["fb_neg"] = P[:, 1]
    df["fb_neu"] = P[:, 2]
    df["fb_score"] = df["fb_pos"] - df["fb_neg"]
    out = f"/content/fnspid_{split}_finbert.parquet"
    df.to_parquet(out, index=False)
    aug[split] = out
    print(split, "->", out, "rows:", len(df))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

train -> /content/fnspid_train_finbert.parquet rows: 12323
val -> /content/fnspid_val_finbert.parquet rows: 12186
test -> /content/fnspid_test_finbert.parquet rows: 26833


In [None]:
# Train LR/RF on FinBERT headline features

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

def load_fb(path):
    df = pd.read_parquet(path)
    X = df[["fb_pos","fb_neg","fb_neu","fb_score"]].astype(float).values
    y = df["label"].map({"Down":0,"NoChange":1,"Up":2}).astype(int).values
    return X, y, df

Xtr, ytr, dftr = load_fb("/content/fnspid_train_finbert.parquet")
Xva, yva, dfva = load_fb("/content/fnspid_val_finbert.parquet")
Xte, yte, dfte = load_fb("/content/fnspid_test_finbert.parquet")

def eval_model(clf, name):
    clf.fit(Xtr, ytr)
    pva = clf.predict(Xva); pte = clf.predict(Xte)
    print(f"\n=== {name} ===")
    print(f"Val  Acc: {accuracy_score(yva, pva):.4f}  Macro-F1: {f1_score(yva, pva, average='macro'):.4f}")
    print(f"Test Acc: {accuracy_score(yte, pte):.4f}  Macro-F1: {f1_score(yte, pte, average='macro'):.4f}")
    print("Val report:\n", classification_report(yva, pva, digits=3))
    print("Test report:\n", classification_report(yte, pte, digits=3))

eval_model(LogisticRegression(max_iter=400, n_jobs=-1, solver="lbfgs"), "LogReg (FinBERT headline)")
eval_model(RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42), "RF (FinBERT headline)")



=== LogReg (FinBERT headline) ===
Val  Acc: 0.4448  Macro-F1: 0.2052
Test Acc: 0.4939  Macro-F1: 0.2204
Val report:
               precision    recall  f1-score   support

           0      0.000     0.000     0.000      5967
           1      0.000     0.000     0.000       799
           2      0.445     1.000     0.616      5420

    accuracy                          0.445     12186
   macro avg      0.148     0.333     0.205     12186
weighted avg      0.198     0.445     0.274     12186

Test report:
               precision    recall  f1-score   support

           0      0.000     0.000     0.000     11114
           1      0.000     0.000     0.000      2465
           2      0.494     1.000     0.661     13254

    accuracy                          0.494     26833
   macro avg      0.165     0.333     0.220     26833
weighted avg      0.244     0.494     0.327     26833



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== RF (FinBERT headline) ===
Val  Acc: 0.4561  Macro-F1: 0.3332
Test Acc: 0.4474  Macro-F1: 0.3304
Val report:
               precision    recall  f1-score   support

           0      0.496     0.454     0.474      5967
           1      0.068     0.034     0.045       799
           2      0.446     0.520     0.480      5420

    accuracy                          0.456     12186
   macro avg      0.337     0.336     0.333     12186
weighted avg      0.446     0.456     0.449     12186

Test report:
               precision    recall  f1-score   support

           0      0.417     0.456     0.436     11114
           1      0.107     0.033     0.051      2465
           2      0.492     0.517     0.504     13254

    accuracy                          0.447     26833
   macro avg      0.339     0.336     0.330     26833
weighted avg      0.426     0.447     0.434     26833



In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()

def finbert_scores(texts, batch_size=64, max_len=128):
    probs_all = []
    for i in tqdm(range(0, len(texts), batch_size), desc="FinBERT scoring"):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch, padding=True, truncation=True, max_length=max_len,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model(**enc)
            p = torch.softmax(out.logits, dim=-1).cpu().numpy()
            probs_all.append(p)
    import numpy as np
    return np.vstack(probs_all)


paths = {
    "train": "/content/fnspid_train.parquet",
    "val":   "/content/fnspid_val.parquet",
    "test":  "/content/fnspid_test.parquet",
}

aug = {}

for split, path in paths.items():
    df = pd.read_parquet(path)

    # Headline texts
    headline_texts = df["headline"].fillna("").tolist()
    P_headline = finbert_scores(headline_texts, batch_size=128, max_len=64)

    df["fb_headline_pos"] = P_headline[:, 0]
    df["fb_headline_neg"] = P_headline[:, 1]
    df["fb_headline_neu"] = P_headline[:, 2]
    df["fb_headline_score"] = df["fb_headline_pos"] - df["fb_headline_neg"]

    # Body texts (only non-empty)
    if "body" in df.columns:
        body_sentiment = pd.DataFrame({
            "fb_body_pos": 0.0,
            "fb_body_neg": 0.0,
            "fb_body_neu": 0.0,
            "fb_body_score": 0.0
        }, index=df.index)

        mask = df["body"].notna() & (df["body"].str.len() > 0)
        if mask.any():
            body_texts = df.loc[mask, "body"].tolist()
            P_body = finbert_scores(body_texts, batch_size=64, max_len=256)  # bodies can be longer
            body_sentiment.loc[mask, ["fb_body_pos", "fb_body_neg", "fb_body_neu"]] = P_body
            body_sentiment.loc[mask, "fb_body_score"] = body_sentiment.loc[mask, "fb_body_pos"] - body_sentiment.loc[mask, "fb_body_neg"]

        # Assign body sentiment columns
        for c in body_sentiment.columns:
            df[c] = body_sentiment[c]

    out = f"/content/fnspid_{split}_finbert_headline_body.parquet"
    df.to_parquet(out, index=False)
    aug[split] = out
    print(split, "->", out, "rows:", len(df))


FinBERT scoring: 100%|██████████| 97/97 [00:35<00:00,  2.75it/s]
FinBERT scoring: 100%|██████████| 52/52 [00:57<00:00,  1.11s/it]


train -> /content/fnspid_train_finbert_headline_body.parquet rows: 12323


FinBERT scoring: 100%|██████████| 96/96 [00:32<00:00,  2.96it/s]
FinBERT scoring: 100%|██████████| 191/191 [03:34<00:00,  1.12s/it]


val -> /content/fnspid_val_finbert_headline_body.parquet rows: 12186


FinBERT scoring: 100%|██████████| 210/210 [01:08<00:00,  3.06it/s]
FinBERT scoring: 100%|██████████| 420/420 [07:51<00:00,  1.12s/it]


test -> /content/fnspid_test_finbert_headline_body.parquet rows: 26833


In [None]:
import pandas as pd
import numpy as np

paths = {
    "train": "/content/fnspid_train_finbert_headline_body.parquet",
    "val":   "/content/fnspid_val_finbert_headline_body.parquet",
    "test":  "/content/fnspid_test_finbert_headline_body.parquet",
}

def load_finbert_feats(path):
    df = pd.read_parquet(path)
    label_map = {"Down": 0, "NoChange": 1, "Up": 2}
    y = df["label"].map(label_map).astype(int).values

    # Defining headline and body sentiment feature columns
    cols = [
        "fb_headline_pos", "fb_headline_neg", "fb_headline_neu", "fb_headline_score",
        "fb_body_pos", "fb_body_neg", "fb_body_neu", "fb_body_score"
    ]

    # Ensure missing body features are filled with zero (in case of missing columns)
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0
        else:
            df[c] = df[c].fillna(0.0)

    X = df[cols].astype(float).values
    return X, y, df

# Load train, validation, and test sets
Xtr, ytr, dftr = load_finbert_feats(paths["train"])
Xva, yva, dfva = load_finbert_feats(paths["val"])
Xte, yte, dfte = load_finbert_feats(paths["test"])

print("Feature shapes:")
print("Train:", Xtr.shape, "Val:", Xva.shape, "Test:", Xte.shape)


Feature shapes:
Train: (12323, 8) Val: (12186, 8) Test: (26833, 8)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

def eval_model(clf, Xtr, ytr, Xva, yva, Xte, yte, name):
    clf.fit(Xtr, ytr)
    pva = clf.predict(Xva)
    pte = clf.predict(Xte)
    val_acc = accuracy_score(yva, pva)
    val_f1 = f1_score(yva, pva, average='macro')
    test_acc = accuracy_score(yte, pte)
    test_f1 = f1_score(yte, pte, average='macro')
    print(f"\n=== {name} ===")
    print(f"Val  Acc: {val_acc:.4f}  Macro-F1: {val_f1:.4f}")
    print(f"Test Acc: {test_acc:.4f}  Macro-F1: {test_f1:.4f}")
    print("Val report:\n", classification_report(yva, pva, digits=3))
    print("Test report:\n", classification_report(yte, pte, digits=3))

# Initialize models
lr = LogisticRegression(max_iter=400, n_jobs=-1, solver="lbfgs")
rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)

eval_model(lr, Xtr, ytr, Xva, yva, Xte, yte, "LogReg (FinBERT headline+body)")
eval_model(rf, Xtr, ytr, Xva, yva, Xte, yte, "RF (FinBERT headline+body)")



=== LogReg (FinBERT headline+body) ===
Val  Acc: 0.4503  Macro-F1: 0.2791
Test Acc: 0.4775  Macro-F1: 0.3022
Val report:
               precision    recall  f1-score   support

           0      0.478     0.181     0.263      5967
           1      0.000     0.000     0.000       799
           2      0.444     0.813     0.574      5420

    accuracy                          0.450     12186
   macro avg      0.307     0.331     0.279     12186
weighted avg      0.432     0.450     0.384     12186

Test report:
               precision    recall  f1-score   support

           0      0.417     0.239     0.304     11114
           1      0.000     0.000     0.000      2465
           2      0.496     0.766     0.603     13254

    accuracy                          0.478     26833
   macro avg      0.304     0.335     0.302     26833
weighted avg      0.418     0.478     0.424     26833



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== RF (FinBERT headline+body) ===
Val  Acc: 0.4652  Macro-F1: 0.3215
Test Acc: 0.4537  Macro-F1: 0.3175
Val report:
               precision    recall  f1-score   support

           0      0.488     0.466     0.477      5967
           1      0.111     0.001     0.002       799
           2      0.445     0.532     0.485      5420

    accuracy                          0.465     12186
   macro avg      0.348     0.333     0.322     12186
weighted avg      0.444     0.465     0.449     12186

Test report:
               precision    recall  f1-score   support

           0      0.413     0.497     0.452     11114
           1      0.105     0.002     0.003      2465
           2      0.495     0.501     0.498     13254

    accuracy                          0.454     26833
   macro avg      0.338     0.333     0.318     26833
weighted avg      0.425     0.454     0.433     26833



EXPERIMENT 2 START

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

# Load parquet data
train_df = pd.read_parquet("/content/fnspid_train.parquet")
val_df = pd.read_parquet("/content/fnspid_val.parquet")
test_df = pd.read_parquet("/content/fnspid_test.parquet")

# Combine headline and body (use headline only if body missing)
def combine_text(row):
    if pd.isna(row['body']) or row['body'].strip() == "":
        return row['headline']
    else:
        return row['headline'] + " " + row['body']

train_df['text'] = train_df.apply(combine_text, axis=1)
val_df['text'] = val_df.apply(combine_text, axis=1)
test_df['text'] = test_df.apply(combine_text, axis=1)

# Encode labels
label_encoder = LabelEncoder()
train_df['label_enc'] = label_encoder.fit_transform(train_df['label'])
val_df['label_enc'] = label_encoder.transform(val_df['label'])
test_df['label_enc'] = label_encoder.transform(test_df['label'])

# Convert to Hugging Face Datasets for easy tokenizer mapping
train_dataset = Dataset.from_pandas(train_df[['text', 'label_enc']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label_enc']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label_enc']])

print("Datasets prepared:")
print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))
print("Test size:", len(test_dataset))


Datasets prepared:
Train size: 12323
Validation size: 12186
Test size: 26833


In [None]:
from transformers import AutoTokenizer

model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

# Apply tokenization to datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("label_enc", "labels")
val_dataset = val_dataset.rename_column("label_enc", "labels")
test_dataset = test_dataset.rename_column("label_enc", "labels")

# Set dataset format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


print("Datasets tokenized and formatted for model input")


Map:   0%|          | 0/12323 [00:00<?, ? examples/s]

Map:   0%|          | 0/12186 [00:00<?, ? examples/s]

Map:   0%|          | 0/26833 [00:00<?, ? examples/s]

Datasets tokenized and formatted for model input


In [None]:
import transformers
print(transformers.__version__)


4.57.2


In [None]:

import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the model and tokenizer for FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define a compute_metrics function
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1).numpy()
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {"accuracy": acc, "macro_f1": macro_f1}

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./test_output",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    report_to=None,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,

    tokenizer=tokenizer,
)

# Start training
train_results = trainer.train()

# Save the model
trainer.save_model("./finbert_trained_model")

# Optionally, save the tokenizer
tokenizer.save_pretrained("./finbert_trained_model")

# Evaluate on validation and test sets
print("\nEvaluating on validation set:")
val_results = trainer.evaluate(eval_dataset=val_dataset)

print("\nEvaluating on test set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Print evaluation results
print("\nValidation Results:")
print(val_results)

print("\nTest Results:")
print(test_results)

# Save results
val_results_df = pd.DataFrame([val_results])
test_results_df = pd.DataFrame([test_results])

val_results_df.to_csv("/content/val_results.csv", index=False)
test_results_df.to_csv("/content/test_results.csv", index=False)

print("\nResults saved!")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Macro F1
500,0.9453,0.893803,0.444609,0.20587



Evaluating on validation set:



Evaluating on test set:

Validation Results:
{'eval_loss': 0.8943067789077759, 'eval_accuracy': 0.44493681273592645, 'eval_macro_f1': 0.20547975745877925, 'eval_runtime': 179.3305, 'eval_samples_per_second': 67.953, 'eval_steps_per_second': 1.065, 'epoch': 1.0}

Test Results:
{'eval_loss': 0.9444178342819214, 'eval_accuracy': 0.49394402414936833, 'eval_macro_f1': 0.22058902157917779, 'eval_runtime': 399.2165, 'eval_samples_per_second': 67.214, 'eval_steps_per_second': 1.052, 'epoch': 1.0}

Results saved!


In [None]:

import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch

# Load the model an tokenizer for FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define a compute_metrics function
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1).numpy()
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {"accuracy": acc, "macro_f1": macro_f1}

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./finbert_finetune",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=500,
    report_to=None,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)


# Start training
train_results = trainer.train()

# Save the model
trainer.save_model("./finbert_trained_model")

# Optionally, save the tokenizer
tokenizer.save_pretrained("./finbert_trained_model")

# Evaluate on validation and test sets
print("\nEvaluating on validation set:")
val_results = trainer.evaluate(eval_dataset=val_dataset)

print("\nEvaluating on test set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Print evaluation results
print("\nValidation Results:")
print(val_results)

print("\nTest Results:")
print(test_results)

# Save results
val_results_df = pd.DataFrame([val_results])
test_results_df = pd.DataFrame([test_results])

val_results_df.to_csv("/content/val_results.csv", index=False)
test_results_df.to_csv("/content/test_results.csv", index=False)

print("\nResults saved!")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,Macro F1
500,1.0528,0.892016,0.447973,0.259288
1000,0.9209,0.892935,0.459544,0.307125
1500,0.9101,0.896711,0.452322,0.302426



Evaluating on validation set:



Evaluating on test set:

Validation Results:
{'eval_loss': 0.8920161724090576, 'eval_accuracy': 0.4479730838667323, 'eval_macro_f1': 0.2592881750346259, 'eval_runtime': 177.302, 'eval_samples_per_second': 68.73, 'eval_steps_per_second': 1.077, 'epoch': 1.9455252918287937}

Test Results:
{'eval_loss': 0.9388952851295471, 'eval_accuracy': 0.481273059292662, 'eval_macro_f1': 0.26557092148256684, 'eval_runtime': 397.5543, 'eval_samples_per_second': 67.495, 'eval_steps_per_second': 1.056, 'epoch': 1.9455252918287937}

Results saved!


In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, AutoConfig
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score

# Compute class weights from training labels to handle imbalance
train_labels = np.array(train_dataset["labels"])
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(train_labels), y=train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")

class WeightedFinBERT(AutoModelForSequenceClassification):
    def __init__(self, config, class_weights=None):
        super().__init__(config)
        self.class_weights = class_weights

    def forward(self, **inputs):
        labels = inputs.get("labels")
        outputs = super().forward(**inputs)
        logits = outputs.logits
        loss = None
        if labels is not None and self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits) if loss is not None else logits

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, class_weights=None, **kwargs):
        # Load model with pretrained weights using parent method
        model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Attach class weights to model instance
        model.class_weights = class_weights
        return model

# Load tokenizer as usual
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize model with class weights by calling new from_pretrained
model = WeightedFinBERT.from_pretrained(model_name, num_labels=3, class_weights=class_weights)

# Define compute_metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1).cpu().numpy()
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {"accuracy": acc, "macro_f1": macro_f1}

# Setup TrainingArguments (same as before)
training_args = TrainingArguments(
    output_dir="./finbert_finetune",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=500,
    report_to=None,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

# Start training
train_results = trainer.train()

# Save model and tokenizer
trainer.save_model("./finbert_trained_model")
tokenizer.save_pretrained("./finbert_trained_model")

# Evaluate on validation and test sets
val_results = trainer.evaluate(eval_dataset=val_dataset)
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("Validation Results:", val_results)
print("Test Results:", test_results)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,Macro F1
500,1.0528,0.892016,0.447973,0.259288
1000,0.9213,0.892765,0.463319,0.308435
1500,0.9117,0.894157,0.462744,0.317169
2000,0.8773,0.921595,0.488511,0.253371
2500,0.8085,0.952096,0.478172,0.320457
3000,0.6903,0.985327,0.468242,0.323524
3500,0.5783,1.120389,0.476448,0.320093


Validation Results: {'eval_loss': 0.8920161724090576, 'eval_accuracy': 0.4479730838667323, 'eval_macro_f1': 0.2592881750346259, 'eval_runtime': 177.3566, 'eval_samples_per_second': 68.709, 'eval_steps_per_second': 1.077, 'epoch': 4.539559014267185}
Test Results: {'eval_loss': 0.9388952851295471, 'eval_accuracy': 0.481273059292662, 'eval_macro_f1': 0.26557092148256684, 'eval_runtime': 397.1863, 'eval_samples_per_second': 67.558, 'eval_steps_per_second': 1.057, 'epoch': 4.539559014267185}


INCLUDING FULL TEXT

In [None]:
# Install newspaper3k if not installed
!pip install newspaper3k
!pip install lxml[html_clean]

from newspaper import Article
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os

news_csv_path = '/content/drive/MyDrive/NLP_Project/fnspid_news_2018_2024_subset.parquet'
checkpoint_path = '/content/drive/MyDrive/NLP_Project/fnspid_news_fulltext_checkpoint.parquet'

# Load base news dataset
news_df = pd.read_parquet(news_csv_path)
urls = news_df['Url'].tolist()

# If checkpoint exists, load full_text column to resume
if os.path.exists(checkpoint_path):
    saved_df = pd.read_parquet(checkpoint_path)
    # Initialize results array from saved data
    results = saved_df['full_text'].tolist()
    print(f"Loaded checkpoint with {sum([r is not None for r in results])} articles already fetched.")
else:
    # Initialize results list with None
    results = [None] * len(urls)

def fetch_full_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception:
        return None

def fetch_wrapper(idx_url):
    idx, url = idx_url
    # Skip URLs already processed
    if results[idx] is not None:
        return idx, results[idx]
    return idx, fetch_full_text(url)

checkpoint_interval = 1000  # save every 1000 articles

with ThreadPoolExecutor(max_workers=20) as executor:
    futures = {executor.submit(fetch_wrapper, iu): iu[0] for iu in enumerate(urls)}
    for i, future in enumerate(tqdm(as_completed(futures), total=len(urls)), start=1):
        idx, text = future.result()
        results[idx] = text

        if i % checkpoint_interval == 0:
            print(f"Saving checkpoint at {i} articles")
            temp_df = news_df.copy()
            temp_df['full_text'] = results
            temp_df.to_parquet(checkpoint_path, index=False)

# Final save after all done
news_df['full_text'] = results
news_df.to_parquet(checkpoint_path, index=False)
print("Scraping completed and saved.")


news_df['full_text'] = results

news_df.to_parquet('/content/drive/MyDrive/NLP_Project/fnspid_news_fulltext.parquet', index=False)


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m55.9 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ..

  2%|▏         | 976/56506 [00:07<01:46, 520.20it/s]

Saving checkpoint at 1000 articles


  2%|▏         | 1030/56506 [00:09<13:48, 66.96it/s]

Saving checkpoint at 2000 articles


  4%|▎         | 2000/56506 [00:12<03:34, 253.56it/s]

Saving checkpoint at 3000 articles


  5%|▌         | 3000/56506 [00:14<02:44, 325.48it/s]

Saving checkpoint at 4000 articles


  7%|▋         | 4000/56506 [00:17<02:33, 341.28it/s]

Saving checkpoint at 5000 articles


 10%|█         | 5770/56506 [00:21<02:03, 412.43it/s]

Saving checkpoint at 6000 articles


 12%|█▏        | 6976/56506 [00:42<15:22, 53.68it/s]

Saving checkpoint at 7000 articles


 14%|█▍        | 7995/56506 [00:49<04:11, 192.58it/s]

Saving checkpoint at 8000 articles


 16%|█▌        | 8983/56506 [00:55<04:19, 183.31it/s]

Saving checkpoint at 9000 articles


 18%|█▊        | 9970/56506 [01:03<04:47, 162.09it/s]

Saving checkpoint at 10000 articles


 19%|█▉        | 10995/56506 [01:09<05:02, 150.48it/s]

Saving checkpoint at 11000 articles


 21%|██        | 11992/56506 [01:31<06:14, 118.95it/s]

Saving checkpoint at 12000 articles


 23%|██▎       | 12991/56506 [01:36<03:28, 208.38it/s]

Saving checkpoint at 13000 articles


 25%|██▍       | 13999/56506 [04:13<2:00:15,  5.89it/s]

Saving checkpoint at 14000 articles


 27%|██▋       | 14998/56506 [08:42<3:19:06,  3.47it/s]

Saving checkpoint at 15000 articles


 28%|██▊       | 15998/56506 [13:02<3:03:33,  3.68it/s]

Saving checkpoint at 16000 articles


 30%|███       | 16998/56506 [17:16<3:08:51,  3.49it/s]

Saving checkpoint at 17000 articles


 32%|███▏      | 17999/56506 [21:22<2:01:40,  5.27it/s]

Saving checkpoint at 18000 articles


 34%|███▎      | 18999/56506 [25:51<3:55:17,  2.66it/s]

Saving checkpoint at 19000 articles


 35%|███▌      | 19999/56506 [30:14<2:32:38,  3.99it/s]

Saving checkpoint at 20000 articles


 37%|███▋      | 20999/56506 [34:24<3:06:23,  3.17it/s]

Saving checkpoint at 21000 articles


 39%|███▉      | 21997/56506 [38:51<2:10:09,  4.42it/s]

Saving checkpoint at 22000 articles


 41%|████      | 22999/56506 [43:31<2:55:53,  3.17it/s]

Saving checkpoint at 23000 articles


 42%|████▏     | 23997/56506 [48:37<1:27:46,  6.17it/s]

Saving checkpoint at 24000 articles


 44%|████▍     | 24999/56506 [53:22<2:33:43,  3.42it/s]

Saving checkpoint at 25000 articles


 46%|████▌     | 25999/56506 [58:20<2:34:09,  3.30it/s]

Saving checkpoint at 26000 articles


 48%|████▊     | 26999/56506 [1:03:13<2:02:05,  4.03it/s]

Saving checkpoint at 27000 articles


 50%|████▉     | 27997/56506 [1:08:21<3:05:47,  2.56it/s]

Saving checkpoint at 28000 articles


 51%|█████▏    | 28997/56506 [1:13:04<1:41:43,  4.51it/s]

Saving checkpoint at 29000 articles


 53%|█████▎    | 29999/56506 [1:18:14<1:22:04,  5.38it/s]

Saving checkpoint at 30000 articles


 55%|█████▍    | 30999/56506 [1:22:53<1:57:17,  3.62it/s]

Saving checkpoint at 31000 articles


 57%|█████▋    | 31999/56506 [1:26:57<1:26:07,  4.74it/s]

Saving checkpoint at 32000 articles


 58%|█████▊    | 32999/56506 [1:30:56<1:00:22,  6.49it/s]

Saving checkpoint at 33000 articles


 60%|██████    | 33999/56506 [1:35:05<1:23:36,  4.49it/s]

Saving checkpoint at 34000 articles


 62%|██████▏   | 34999/56506 [1:38:58<1:40:22,  3.57it/s]

Saving checkpoint at 35000 articles


 64%|██████▎   | 35999/56506 [1:42:48<37:04,  9.22it/s]

Saving checkpoint at 36000 articles


 65%|██████▌   | 36999/56506 [1:46:27<1:02:01,  5.24it/s]

Saving checkpoint at 37000 articles


 67%|██████▋   | 37999/56506 [1:49:28<44:18,  6.96it/s]

Saving checkpoint at 38000 articles


 69%|██████▉   | 38998/56506 [1:53:00<58:27,  4.99it/s]  

Saving checkpoint at 39000 articles


 71%|███████   | 39998/56506 [1:56:25<38:06,  7.22it/s]

Saving checkpoint at 40000 articles


 73%|███████▎  | 40998/56506 [2:00:37<25:52,  9.99it/s]

Saving checkpoint at 41000 articles


 74%|███████▍  | 41999/56506 [2:04:43<32:00,  7.55it/s]

Saving checkpoint at 42000 articles


 76%|███████▌  | 42998/56506 [2:08:48<1:13:17,  3.07it/s]

Saving checkpoint at 43000 articles


 78%|███████▊  | 43996/56506 [2:12:27<18:42, 11.15it/s]

Saving checkpoint at 44000 articles


 80%|███████▉  | 44999/56506 [2:15:03<1:24:00,  2.28it/s]

Saving checkpoint at 45000 articles


 81%|████████▏ | 45998/56506 [2:18:18<1:21:06,  2.16it/s]

Saving checkpoint at 46000 articles


 83%|████████▎ | 46999/56506 [2:22:50<1:17:50,  2.04it/s]

Saving checkpoint at 47000 articles


 85%|████████▍ | 47998/56506 [2:28:26<46:06,  3.08it/s]

Saving checkpoint at 48000 articles


 87%|████████▋ | 48999/56506 [2:33:32<47:13,  2.65it/s]

Saving checkpoint at 49000 articles


 88%|████████▊ | 49999/56506 [2:38:59<46:20,  2.34it/s]

Saving checkpoint at 50000 articles


 90%|█████████ | 50999/56506 [2:44:04<26:56,  3.41it/s]

Saving checkpoint at 51000 articles


 92%|█████████▏| 51999/56506 [2:49:02<23:00,  3.26it/s]

Saving checkpoint at 52000 articles


 94%|█████████▍| 52998/56506 [2:54:14<36:55,  1.58it/s]

Saving checkpoint at 53000 articles


 96%|█████████▌| 53999/56506 [2:58:23<02:37, 15.90it/s]

Saving checkpoint at 54000 articles


 97%|█████████▋| 54999/56506 [3:01:24<02:32,  9.87it/s]

Saving checkpoint at 55000 articles


 99%|█████████▉| 55997/56506 [3:03:49<00:53,  9.58it/s]

Saving checkpoint at 56000 articles


100%|██████████| 56506/56506 [3:04:58<00:00,  5.09it/s]


Scraping completed and saved.


In [None]:

from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Paths to labeled splits
train_path = "/content/fnspid_train.parquet"
val_path   = "/content/fnspid_val.parquet"
test_path  = "/content/fnspid_test.parquet"

# Path to full-text news
fulltext_path = "/content/drive/MyDrive/NLP_Project/fnspid_news_fulltext.parquet"

# Load labeled splits
train_df = pd.read_parquet(train_path)
val_df   = pd.read_parquet(val_path)
test_df  = pd.read_parquet(test_path)

print("Original split sizes:",
      len(train_df), len(val_df), len(test_df))

# Load full-text data and keep only the columns we need
fulltext_df = pd.read_parquet(fulltext_path)

# In case there are duplicate URLs in the fulltext file, keep first
fulltext_df = fulltext_df.drop_duplicates(subset=["Url"], keep="first")

# Merge full_text into each split based on Url (article-level key)
def merge_fulltext(split_df):
    merged = split_df.merge(
        fulltext_df[["Url", "full_text"]],
        on="Url",
        how="left"
    )
    # sanity check: no row explosion
    assert len(merged) == len(split_df), "Merge changed number of rows!"
    return merged

train_df = merge_fulltext(train_df)
val_df   = merge_fulltext(val_df)
test_df  = merge_fulltext(test_df)

print("After merge split sizes:",
      len(train_df), len(val_df), len(test_df))

# For samples missing full_text, fall back to headline (or headline+body)
def get_text(row):
    if pd.isna(row["full_text"]) or str(row["full_text"]).strip() == "":
        # Optional: you can use headline+body here if you prefer:
        # return (row["headline"] or "") + " " + (row.get("body") or "")
        return row["headline"]
    else:
        return row["full_text"]

train_df["text"] = train_df.apply(get_text, axis=1)
val_df["text"]   = val_df.apply(get_text, axis=1)
test_df["text"]  = test_df.apply(get_text, axis=1)

# Encode labels to integers
label_encoder = LabelEncoder()
train_df["label_enc"] = label_encoder.fit_transform(train_df["label"])
val_df["label_enc"]   = label_encoder.transform(val_df["label"])
test_df["label_enc"]  = label_encoder.transform(test_df["label"])

print("Label classes:", label_encoder.classes_)

# Build Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df[["text", "label_enc"]])
val_dataset   = Dataset.from_pandas(val_df[["text", "label_enc"]])
test_dataset  = Dataset.from_pandas(test_df[["text", "label_enc"]])

# Load FinBERT tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,  # full-text: use max sequence length
    )

# Tokenize
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Rename label column for Trainer
train_dataset = train_dataset.rename_column("label_enc", "labels")
val_dataset   = val_dataset.rename_column("label_enc", "labels")
test_dataset  = test_dataset.rename_column("label_enc", "labels")

# Set PyTorch format
cols = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=cols)
val_dataset.set_format(type="torch", columns=cols)
test_dataset.set_format(type="torch", columns=cols)

print("Final HF dataset sizes:",
      len(train_dataset), len(val_dataset), len(test_dataset))


Original split sizes: 12323 12186 26833
After merge split sizes: 12323 12186 26833
Label classes: ['Down' 'NoChange' 'Up']


Map:   0%|          | 0/12323 [00:00<?, ? examples/s]

Map:   0%|          | 0/12186 [00:00<?, ? examples/s]

Map:   0%|          | 0/26833 [00:00<?, ? examples/s]

Final HF dataset sizes: 12323 12186 26833


In [None]:
# View sample rows from train dataset
print("Sample rows from train_dataset:")
print(train_dataset.select(range(5)).to_pandas())

# View sample rows from validation dataset
print("\nSample rows from val_dataset:")
print(val_dataset.select(range(5)).to_pandas())

# View sample rows from test dataset
print("\nSample rows from test_dataset:")
print(test_dataset.select(range(5)).to_pandas())


Sample rows from train_dataset:
                                                text  labels  \
0              The 25 Best-Performing Stocks of 2021       2   
1  Technology investors experienced a tale of "tw...       2   
2  The New Year is here, and you are asking yours...       2   
3  Are you still on the hunt for some core long-t...       2   
4  Whilst cryptocurrencies and NFTs have taken ce...       0   

                                           input_ids  \
0  [101, 1996, 2423, 2190, 1011, 4488, 15768, 199...   
1  [101, 2974, 9387, 5281, 1037, 6925, 1997, 1000...   
2  [101, 1996, 2047, 2095, 2003, 2182, 1010, 1998...   
3  [101, 2024, 2017, 2145, 2006, 1996, 5690, 2005...   
4  [101, 5819, 19888, 10085, 3126, 7389, 9243, 19...   

                                      token_type_ids  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 0, 

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!pip install --upgrade transformers



In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import evaluate
import numpy as np
import os

os.environ["WANDB_DISABLED"] = "true"

model_name = "ProsusAI/finbert"

# 1) Load FinBERT with 3 output labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 2) Metrics: accuracy + macro-F1
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
    return {"accuracy": acc, "macro_f1": macro_f1}

# 3) Training arguments  — use **eval_strategy** for this version
training_args = TrainingArguments(
    output_dir="./finbert_finetuned_fulltext",
    eval_strategy="epoch",          # <-- this is the key fix
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # lower to 8 or 4 if CUDA OOM
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",   # or "accuracy" if you prefer
    logging_dir="./logs_fulltext",
    logging_steps=100,
    seed=42,
    report_to=None,   # no wandb / tb
)

# 4) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # full-text dataset we just built
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 5) Train
train_result = trainer.train()

# 6) Evaluate on test set
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set metrics:")
for key, value in test_metrics.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.9274,0.94,0.444773,0.205233
2,0.9159,0.914049,0.462908,0.318964
3,0.8727,0.999528,0.460118,0.307093
4,0.7863,1.191406,0.459298,0.31607


Test set metrics:
eval_loss: 0.9486
eval_accuracy: 0.4663
eval_macro_f1: 0.3211
eval_runtime: 794.1650
eval_samples_per_second: 33.7880
eval_steps_per_second: 1.0560
epoch: 4.0000


Experiment2 Full article 512 no limit and finetuned finbert

In [None]:
import pandas as pd
import numpy as np

# Paths
train_path = "/content/fnspid_train.parquet"
val_path   = "/content/fnspid_val.parquet"
test_path  = "/content/fnspid_test.parquet"
fulltext_path = "/content/drive/MyDrive/NLP_Project/fnspid_news_fulltext.parquet"

# Load splits
train_df = pd.read_parquet(train_path)
val_df   = pd.read_parquet(val_path)
test_df  = pd.read_parquet(test_path)

print("Original split sizes:",
      len(train_df), len(val_df), len(test_df))

# Load fulltext + dedupe on Url
fulltext_df = pd.read_parquet(fulltext_path)
fulltext_df = fulltext_df.drop_duplicates(subset=["Url"], keep="first")

def merge_fulltext(split_df, name):
    merged = split_df.merge(
        fulltext_df[["Url", "full_text"]],
        on="Url",
        how="left"
    )
    assert len(merged) == len(split_df), f"{name}: merge changed number of rows!"
    print(f"{name}: after merge = {len(merged)} rows")
    # basic label sanity
    print(f"{name}: labels value_counts:\n", merged["label"].value_counts(dropna=False))
    # full_text availability
    has_full = merged["full_text"].notna() & (merged["full_text"].str.strip() != "")
    print(f"{name}: full_text present for {has_full.sum()} / {len(merged)} rows "
          f"({has_full.mean():.3%})")
    return merged

train_m = merge_fulltext(train_df, "train")
val_m   = merge_fulltext(val_df,   "val")
test_m  = merge_fulltext(test_df,  "test")

# Define final text logic (same as finetuning cell)
def get_text(row):
    if pd.isna(row["full_text"]) or str(row["full_text"]).strip() == "":
        return row["headline"]
    else:
        return row["full_text"]

for name, df in [("train", train_m), ("val", val_m), ("test", test_m)]:
    df["text"] = df.apply(get_text, axis=1)
    # Check for any missing text
    missing_text = df["text"].isna() | (df["text"].astype(str).str.strip() == "")
    print(f"{name}: rows with EMPTY final text = {missing_text.sum()}")

# Spot-check a few random rows from train
print("\nSpot check examples from train:")
sample = train_m.sample(5, random_state=42)

for i, row in sample.iterrows():
    print("-" * 60)
    print(f"Idx: {i}")
    print(f"Ticker:      {row.get('ticker')}")
    print(f"Date:        {row.get('publish_date')}")
    print(f"Url:         {row.get('Url')}")
    print(f"Label:       {row.get('label')}")
    print(f"Headline:    {row.get('headline')[:120] if isinstance(row.get('headline'), str) else row.get('headline')}")
    if isinstance(row.get('full_text'), str):
        print(f"Full_text:   {row['full_text'][:200].replace('\\n',' ')}...")
    else:
        print("Full_text:   <MISSING, will fall back to headline>")


Original split sizes: 12323 12186 26833
train: after merge = 12323 rows
train: labels value_counts:
 label
Up          5896
Down        5411
NoChange    1016
Name: count, dtype: int64
train: full_text present for 9034 / 12323 rows (73.310%)
val: after merge = 12186 rows
val: labels value_counts:
 label
Down        5967
Up          5420
NoChange     799
Name: count, dtype: int64
val: full_text present for 5299 / 12186 rows (43.484%)
test: after merge = 26833 rows
test: labels value_counts:
 label
Up          13254
Down        11114
NoChange     2465
Name: count, dtype: int64
test: full_text present for 14879 / 26833 rows (55.450%)
train: rows with EMPTY final text = 0
val: rows with EMPTY final text = 0
test: rows with EMPTY final text = 0

Spot check examples from train:
------------------------------------------------------------
Idx: 9778
Ticker:      JPM
Date:        2020-05-08
Url:         http://www.gurufocus.com/news/1131263/connable-office-inc-buys-ishares-20-year-treasury-bond-

In [None]:
import torch
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder

device = "cuda" if torch.cuda.is_available() else "cpu"

# Encode labels
label_encoder = LabelEncoder()
train_m["label_enc"] = label_encoder.fit_transform(train_m["label"])
val_m["label_enc"]   = label_encoder.transform(val_m["label"])
test_m["label_enc"]  = label_encoder.transform(test_m["label"])

print("Label classes:", label_encoder.classes_)

model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

MAX_LEN = 512
STRIDE = 128  # overlap between chunks

def chunk_encode(text):
    enc = tokenizer(
        text,
        truncation=False,
        padding=False,
        return_attention_mask=True,
        return_tensors=None,
    )
    input_ids = enc["input_ids"]
    attention_mask = enc["attention_mask"]

    chunks_input_ids = []
    chunks_attention_mask = []

    start = 0
    while start < len(input_ids):
        end = start + MAX_LEN
        window_ids = input_ids[start:end]
        window_mask = attention_mask[start:end]

        pad_len = MAX_LEN - len(window_ids)
        if pad_len > 0:
            window_ids = window_ids + [tokenizer.pad_token_id] * pad_len
            window_mask = window_mask + [0] * pad_len

        chunks_input_ids.append(window_ids)
        chunks_attention_mask.append(window_mask)

        if end >= len(input_ids):
            break
        start = end - STRIDE

    return {
        "input_ids": chunks_input_ids,
        "attention_mask": chunks_attention_mask,
    }

class ChunkedNewsDataset(TorchDataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df["label_enc"].tolist()

        self.chunk_input_ids = []
        self.chunk_attention_mask = []
        self.chunk_labels = []
        self.article_ids = []

        for art_id, (txt, lab) in enumerate(zip(self.texts, self.labels)):
            encoded = chunk_encode(txt)
            for ci, am in zip(encoded["input_ids"], encoded["attention_mask"]):
                self.chunk_input_ids.append(ci)
                self.chunk_attention_mask.append(am)
                self.chunk_labels.append(lab)
                self.article_ids.append(art_id)

    def __len__(self):
        return len(self.chunk_input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.chunk_input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.chunk_attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.chunk_labels[idx], dtype=torch.long),
            "article_id": torch.tensor(self.article_ids[idx], dtype=torch.long),
        }

train_chunked = ChunkedNewsDataset(train_m)
val_chunked   = ChunkedNewsDataset(val_m)
test_chunked  = ChunkedNewsDataset(test_m)

print("Num chunks - train/val/test:",
      len(train_chunked), len(val_chunked), len(test_chunked))


Label classes: ['Down' 'NoChange' 'Up']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1147 > 512). Running this sequence through the model will result in indexing errors


Num chunks - train/val/test: 23258 23982 58263


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np

os.environ["WANDB_DISABLED"] = "true"

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
    return {"accuracy": acc, "macro_f1": macro_f1}

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)

training_args = TrainingArguments(
    output_dir="./finbert_fulltext_chunked",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="loss",   # use loss for best-model selection
    greater_is_better=False,
    logging_dir="./logs_fulltext_chunked",
    logging_steps=200,
    report_to=None,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_chunked,
    eval_dataset=val_chunked,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.8926,0.947575,0.445,0.210881
2,0.8736,0.952437,0.450463,0.253406
3,0.7739,1.111088,0.466475,0.316862
4,0.7217,1.32361,0.457343,0.31981


In [None]:
from torch.utils.data import DataLoader

model.eval()

def article_level_eval(dataset):
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    all_logits = []
    all_labels = []
    all_article_ids = []

    for batch in loader:
        article_ids = batch.pop("article_id")
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            logits = outputs.logits.detach().cpu().numpy()
        labels = batch["labels"].cpu().numpy()

        all_logits.append(logits)
        all_labels.append(labels)
        all_article_ids.append(article_ids.numpy())

    all_logits = np.concatenate(all_logits, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    all_article_ids = np.concatenate(all_article_ids, axis=0)

    article_to_logits = {}
    article_to_labels = {}
    for logit, lab, aid in zip(all_logits, all_labels, all_article_ids):
        if aid not in article_to_logits:
            article_to_logits[aid] = []
            article_to_labels[aid] = lab
        article_to_logits[aid].append(logit)

    article_logits = []
    article_labels = []
    for aid, logits_list in article_to_logits.items():
        article_logits.append(np.mean(logits_list, axis=0))
        article_labels.append(article_to_labels[aid])

    article_logits = np.stack(article_logits, axis=0)
    article_labels = np.array(article_labels)

    preds = np.argmax(article_logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=article_labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=preds, references=article_labels, average="macro")["f1"]
    return acc, macro_f1

print("Article-level validation metrics:")
val_acc, val_f1 = article_level_eval(val_chunked)
print("Val accuracy:", val_acc, "Val macro_F1:", val_f1)

print("\nArticle-level test metrics:")
test_acc, test_f1 = article_level_eval(test_chunked)
print("Test accuracy:", test_acc, "Test macro_F1:", test_f1)


Article-level validation metrics:
Val accuracy: 0.44698834728376824 Val macro_F1: 0.21669970954361814

Article-level test metrics:
Test accuracy: 0.49293780046957103 Test macro_F1: 0.22475868356694476
