In [None]:
%pip install -qq pandas datasets scikit-learn transformers numpy pandas torch evaluate transformers[torch] accelerate psycopg2-binary

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate, numpy as np, torch


In [3]:
def mapper(p):
    if p < -0.2:
        return 0          # negative
    elif p > 0.2:
        return 2          # positive
    else:
        return 1          # neutral

In [5]:
# ── 1. Fetch data from QuestDB ─────────────────────────────────
conn_params = {
    "host": "quest.amudhan.me",
    "port": 8812,
    "database": "qdb",
    "user": "admin",
    "password": "redacted"
}

try:
    conn = psycopg2.connect(**conn_params)

    query = """
    SELECT * FROM stock_news WHERE time BETWEEN '2005-01-01T00:00:00Z' AND '2024-12-31T23:59:59Z'

    """

    df_test = pd.read_sql_query(query, conn)
    conn.close()

except Exception as e:
    raise Exception(f"QuestDB error: {e}")

# ── 2. Inspect & clean basic types ─────────────────────────────────
# Make sure timestamps are proper datetime objects
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Drop rows with missing content, if any
df = df.dropna(subset=["content"])

# ── 3. (Optional) keep just the columns you’ll feed to a model ─────
cols_for_model = [
    "ticker",
    "time",
    "title",
    "content",
    "sentiment_polarity"   # if you want the pre‑computed label
]
df_model = df[cols_for_model].copy()
# Coerce any non‑numeric value to NaN, keep valid floats
df["sentiment_polarity"] = pd.to_numeric(df["sentiment_polarity"], errors="coerce")

# Drop the rows that became NaN (i.e. were non‑numeric originally)
df = df.dropna(subset=["sentiment_polarity"]).reset_index(drop=True)


# ── 4. Quick sanity check ──────────────────────────────────────────
print(df_model.head())
print(df_model.dtypes)


  df = pd.read_csv('full_stock_news.csv')


  ticker                      time  \
0    SYY 2011-05-09 00:00:00+00:00   
1    BBY 2011-06-14 00:00:00+00:00   
2   SNEX 2012-01-29 00:00:00+00:00   
3    JEF 2012-03-05 00:00:00+00:00   
4    JEF 2012-06-04 00:00:00+00:00   

                                            title  \
0          RIMM Hung Up, Dollar Thrifty Gassed Up   
1  Best Buy Rallies as Dollar Thrifty Gets a Flat   
2               \nTamara Walsh  |  Jan 29, 2012\n   
3            Would Warren Buffett Buy Apple Now?    
4                  Just Walk Away From Chesapeake   

                                             content  sentiment_polarity  
0  Options quiet but action seen in Research In M...               0.000  
1  Trading midday sees action in Best Buy (NYSE: ...               0.637  
2  Companies gaining market share even in a strug...               0.296  
3  Apple stock has run up by 44% since mid-Decemb...               0.000  
4  Sure it's cheap, but it also has far too many ...              -0.440  
ti

In [6]:
df["label"] = df["sentiment_polarity"].apply(mapper)
print(df["label"].value_counts())

label
2    313958
0     28967
1     27755
Name: count, dtype: int64


In [5]:
from sklearn.utils.class_weight import compute_class_weight

labels = np.array(df["label"])          # [0,1,2]
weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
class_wt = torch.tensor(weights, dtype=torch.float32).to("cuda")
print("class weights:", class_wt)       # e.g. tensor([2.95, 6.45, 1.00])


class weights: tensor([4.2655, 4.4518, 0.3936], device='cuda:0')


In [6]:
train_df, eval_df = train_test_split(df[["content", "label"]],
                                     test_size=0.1, stratify=df["label"], random_state=42)

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "eval" : Dataset.from_pandas(eval_df.reset_index(drop=True))
})

In [7]:
model_name = "EleutherAI/gpt-neo-125M"
tokenizer   = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token                         # GPT needs an explicit pad

def tok_fn(batch):
    return tokenizer(batch["content"],
                     truncation=True,
                     padding="max_length",
                     max_length=256)

ds = ds.map(tok_fn, batched=True, remove_columns=["content"])
ds.set_format("torch")


Map:   0%|          | 0/333612 [00:00<?, ? examples/s]

Map:   0%|          | 0/37068 [00:00<?, ? examples/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.config.pad_token_id = tokenizer.pad_token_id  # let HF know

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments
args = TrainingArguments(
    output_dir="./gptneo125_sentiment",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=50,
    load_best_model_at_end=True,
)
print("✔ TrainingArguments initialised on stable release.")


✔ TrainingArguments initialised on stable release.


In [10]:
import transformers, sys, platform
print("Transformers version ->", transformers.__version__)
print("Python executable   ->", sys.executable)
print("Platform            ->", platform.platform())


Transformers version -> 4.51.3
Python executable   -> /usr/bin/python
Platform            -> Linux-6.8.0-57-generic-x86_64-with-glibc2.35


In [11]:
from transformers import Trainer
import torch.nn.functional as F

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits  = outputs.logits
        loss = F.cross_entropy(logits, labels, weight=class_wt)
        return (loss, outputs) if return_outputs else loss


In [12]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)


In [13]:
import evaluate, numpy as np
f1  = evaluate.load("f1")
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "macro_f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
    }


In [14]:
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset =ds["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # macro-F1 below
)


trainer.train()
trainer.save_model("./gptneo125_sentiment/final")
tokenizer.save_pretrained("./gptneo125_sentiment/final")


  trainer = WeightedTrainer(
  arr = np.array(obj)


  arr = np.array(obj)
  arr = np.array(obj)


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
1,0.3598,0.337664,0.753674,0.87844
2,0.2077,0.296677,0.825481,0.924382
3,0.1145,0.344247,0.852192,0.939382


('./gptneo125_sentiment/final/tokenizer_config.json',
 './gptneo125_sentiment/final/special_tokens_map.json',
 './gptneo125_sentiment/final/vocab.json',
 './gptneo125_sentiment/final/merges.txt',
 './gptneo125_sentiment/final/added_tokens.json',
 './gptneo125_sentiment/final/tokenizer.json')

In [19]:
trainer.save_model("./gptneo125_sentiment/final")
tokenizer.save_pretrained("./gptneo125_sentiment/final")

('./gptneo125_sentiment/final/tokenizer_config.json',
 './gptneo125_sentiment/final/special_tokens_map.json',
 './gptneo125_sentiment/final/vocab.json',
 './gptneo125_sentiment/final/merges.txt',
 './gptneo125_sentiment/final/added_tokens.json',
 './gptneo125_sentiment/final/tokenizer.json')

In [8]:
import psycopg2
import pandas as pd

conn_params = {
    "host": "quest.amudhan.me",
    "port": 8812,
    "database": "qdb",
    "user": "admin",
    "password": "redacted"
}

try:
    conn = psycopg2.connect(**conn_params)

    query = """
    SELECT * FROM stock_news WHERE time BETWEEN '2025-01-01T00:00:00Z' AND '2025-05-15T23:59:59Z'

    """

    df_test = pd.read_sql_query(query, conn)
    conn.close()

except Exception as e:
    raise Exception(f"QuestDB error: {e}")

# Ensure datetime conversion
df_test['time'] = pd.to_datetime(df_test['time'])
df_test.head()


Unnamed: 0,ticker,time,title,content,link,symbols,tags,sentiment_polarity,sentiment_neg,sentiment_neu,sentiment_pos
0,KHC,2025-01-01 00:00:00+00:00,Is Kraft Heinz Stock in Trouble?,Is Kraft Heinz Stock in Trouble?,https://www.fool.com/investing/2025/01/01/is-k...,KHC.US,,-0.402,0.351,0.649,0.0
1,GE,2025-01-01 09:01:43+00:00,General Industrial Machinery Stocks Q3 Results...,General Industrial Machinery Stocks Q3 Results...,https://finance.yahoo.com/news/general-industr...,"GE.MX,GE.US,HI.US,HON.US,IEP.US,LXFR.US",,0.998,0.043,0.817,0.14
2,IEP,2025-01-01 09:01:43+00:00,General Industrial Machinery Stocks Q3 Results...,General Industrial Machinery Stocks Q3 Results...,https://finance.yahoo.com/news/general-industr...,"GE.MX,GE.US,HI.US,HON.US,IEP.US,LXFR.US",,0.998,0.043,0.817,0.14
3,HON,2025-01-01 09:01:43+00:00,General Industrial Machinery Stocks Q3 Results...,General Industrial Machinery Stocks Q3 Results...,https://finance.yahoo.com/news/general-industr...,"GE.MX,GE.US,HI.US,HON.US,IEP.US,LXFR.US",,0.998,0.043,0.817,0.14
4,CMI,2025-01-01 09:02:48+00:00,Heavy Transportation Equipment Stocks Q3 Recap...,Heavy Transportation Equipment Stocks Q3 Recap...,https://finance.yahoo.com/news/heavy-transport...,"0I58.LSE,BLBD.US,CMI.US,CUM.DU,CUM.F,CUM.MU,RE...",,0.999,0.031,0.813,0.156


In [9]:
df_test["label"] = df_test["sentiment_polarity"].apply(mapper)
print(df_test["label"].value_counts())

label
2    32446
0     2381
1     2008
Name: count, dtype: int64


In [10]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix

# ─────────────────── 1.  Load model & helper  ────────────────────
clf_tok  = AutoTokenizer.from_pretrained("./gptneo125_sentiment/final")
clf_model = AutoModelForSequenceClassification.from_pretrained(
               "./gptneo125_sentiment/final"
            ).eval().to("cuda")

label_map       = {0: "negative", 1: "neutral", 2: "positive"}
inverse_map     = {v: k for k, v in label_map.items()}  # handy later

@torch.inference_mode()
def predict_label(text: str) -> tuple[int, float]:
    """Return (numeric_label, confidence_of_chosen_label)."""
    toks = clf_tok(text,
                   return_tensors="pt",
                   truncation=True,
                   padding=True).to("cuda")
    logits = clf_model(**toks).logits
    probs  = torch.softmax(logits, dim=-1)[0]
    idx    = int(probs.argmax())
    return idx, float(probs[idx])

# ─────────────────── 2.  Make ground-truth labels ─────────────────
df_test["label_true"] = df_test["sentiment_polarity"].apply(mapper)

# ─────────────────── 3.  Run predictions  ────────────────────────
y_true, y_pred, y_conf = [], [], []      # collect for metrics & analysis

for text in df_test["content"]:
    pred_lbl, conf = predict_label(text)
    y_pred.append(pred_lbl)
    y_conf.append(conf)

y_true = df_test["label_true"].tolist()

# store in DataFrame (optional but convenient)
df_test["label_pred"] = y_pred
df_test["pred_conf"]  = y_conf

# ─────────────────── 4.  Evaluation  ─────────────────────────────
print("\nClassification Report")
print(classification_report(y_true, y_pred,
                            target_names=["negative", "neutral", "positive"]))

print("\nConfusion Matrix")
print(pd.DataFrame(confusion_matrix(y_true, y_pred),
                   columns=["pred_neg", "pred_neu", "pred_pos"],
                   index=["actual_neg", "actual_neu", "actual_pos"]))



Classification Report
              precision    recall  f1-score   support

    negative       0.75      0.52      0.62      2381
     neutral       0.47      0.78      0.58      2008
    positive       0.97      0.95      0.96     32446

    accuracy                           0.92     36835
   macro avg       0.73      0.75      0.72     36835
weighted avg       0.93      0.92      0.92     36835


Confusion Matrix
            pred_neg  pred_neu  pred_pos
actual_neg      1246       544       591
actual_neu       189      1568       251
actual_pos       233      1257     30956


In [11]:
clf_tok  = AutoTokenizer.from_pretrained("./gptneo125_sentiment/final")
clf_model = AutoModelForSequenceClassification.from_pretrained("./gptneo125_sentiment/final").eval().to("cuda")

label_map = {0:"negative", 1:"neutral", 2:"positive"}

def predict(text):
    tokens = clf_tok(text, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        logits = clf_model(**tokens).logits
    prob = torch.softmax(logits, dim=-1)[0]
    idx  = prob.argmax().item()
    return {"label": label_map[idx], "prob": float(prob[idx])}

print(predict(f"Tesla Sales Fell 46% in Germany. That’s Just the Latest Bad News for the Stock.."))


{'label': 'negative', 'prob': 0.9980024695396423}
