In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

# Load dataset
train_data = pd.read_csv("train_stock_news.csv")
test_data = pd.read_csv("test_stock_news.csv")

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # Binary classification

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
# --- Step 1: Gist Token Creation ---
def create_gist_tokens(text):
    """
    Extract key entities and compress them into a gist token.
    For simplicity, we're summarizing as [COMPANY_EVENT].
    """
    # Example: Customize based on your use case
    if "earnings" in text.lower():
        return "[EARNINGS_REPORT]"
    elif "merger" in text.lower():
        return "[MERGER_EVENT]"
    elif "stock" in text.lower():
        return "[STOCK_MOVEMENT]"
    else:
        return "[GENERAL_NEWS]"

# Apply gist token creation
train_data["gist_token"] = train_data["Text"].apply(create_gist_tokens)
test_data["gist_token"] = test_data["Text"].apply(create_gist_tokens)

# Append the gist token to the text
train_data["Text"] = train_data["gist_token"] + " " + train_data["Text"]
test_data["Text"] = test_data["gist_token"] + " " + test_data["Text"]

In [7]:
# --- Step 2: Dataset Preparation ---
class StockDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Convert stock price changes to binary labels (0 = decrease, 1 = increase)
train_data["Label"] = (train_data["Close"] > train_data["Open"]).astype(int)
test_data["Label"] = (test_data["Close"] > test_data["Open"]).astype(int)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data["Text"], train_data["Label"], test_size=0.2, random_state=42
)

train_dataset = StockDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = StockDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)
test_dataset = StockDataset(test_data["Text"].tolist(), test_data["Label"].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [9]:
# --- Step 3: Training the Model ---
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

def train_model(model, train_loader, val_loader, epochs=3):
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print(f"Epoch {epoch+1}: Train Loss = {train_loss / len(train_loader)}")

        # Validation
        model.eval()
        val_loss = 0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_preds.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy())
                val_targets.extend(labels.cpu().numpy())

        val_acc = accuracy_score(val_targets, val_preds)
        val_f1 = f1_score(val_targets, val_preds)

        print(f"Epoch {epoch+1}: Val Loss = {val_loss / len(val_loader)}, Val Acc = {val_acc}, Val F1 = {val_f1}")

train_model(model, train_loader, val_loader)


Training Epoch 1: 100%|██████████| 129/129 [1:38:50<00:00, 45.97s/it]


Epoch 1: Train Loss = 0.6995639149532762
Epoch 1: Val Loss = 0.6927662726604578, Val Acc = 0.5145631067961165, Val F1 = 0.0


Training Epoch 2: 100%|██████████| 129/129 [1:38:57<00:00, 46.03s/it]


Epoch 2: Train Loss = 0.6897268743478051
Epoch 2: Val Loss = 0.6941817691831877, Val Acc = 0.5184466019417475, Val F1 = 0.008


Training Epoch 3: 100%|██████████| 129/129 [1:36:15<00:00, 44.77s/it]


Epoch 3: Train Loss = 0.6878308076267095
Epoch 3: Val Loss = 0.6914809093330846, Val Acc = 0.516504854368932, Val F1 = 0.6322008862629247


In [11]:
# --- Step 4: Testing and Evaluation ---
model.eval()
test_preds = []
test_targets = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        test_preds.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy())
        test_targets.extend(labels.cpu().numpy())

test_acc = accuracy_score(test_targets, test_preds)
test_f1 = f1_score(test_targets, test_preds)

print(f"Test Accuracy: {test_acc}, Test F1 Score: {test_f1}")

# --- Save Predictions ---
test_data["Predicted_Label"] = test_preds
test_data.to_csv("predictions_with_gist_tokens.csv", index=False)
print("Predictions saved to predictions_with_gist_tokens.csv")

Testing: 100%|██████████| 69/69 [13:49<00:00, 12.02s/it]

Test Accuracy: 0.5108695652173914, Test F1 Score: 0.6223776223776224
Predictions saved to predictions_with_gist_tokens.csv



