In [1]:
import pandas as pd

# Load dataset
train_data = pd.read_csv("train_stock_news.csv")
test_data = pd.read_csv("test_stock_news.csv")

In [4]:
pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.20.1-cp312-cp312-win_amd64.whl.metadata (6.2 kB)
Collecting torchaudio
  Downloading torchaudio-2.5.1-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.5.1-cp312-cp312-win_amd64.whl (203.0 MB)
   ---------------------------------------- 0.0/203.0 MB ? eta -:--:--
   - -------------------------------------- 10.0/203.0 MB 62.2 MB/s eta 0:00:04
   ----- ---------------------------------- 25.4/203.0 MB 67.1 MB/s eta 0:00:03
   -------- ------------------------------- 41.2/203.0 MB 70.8 MB/s eta 0:00:03
   ----------- ---------------------------- 57.4/203.0 MB 73.2 MB/s eta 0:00:02
   -------------- ------------------------- 73.4/203.0 MB 74.3 MB/s eta 0:00:02
   ----------------- ---------------------- 87.8/203.0 MB 73.8 MB/s eta 0:00:02
   ---

In [6]:
pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ---------------------------------------  10.0/10.0 MB 56.4 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 52.0 MB/s eta 0:00:00
Downloading huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
Using cached safetensors-0.4.5-cp312-none-win_amd64.whl (286 kB)
Downloading tokenizers-0.20.3-cp312-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0

In [7]:
import re
import torch
from transformers import BertTokenizer, BertModel
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW

# Initialize tokenizer and base model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
base_model = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to extract entities manually
def extract_gist(text, date):
    # Define regex patterns for dates, company names, and events
    date_pattern = r"\b\d{4}-\d{2}-\d{2}\b"  # Matches YYYY-MM-DD
    company_pattern = r"\b[A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+)*\b"  # Matches proper nouns (e.g., "Apple Inc.")
    event_keywords = ["earnings", "merger", "report", "acquisition", "profit", "loss", "growth", "decline"]

    # Extract entities
    dates = re.findall(date_pattern, text)
    companies = re.findall(company_pattern, text)
    events = [word for word in event_keywords if word in text.lower()]

    # Create gist token
    gist = f"[GIST] {' '.join(companies)} {' '.join(events)} [DATE] {date}"
    return gist

# Create Gist Token Dataset
class GistTokenDataset(Dataset):
    def __init__(self, texts, dates, tokenizer, max_len=512):
        self.texts = texts
        self.dates = dates
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        date = self.dates[idx]
        gist = extract_gist(text, date)

        encoding = self.tokenizer.encode_plus(
            gist + " " + text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

In [8]:
# Define Dataset
texts = train_data["Text"].tolist()
dates = train_data["Date"].tolist()
train_dataset = GistTokenDataset(texts, dates, tokenizer)

# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Custom Attention Modification Module
class ModifiedAttentionModel(Module):
    def __init__(self, base_model):
        super(ModifiedAttentionModel, self).__init__()
        self.base_model = base_model
        self.attention_layer = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        # Freeze embedding and encoder layers
        with torch.no_grad():
            outputs = self.base_model(input_ids, attention_mask=attention_mask)
        
        # Add custom attention focusing on gist tokens
        hidden_states = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_dim)
        attention_scores = self.attention_layer(hidden_states).squeeze(-1)  # Shape: (batch_size, seq_len)
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)
        
        # Weighted sum of token representations
        context_vector = torch.matmul(attention_weights.unsqueeze(1), hidden_states).squeeze(1)
        return context_vector


In [9]:
# Initialize modified model
model = ModifiedAttentionModel(base_model).to(device)

# Optimizer and Training Loop
optimizer = AdamW(model.parameters(), lr=1e-4)

def train_attention_model(model, dataloader, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass through the modified model
            context_vector = model(input_ids, attention_mask)
            loss = torch.mean(context_vector)  # Dummy loss for demonstration
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {epoch_loss / len(dataloader)}")



In [10]:
# Train the model
train_attention_model(model, train_loader, optimizer)

# Save the model for downstream tasks
torch.save(model.state_dict(), "modified_attention_model.pth")

Training Epoch 1: 100%|██████████| 161/161 [41:59<00:00, 15.65s/it]


Epoch 1 Loss: -0.015062478932071917


Training Epoch 2: 100%|██████████| 161/161 [41:56<00:00, 15.63s/it]


Epoch 2 Loss: -0.019871406280291007


Training Epoch 3: 100%|██████████| 161/161 [41:56<00:00, 15.63s/it]


Epoch 3 Loss: -0.02361619373008331


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, BertModel

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define tokenizer and load the saved model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define the custom model
class ModifiedAttentionModel(torch.nn.Module):
    def __init__(self, base_model):
        super(ModifiedAttentionModel, self).__init__()
        self.base_model = base_model
        self.attention_layer = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Freeze embeddings and encoder layers
            outputs = self.base_model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        attention_scores = self.attention_layer(hidden_states).squeeze(-1)
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)
        context_vector = torch.matmul(attention_weights.unsqueeze(1), hidden_states).squeeze(1)
        return context_vector

# Load the trained model
base_model = BertModel.from_pretrained("bert-base-uncased")
model = ModifiedAttentionModel(base_model)
model.load_state_dict(torch.load("modified_attention_model.pth", map_location=device))
model.eval()
model.to(device)

  model.load_state_dict(torch.load("modified_attention_model.pth", map_location=device))


ModifiedAttentionModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [3]:
# Define dataset for embedding extraction
class GistEmbeddingDataset(Dataset):
    def __init__(self, texts, dates, tokenizer, max_len=512):
        self.texts = texts
        self.dates = dates
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        date = self.dates[idx]
        gist = f"[GIST] [DATE] {date}"

        encoding = self.tokenizer.encode_plus(
            gist + " " + text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


In [12]:
# Load train and test data
train_data = pd.read_csv("train_stock_news.csv")
test_data = pd.read_csv("test_stock_news.csv")

# Split validation set from the training set
val_split = 0.2

# Calculate labels using Open and Close prices
train_labels = (train_data["Close"] - train_data["Open"] > 0).astype(int).tolist()
test_labels = (test_data["Close"] - test_data["Open"] > 0).astype(int).tolist()

train_texts = train_data["Text"].tolist()
train_dates = train_data["Date"].tolist()

train_texts, val_texts, train_dates, val_dates, train_labels, val_labels = train_test_split(
    train_texts, train_dates, train_labels, test_size=val_split, random_state=42
)

test_texts = test_data["Text"].tolist()
test_dates = test_data["Date"].tolist()

# Create datasets and dataloaders
train_dataset = GistEmbeddingDataset(train_texts, train_dates, tokenizer)
val_dataset = GistEmbeddingDataset(val_texts, val_dates, tokenizer)
test_dataset = GistEmbeddingDataset(test_texts, test_dates, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [13]:
# Function to extract embeddings
def extract_embeddings(model, dataloader):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting Embeddings"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            context_vector = model(input_ids, attention_mask)
            embeddings.append(context_vector.cpu().numpy())
    return np.vstack(embeddings)

# Extract embeddings for train, validation, and test sets
train_embeddings = extract_embeddings(model, train_loader)
val_embeddings = extract_embeddings(model, val_loader)
test_embeddings = extract_embeddings(model, test_loader)

Extracting Embeddings: 100%|██████████| 129/129 [26:45<00:00, 12.44s/it]
Extracting Embeddings: 100%|██████████| 33/33 [06:41<00:00, 12.18s/it]
Extracting Embeddings: 100%|██████████| 69/69 [14:20<00:00, 12.47s/it]


In [14]:
# Train a Random Forest classifier on the extracted embeddings
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(train_embeddings, train_labels)

# Make predictions on validation set
val_predictions = rf_classifier.predict(val_embeddings)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(val_labels, val_predictions)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))


Validation Accuracy: 0.5650485436893203
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.62      0.60       266
           1       0.56      0.51      0.53       249

    accuracy                           0.57       515
   macro avg       0.56      0.56      0.56       515
weighted avg       0.56      0.57      0.56       515



In [15]:
# Make predictions on test set
test_predictions = rf_classifier.predict(test_embeddings)

# Evaluate the model on the test set
test_accuracy = accuracy_score(test_labels, test_predictions)
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))

# Save the predictions for analysis
output = pd.DataFrame({"Text": test_texts, "True Label": test_labels, "Predicted Label": test_predictions})
output.to_csv("stock_price_predictions.csv", index=False)
print("Predictions saved to stock_price_predictions.csv")

Test Accuracy: 0.5371376811594203
Test Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.73      0.63       584
           1       0.51      0.32      0.39       520

    accuracy                           0.54      1104
   macro avg       0.53      0.53      0.51      1104
weighted avg       0.53      0.54      0.52      1104

Predictions saved to stock_price_predictions.csv


In [21]:
# ---------improve the model, using close price only--------
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, BertModel
from imblearn.over_sampling import SMOTE

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define tokenizer and load the saved model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define the custom model
class ModifiedAttentionModel(torch.nn.Module):
    def __init__(self, base_model):
        super(ModifiedAttentionModel, self).__init__()
        self.base_model = base_model
        self.attention_layer = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Freeze embeddings and encoder layers
            outputs = self.base_model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        attention_scores = self.attention_layer(hidden_states).squeeze(-1)
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)
        context_vector = torch.matmul(attention_weights.unsqueeze(1), hidden_states).squeeze(1)
        return context_vector

# Load the trained model
base_model = BertModel.from_pretrained("bert-base-uncased")
model = ModifiedAttentionModel(base_model)
model.load_state_dict(torch.load("modified_attention_model.pth", map_location=device))
model.eval()
model.to(device)

# Define dataset for embedding extraction
class GistEmbeddingDataset(Dataset):
    def __init__(self, texts, dates, tokenizer, max_len=512):
        self.texts = texts
        self.dates = dates
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        date = self.dates[idx]
        gist = f"[GIST] [DATE] {date}"

        encoding = self.tokenizer.encode_plus(
            gist + " " + text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

  model.load_state_dict(torch.load("modified_attention_model.pth", map_location=device))


In [22]:
# Load train and test data
train_data = pd.read_csv("train_stock_news.csv")
test_data = pd.read_csv("test_stock_news.csv")

# Split validation set from the training set
val_split = 0.2

# Calculate labels
train_labels = (train_data["Close"].diff().fillna(0) > 0).astype(int).tolist()
test_labels = (test_data["Close"].diff().fillna(0) > 0).astype(int).tolist()

train_texts = train_data["Text"].tolist()
train_dates = train_data["Date"].tolist()

train_texts, val_texts, train_dates, val_dates, train_labels, val_labels = train_test_split(
    train_texts, train_dates, train_labels, test_size=val_split, random_state=42
)

test_texts = test_data["Text"].tolist()
test_dates = test_data["Date"].tolist()

# Create datasets and dataloaders
train_dataset = GistEmbeddingDataset(train_texts, train_dates, tokenizer)
val_dataset = GistEmbeddingDataset(val_texts, val_dates, tokenizer)
test_dataset = GistEmbeddingDataset(test_texts, test_dates, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [23]:
# Function to extract embeddings
def extract_embeddings(model, dataloader):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Extracting Embeddings"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            context_vector = model(input_ids, attention_mask)
            embeddings.append(context_vector.cpu().numpy())
    return np.vstack(embeddings)

# Extract embeddings for train, validation, and test sets
train_embeddings = extract_embeddings(model, train_loader)
val_embeddings = extract_embeddings(model, val_loader)
test_embeddings = extract_embeddings(model, test_loader)

# Balance the training set using SMOTE
smote = SMOTE(random_state=42)
train_embeddings_balanced, train_labels_balanced = smote.fit_resample(train_embeddings, train_labels)

# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, scoring="accuracy", cv=3, n_jobs=-1, verbose=2)
grid_search.fit(train_embeddings_balanced, train_labels_balanced)

# Best Random Forest model
rf_classifier = grid_search.best_estimator_

# Make predictions on validation set
val_predictions = rf_classifier.predict(val_embeddings)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(val_labels, val_predictions)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

Extracting Embeddings: 100%|██████████| 129/129 [48:25<00:00, 22.52s/it]
Extracting Embeddings: 100%|██████████| 33/33 [12:03<00:00, 21.92s/it]
Extracting Embeddings: 100%|██████████| 69/69 [24:54<00:00, 21.66s/it]


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Validation Accuracy: 0.7281553398058253
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       413
           1       0.29      0.26      0.28       102

    accuracy                           0.73       515
   macro avg       0.56      0.55      0.56       515
weighted avg       0.72      0.73      0.72       515



In [24]:
# Make predictions on test set
test_predictions = rf_classifier.predict(test_embeddings)

# Evaluate the model on the test set
test_accuracy = accuracy_score(test_labels, test_predictions)
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))

# Save the predictions for analysis
output = pd.DataFrame(test_data)  # Include all columns from the test dataset
output["Predicted Label"] = test_predictions
output.to_csv("stock_price_predictions.csv", index=False)
print("Predictions saved to stock_price_predictions.csv")

Test Accuracy: 0.75
Test Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       895
           1       0.23      0.13      0.17       209

    accuracy                           0.75      1104
   macro avg       0.52      0.51      0.51      1104
weighted avg       0.70      0.75      0.72      1104

Predictions saved to stock_price_predictions.csv
