In [1]:
import pandas as pd

# Load the dataset
file_path = "aapl_news_content.csv"  # Update with the correct path to your dataset
# utf-8-sig transparently strips a leading byte-order mark if the CSV has one
data = pd.read_csv(file_path, encoding='utf-8-sig')

# Examine the dataset structure
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 904 entries, 0 to 903
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    904 non-null    object
 1   Url     904 non-null    object
 2   Text    904 non-null    object
 3   Mark    904 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 28.4+ KB
None


In [2]:
# Show the leading rows as a quick sanity check of the parsed columns
print("\nFirst 5 Rows of the Dataset:")
first_rows = data.head()
print(first_rows)


First 5 Rows of the Dataset:
                               Date  \
0   January 25, 2024 — 03:33 am EST   
1  February 03, 2024 — 01:07 pm EST   
2  February 03, 2024 — 10:20 am EST   
3  February 03, 2024 — 07:40 am EST   
4  February 03, 2024 — 05:00 am EST   

                                                 Url  \
0  https://www.nasdaq.com/articles/zacks-earnings...   
1  https://www.nasdaq.com/articles/could-apple-ac...   
2  https://www.nasdaq.com/articles/microsoft-tesl...   
3  https://www.nasdaq.com/articles/forget-amd-in-...   
4  https://www.nasdaq.com/articles/1-unstoppable-...   

                                                Text  Mark  
0  For Immediate Release\nChicago, IL – January 2...     1  
1  Thanks to its lineup of incredibly popular har...     1  
2  In this podcast, Motley Fool host Dylan Lewis ...     1  
3  Shares in Advanced Micro Devices (NASDAQ: AMD)...     1  
4  Investing in the stock market is one of the mo...     1  


In [3]:
# Select the rows whose 'Text' value, viewed as a string, matches the
# digits-only pattern below.
print("\nRows Where 'Text' Contains Only Integers:")
digits_only_mask = data['Text'].astype(str).str.match(r'^\d+$')
integer_text_rows = data[digits_only_mask]
print(integer_text_rows)


Rows Where 'Text' Contains Only Integers:
             Date                                                Url Text  \
14              2  https://www.nasdaq.com/articles/us-stocks-wall...    2   
53              2  https://www.nasdaq.com/articles/us-stocks-wall...    2   
77              2  https://www.nasdaq.com/articles/us-stocks-futu...    2   
129             2  https://www.nasdaq.com/articles/us-stocks-wall...    2   
130             0  https://www.nasdaq.com/articles/apple-q1-earni...    0   
..            ...                                                ...  ...   
899  DEC 19, 2023  https://www.nasdaq.com/articles/49.1-of-warren...    0   
900  DEC 19, 2023  https://www.nasdaq.com/articles/3-tech-stocks-...    0   
901  DEC 19, 2023  https://www.nasdaq.com/articles/2-top-warren-b...    0   
902  DEC 19, 2023  https://www.nasdaq.com/articles/tsmc-to-promot...    0   
903  DEC 18, 2023  https://www.nasdaq.com/articles/3-metaverse-st...    0   

     Mark  
14      2  
53      

In [4]:
# Tally how often each digit-only value occurs in the 'Text' column
print("\nSummary of Rows Where 'Text' Contains Only Integers:")
digit_value_counts = integer_text_rows['Text'].value_counts()
print(digit_value_counts)


Summary of Rows Where 'Text' Contains Only Integers:
Text
0    756
2      5
Name: count, dtype: int64


In [5]:
# Compare the number of digit-only 'Text' rows against all remaining rows
print("\nCount of Rows with Non-Numeric Text vs. Numeric Text:")
text_as_str = data['Text'].astype(str)
is_numeric = text_as_str.str.match(r'^\d+$')  # boolean mask, True for digit-only rows
print(f"Numeric Text Rows: {is_numeric.sum()}")
print(f"Non-Numeric Text Rows: {len(data) - is_numeric.sum()}")


Count of Rows with Non-Numeric Text vs. Numeric Text:
Numeric Text Rows: 761
Non-Numeric Text Rows: 143


In [46]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

# Read the news articles and the processed price history from disk.
# The stock file's 'Date' column is parsed into datetimes while loading.
news_data = pd.read_csv(
    "aa_news_content.csv",
    encoding="utf-8-sig",
)
stock_data = pd.read_csv(
    "aa_stock_processed.csv",
    parse_dates=["Date"],
)

In [48]:
import pandas as pd
import re

# Function to clean and standardize the Date column
def clean_date(date_str):
    """Parse a scraped article date into a pandas Timestamp.

    Everything after the first em dash (the time and timezone part, e.g.
    "— 03:33 am EST") is discarded, then the remainder is parsed as
    "<month> <day>, <year>". Both full month names ("January 25, 2024") and
    abbreviated ones ("DEC 19, 2023") are accepted.

    Args:
        date_str: Raw cell value from the 'Date' column. Usually a string;
            an already-parsed Timestamp is returned unchanged so the cell
            can be re-run safely.

    Returns:
        A pandas Timestamp at midnight of the parsed day, or pd.NaT when the
        value is missing, not a string, or matches neither format.
    """
    # Already parsed (for example when this cell is executed a second time).
    if isinstance(date_str, pd.Timestamp):
        return date_str
    # Missing cells arrive as float NaN / None and have no .split() method.
    if not isinstance(date_str, str):
        return pd.NaT

    # Remove everything after the first '—' (time and timezone)
    date_part = date_str.split('—')[0].strip()

    # Try the full month name first, then the abbreviated form.
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        parsed = pd.to_datetime(date_part, format=fmt, errors="coerce")
        if not pd.isna(parsed):
            return parsed
    return pd.NaT

# Run every entry of the 'Date' column through clean_date and store the
# result back in the same column.
cleaned_dates = news_data['Date'].apply(clean_date)
news_data['Date'] = cleaned_dates

# Show the frame after cleaning
print(news_data)

           Date                                                Url  \
0    2024-01-17  https://www.nasdaq.com/articles/sp-futures-sli...   
1    2024-01-17  https://www.nasdaq.com/articles/how-the-pieces...   
2    2024-01-17  https://www.nasdaq.com/articles/sp-futures-tic...   
3    2024-01-17  https://www.nasdaq.com/articles/alcoa-q4-23-ea...   
4    2024-01-16  https://www.nasdaq.com/articles/chipmakers-lea...   
...         ...                                                ...   
2242 2023-04-06  https://www.nasdaq.com/articles/alcoa-aa-stock...   
2243 2023-04-06  https://www.nasdaq.com/articles/guru-fundament...   
2244 2023-04-06  https://www.nasdaq.com/articles/may-26th-optio...   
2245 2023-04-06  https://www.nasdaq.com/articles/pre-market-mos...   
2246 2023-04-04  https://www.nasdaq.com/articles/unusual-put-op...   

                                                   Text  Mark  
0     March S&P 500 E-Mini futures (ESH24) are trend...     1  
1     Looking at the underlying

In [50]:
# Parse the 'Date' column (values that cannot be parsed are coerced to NaT)
# and keep only the calendar-date part of each value.
parsed_stock_dates = pd.to_datetime(stock_data['Date'], errors='coerce')
stock_data['Date'] = parsed_stock_dates.dt.date

# Show the frame after the conversion
print(stock_data)

             Date       Open       High        Low      Close  Adj close  \
0      2024-02-02  29.000000  29.719999  28.549999  29.490000  29.490000   
1      2024-02-01  30.080000  30.405001  29.150000  29.690001  29.690001   
2      2024-01-31  30.490000  31.360001  29.715000  29.750000  29.750000   
3      2024-01-30  30.340000  30.840000  30.000000  30.610001  30.610001   
4      2024-01-29  30.459999  30.969999  29.688999  30.910000  30.910000   
...           ...        ...        ...        ...        ...        ...   
13637  1970-01-08   6.971203   7.008750   6.921141   6.946172   1.896820   
13638  1970-01-07   6.983719   6.983719   6.958688   6.971203   1.903655   
13639  1970-01-06   7.083844   7.108875   7.008750   7.008750   1.913908   
13640  1970-01-05   7.158938   7.209000   7.071328   7.083844   1.934414   
13641  1970-01-02   7.158938   7.234031   7.158938   7.158938   1.954920   

        Volume  
0      4954000  
1      4174600  
2      5760400  
3      4714700  
4 

In [54]:
# Convert the 'Date' column of both frames (news first, then stock) with
# pd.to_datetime; values that cannot be parsed are coerced to NaT.
for frame in (news_data, stock_data):
    frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')

In [56]:
# Add the lagged date column BEFORE either merge. When it was added between
# the two merges, the same-day result only contained 'Lagged_Date' if this
# cell had already been executed once, so the saved columns depended on how
# many times the cell was run.
# NOTE(review): the lag is one calendar day, so an article dated on a day
# whose next calendar day has no row in stock_data gets no lagged match;
# confirm that this is intended.
news_data['Lagged_Date'] = news_data['Date'] + pd.Timedelta(days=1)

# Merge stock_data with same-day news
merged_same_day = pd.merge(stock_data, news_data, on="Date", how="inner", suffixes=("", "_same"))

# Merge stock_data with lagged news: a price row on day D is paired with
# articles whose own date is D - 1. The stock frame keeps the plain 'Date'
# name and the article's date is renamed to 'Date_lagged'.
merged_lagged = pd.merge(stock_data, news_data, left_on="Date", right_on="Lagged_Date", how="inner", suffixes=("", "_lagged"))

# Combine same-day and lagged-day data, dropping rows that are exact duplicates
merged_combined = pd.concat([merged_same_day, merged_lagged], axis=0).drop_duplicates()

# Save the combined dataset to a new CSV
output_file_path = "merged_stock_news.csv"
merged_combined.to_csv(output_file_path, index=False, encoding="utf-8-sig")

print(f"New dataset saved as: {output_file_path}")
print(merged_combined.head())

New dataset saved as: merged_stock_news.csv
        Date   Open   High        Low  Close  Adj close   Volume  \
0 2024-01-17  27.17  27.67  27.049999  27.18      27.18  9606400   
1 2024-01-17  27.17  27.67  27.049999  27.18      27.18  9606400   
2 2024-01-17  27.17  27.67  27.049999  27.18      27.18  9606400   
3 2024-01-17  27.17  27.67  27.049999  27.18      27.18  9606400   
4 2024-01-17  27.17  27.67  27.049999  27.18      27.18  9606400   

                                                 Url  \
0  https://www.nasdaq.com/articles/sp-futures-sli...   
1  https://www.nasdaq.com/articles/how-the-pieces...   
2  https://www.nasdaq.com/articles/sp-futures-tic...   
3  https://www.nasdaq.com/articles/alcoa-q4-23-ea...   
4  https://www.nasdaq.com/articles/compared-to-es...   

                                                Text  Mark Lagged_Date  \
0  March S&P 500 E-Mini futures (ESH24) are trend...     1  2024-01-18   
1  Looking at the underlying holdings of the ETFs...     1  20

In [58]:
# Read the merged file back in, parsing 'Date' as datetimes
merged_data = pd.read_csv("merged_stock_news.csv", parse_dates=["Date"])

# Order the rows chronologically (oldest first), then renumber the index
merged_data = merged_data.sort_values(by="Date")
merged_data = merged_data.reset_index(drop=True)

# Persist the sorted frame under a new name
sorted_file_path = "sorted_merged_stock_news.csv"
merged_data.to_csv(sorted_file_path, index=False, encoding="utf-8-sig")
print(f"Sorted dataset saved as: {sorted_file_path}")

Sorted dataset saved as: sorted_merged_stock_news.csv


In [60]:
from sklearn.model_selection import train_test_split

# 70% / 30% train/test split performed without shuffling the rows
train_data, test_data = train_test_split(
    merged_data,
    test_size=0.3,
    random_state=42,
    shuffle=False,
)

# Output locations for the two partitions
train_file_path = "train_stock_news.csv"
test_file_path = "test_stock_news.csv"

# Write the train partition first, then the test partition
for frame, path in ((train_data, train_file_path), (test_data, test_file_path)):
    frame.to_csv(path, index=False, encoding="utf-8-sig")

print(f"Train dataset saved as: {train_file_path}")
print(f"Test dataset saved as: {test_file_path}")

Train dataset saved as: train_stock_news.csv
Test dataset saved as: test_stock_news.csv


In [62]:
pip install transformers torch scikit-learn pandas




In [64]:
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

# Custom Dataset for BERT
class StockNewsDataset(Dataset):
    """Pairs each news text with a price target for regression.

    Args:
        texts: Indexable collection of article texts.
        prices: Indexable collection of numeric targets, aligned with `texts`.
        tokenizer: Object exposing `encode_plus(...)` that returns a mapping
            with "input_ids" and "attention_mask" tensors.
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, prices, tokenizer, max_len):
        self.texts = texts
        self.prices = prices
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample `idx` and return its tensors.

        Returns:
            dict with "input_ids" and "attention_mask" (leading batch
            dimension removed) and "target" as a float tensor.
        """
        # Cast to str so a missing (NaN) or numeric 'Text' cell cannot break
        # the tokenizer; the classification dataset later in this notebook
        # does the same.
        text = str(self.texts[idx])
        price = self.prices[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_tensors="pt",
            truncation=True,
            padding="max_length"
        )
        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it
            # so the DataLoader can stack samples into (batch, max_len).
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "target": torch.tensor(price, dtype=torch.float)
        }

# Wrap the train and test frames: article text as input, 'Close' as the target
train_dataset = StockNewsDataset(
    train_data["Text"].tolist(),
    train_data["Close"].tolist(),
    tokenizer,
    128,
)
test_dataset = StockNewsDataset(
    test_data["Text"].tolist(),
    test_data["Close"].tolist(),
    tokenizer,
    128,
)

# Only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [66]:
class BertStockPredictor(nn.Module):
    """Regression head on top of a BERT-style encoder.

    Args:
        bert_model: Encoder module whose forward accepts `input_ids` and
            `attention_mask` and returns an object with a
            `last_hidden_state` tensor.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # Take the encoder width from its config so encoders of other sizes
        # work too; fall back to 768 (BERT base) when no config is exposed.
        config = getattr(bert_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, 1)  # 1 output for regression

    def forward(self, input_ids, attention_mask):
        """Return one regression value per sample, shaped (batch, 1)."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return self.fc(cls_output)

# Fetch the pretrained encoder and hand it to the regression model
bert_model = BertModel.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)
model = BertStockPredictor(bert_model=bert_model)

In [68]:
# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # Adjust the number of epochs as needed
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        # squeeze(-1) removes only the trailing size-1 dimension, turning
        # (batch, 1) into (batch,). A bare squeeze() would also collapse a
        # final batch holding a single sample to a 0-d tensor, which no
        # longer matches the (1,) target shape.
        outputs = model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Epoch 1, Loss: 593.7792147997743
Epoch 2, Loss: 493.8737049932065
Epoch 3, Loss: 462.97387515239836


In [74]:
import pandas as pd
from sklearn.metrics import mean_squared_error

# Ensure the model is in evaluation mode
model.eval()
all_preds = []
all_targets = []

# Collect predictions and targets
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)

        # squeeze(-1) keeps the batch dimension even for a final batch of one
        # sample; a bare squeeze() would yield a 0-d array there, and
        # list.extend() cannot iterate over a 0-d array.
        outputs = model(input_ids, attention_mask).squeeze(-1)
        all_preds.extend(outputs.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Calculate RMSE as the square root of the MSE. The `squared=False` keyword
# of mean_squared_error was deprecated and later removed from scikit-learn,
# so it is not used here.
rmse = mean_squared_error(all_targets, all_preds) ** 0.5
print(f"Test RMSE: {rmse:.4f}")

# Add predictions to the test dataset
test_data = test_data.copy()  # Avoid modifying the original test_data
test_data["prediction"] = all_preds

# Save the test data with predictions to a new CSV
output_file_path = "predictions.csv"
test_data.to_csv(output_file_path, index=False, encoding="utf-8-sig")

print(f"Predictions saved to {output_file_path}")

Test RMSE: 43.0177
Predictions saved to predictions.csv




In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the chronologically sorted merge, parsing 'Date' as datetimes
merged_data = pd.read_csv("sorted_merged_stock_news.csv", parse_dates=["Date"])

# Binary target: 1 when the close is strictly above the open (Up),
# 0 otherwise (Down/No Change)
closed_higher = merged_data["Close"] > merged_data["Open"]
merged_data["Movement"] = closed_higher.astype(int)

# Discard rows lacking either the text or the label
merged_data = merged_data.dropna(subset=["Text", "Movement"])

# 70% / 30% train/test split performed without shuffling the rows
train_data, test_data = train_test_split(
    merged_data,
    test_size=0.3,
    random_state=42,
    shuffle=False,
)

# Plain Python lists of texts and labels for the Dataset wrappers
train_texts, train_labels = train_data["Text"].tolist(), train_data["Movement"].tolist()
test_texts, test_labels = test_data["Text"].tolist(), test_data["Movement"].tolist()

In [3]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

# Dataset wrapper that feeds tokenized news text and class labels to BERT
class StockNewsDataset(Dataset):
    """Pairs each news text with an integer class label.

    Args:
        texts: Indexable collection of article texts (each is cast to str).
        labels: Indexable collection of integer labels aligned with `texts`.
        tokenizer: Object exposing `encode_plus(...)` that returns a mapping
            with "input_ids" and "attention_mask" tensors.
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample `idx`; tensors are flattened to one dimension."""
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )
        sample = {
            key: encoded[key].flatten()
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(raw_label, dtype=torch.long)
        return sample

# Wrap the train and test texts/labels, each encoded to 128 tokens
train_dataset = StockNewsDataset(train_texts, train_labels, tokenizer, 128)
test_dataset = StockNewsDataset(test_texts, test_labels, tokenizer, 128)

# Batches of 16; only the training loader shuffles its samples
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
)

In [5]:
from transformers import BertModel
import torch
import torch.nn as nn

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output logits produced per sample.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        # Linear layer sized from the encoder's configured hidden width
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, n_classes)."""
        encoder_result = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is the encoder's pooled per-sequence representation
        pooled = encoder_result.pooler_output
        return self.out(self.dropout(pooled))

# Pick the GPU when CUDA is available, otherwise the CPU, and place a
# two-class model on it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BertClassifier(n_classes=2)
model = model.to(device)

In [7]:
# transformers deprecated its own AdamW and newer releases no longer export
# it; fall back to the PyTorch optimizer when the import is unavailable.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW
from transformers import get_scheduler

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
# eps and weight_decay are spelled out with the transformers.AdamW default
# values so that both implementations are configured identically
# (torch.optim.AdamW has different defaults for these two settings).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 5  # Number of epochs

# Learning rate scheduler: linear schedule without warmup, stepped once per
# batch, so the total number of steps is batches-per-epoch times epochs.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")



Epoch 1/5, Loss: 0.7015
Epoch 2/5, Loss: 0.6894
Epoch 3/5, Loss: 0.6592
Epoch 4/5, Loss: 0.6028
Epoch 5/5, Loss: 0.5539


In [9]:
from sklearn.metrics import classification_report, f1_score

# Evaluate the model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        # Index of the largest logit along the class dimension is the
        # predicted class
        _, preds = torch.max(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate F1 Score
f1 = f1_score(all_labels, all_preds, average="weighted")
print(f"F1 Score: {f1:.4f}")

# Classification report. labels=[0, 1] pins the report to both classes so
# the two target_names always line up, even when only one class occurs
# among the labels and predictions (otherwise scikit-learn raises).
print(classification_report(all_labels, all_preds, labels=[0, 1], target_names=["Down/No Change", "Up"]))

# Save predictions to CSV
label_names = {1: "Up", 0: "Down/No Change"}
test_data = test_data.copy()
test_data["predictions"] = all_preds
test_data["predictions"] = test_data["predictions"].map(label_names)
# Map 'Movement' only while it is still numeric. Once it holds the string
# names, mapping again would turn every value into NaN, so re-running this
# cell used to blank the column.
if pd.api.types.is_numeric_dtype(test_data["Movement"]):
    test_data["Movement"] = test_data["Movement"].map(label_names)

# Save to CSV
test_data.to_csv("prediction_updown.csv", index=False, encoding="utf-8-sig")
print("Predictions saved to prediction_updown.csv")

F1 Score: 0.5146
                precision    recall  f1-score   support

Down/No Change       0.54      0.56      0.55       584
            Up       0.48      0.46      0.47       520

      accuracy                           0.52      1104
     macro avg       0.51      0.51      0.51      1104
  weighted avg       0.51      0.52      0.51      1104

Predictions saved to prediction_updown.csv
