In [1]:
# Install required packages
!pip install -q transformers
!pip install -q peft==0.5.0
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q sentencepiece
!pip install -q torch
!pip install -q pandas
!pip install -q numpy
!pip install -q scikit-learn
!pip install -q matplotlib
!pip install -q yfinance


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import torch
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from tqdm import tqdm
import warnings
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
warnings.filterwarnings('ignore')


class StockDataManager:
    def __init__(self):
        self.dirs = ['data', 'models', 'results']
        self.setup_directories()

    def setup_directories(self):
        for dir_name in self.dirs:
            os.makedirs(dir_name, exist_ok=True)
            print(f"Created/verified directory: {dir_name}/")

    # Download StockEmo data
    def download_stockemo_data(self):
        # Authenticate
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)

        file_ids = {
            "train_stockemo.csv": "14kpQhdpjt57ySe9omZSofbmFF4iYUIDc",
            "val_stockemo.csv": "1-8FC0f1RDCNSkRt8doTDMAPrmdmazQ5u",
            "test_stockemo.csv": "1-A1n7mRMbje-me1rQpce_QsfFnlH0av7",
            "processed_stockemo.csv": "1-7QLxjVIezZLJ_Og5m3DmmW32BabnH2i"
        }

        for filename, file_id in file_ids.items():
            output_path = f"data/{filename}"
            if not os.path.exists(output_path):
                print(f"Downloading {filename}...")
                downloaded = drive.CreateFile({"id": file_id})
                downloaded.GetContentFile(output_path)
                print(f"Saved to {output_path}")
            else:
                print(f"{filename} already exists")

    def get_unique_tickers_from_stockemo(self):
        """Get list of unique tickers from processed_stockemo.csv"""
        try:
            processed_df = pd.read_csv("data/processed_stockemo.csv")
            unique_tickers = processed_df['ticker'].unique().tolist()
            print(f"Found {len(unique_tickers)} unique tickers in StockEmo data")
            return unique_tickers
        except Exception as e:
            print(f"Error reading StockEmo data: {str(e)}")
            return []

    def download_stock_data(self, start_date=None, end_date=None, period=None):
        """
        Download stock data for all StockEmo tickers
        """
        # Get symbols from StockEmo
        symbols = self.get_unique_tickers_from_stockemo()

        if not symbols:
            raise ValueError("No tickers found in StockEmo data")

        # Handle dates
        if period and not (start_date or end_date):
            valid_periods = ['1d','5d','1mo','3mo','6mo','1y','2y','5y','10y','ytd','max']
            if period not in valid_periods:
                raise ValueError(f"Invalid period. Must be one of {valid_periods}")
        else:
            if not start_date:
                start_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
            if not end_date:
                end_date = datetime.now().strftime('%Y-%m-%d')

        stock_data = {}
        print("\nDownloading stock data...")
        for symbol in tqdm(symbols, desc="Downloading stock data"):
            try:
                # Create Ticker object
                ticker = yf.Ticker(symbol)

                # Get historical data
                if period:
                    hist = ticker.history(period=period)
                else:
                    hist = ticker.history(start=start_date, end=end_date)

                if not hist.empty:
                    # Convert to DataFrame and reset index
                    df = hist.reset_index()

                    # Add ticker column
                    df.insert(0, 'Ticker', symbol)

                    # Save to CSV
                    output_path = f"data/{symbol}_prices.csv"
                    df.to_csv(output_path, index=False)

                    stock_data[symbol] = df
                else:
                    print(f"\nNo data received for {symbol}")

            except Exception as e:
                print(f"\nError downloading {symbol}: {str(e)}")
                continue

        print(f"\nSuccessfully downloaded data for {len(stock_data)} tickers")
        return stock_data

def initialize_workspace(start_date=None, end_date=None, period=None):
    """Initialize workspace and download stock data"""
    print("Setting up workspace...")
    manager = StockDataManager()

    manager.download_stockemo_data()

    stock_data = manager.download_stock_data(
        start_date=start_date,
        end_date=end_date,
        period=period
    )

    # Print summary of downloaded data
    print("\nDownloaded Data Summary:")
    for symbol, df in stock_data.items():
        print(f"\n{symbol}:")
        print(f"Date Range: {df['Date'].min()} to {df['Date'].max()}")
        print(f"Number of records: {len(df)}")

    return manager, stock_data

if __name__ == "__main__":
    # Download data for all StockEmo tickers
    manager, data = initialize_workspace(
    start_date='2020-01-01',
    end_date='2020-12-31'
)

    # Print total number of tickers downloaded
    print(f"\nTotal number of tickers downloaded: {len(data)}")

Setting up workspace...
Created/verified directory: data/
Created/verified directory: models/
Created/verified directory: results/
Downloading train_stockemo.csv...
Saved to data/train_stockemo.csv
Downloading val_stockemo.csv...
Saved to data/val_stockemo.csv
Downloading test_stockemo.csv...
Saved to data/test_stockemo.csv
Downloading processed_stockemo.csv...
Saved to data/processed_stockemo.csv
Found 39 unique tickers in StockEmo data

Downloading stock data...


Downloading stock data:  15%|█▌        | 6/39 [00:00<00:04,  6.70it/s]ERROR:yfinance:$FB: possibly delisted; no timezone found
Downloading stock data:  23%|██▎       | 9/39 [00:01<00:05,  5.04it/s]


No data received for FB


Downloading stock data:  85%|████████▍ | 33/39 [00:05<00:00,  6.63it/s]ERROR:yfinance:$BRK.A: possibly delisted; no timezone found
Downloading stock data:  87%|████████▋ | 34/39 [00:06<00:01,  3.62it/s]


No data received for BRK.A


ERROR:yfinance:$BRK.B: possibly delisted; no timezone found
Downloading stock data:  92%|█████████▏| 36/39 [00:06<00:00,  3.46it/s]


No data received for BRK.B


Downloading stock data: 100%|██████████| 39/39 [00:07<00:00,  5.28it/s]



Successfully downloaded data for 36 tickers

Downloaded Data Summary:

AAPL:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

AMT:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

AMZN:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

BA:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

BKNG:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

DIS:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

MA:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

GOOGL:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

MCD:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:00:00-05:00
Number of records: 252

GOOG:
Date Range: 2020-01-02 00:00:00-05:00 to 2020-12-30 00:

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
import gc

class SentimentDataset(Dataset):
    def __init__(self, processed_df, price_df, sequence_length, sentiment_model):
        processed_df['date'] = pd.to_datetime(processed_df['date'])
        price_df['Date'] = pd.to_datetime(price_df['Date'])

        self.processed_df = processed_df.sort_values('date')
        self.price_df = price_df.sort_values('Date')
        self.sequence_length = sequence_length
        self.sentiment_model = sentiment_model

        self.scaler = MinMaxScaler()
        self.prices_scaled = self.scaler.fit_transform(self.price_df['Close'].values.reshape(-1, 1)).flatten()

    def __len__(self):
        return len(self.processed_df) - self.sequence_length

    def __getitem__(self, idx):
        text = self.processed_df.iloc[idx]['processed']
        sentiment_score = self.sentiment_model.get_sentiment(text)
        features = torch.tensor([sentiment_score], dtype=torch.float32)

        price_seq = torch.tensor(self.prices_scaled[idx:idx+self.sequence_length], dtype=torch.float32)
        target = torch.tensor(self.prices_scaled[idx+self.sequence_length], dtype=torch.float32)

        return features, price_seq, target

class SentimentModel:
    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        print(f"Initializing FinBERT model on {device}")

        self.tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
        self.model = AutoModelForSequenceClassification.from_pretrained(
            'ProsusAI/finbert'
        ).to(device)
        self.model.eval()
        print("Model loaded successfully!")

    def get_sentiment(self, text):
        try:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                probs = torch.softmax(outputs.logits, dim=1)
                score = probs[0][2].item() - probs[0][0].item()
            return score
        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return 0.0

def analyze_sentiments_by_ticker(processed_df, sentiment_model, batch_size=32):
    results = []
    processed_df['date'] = pd.to_datetime(processed_df['date'])
    grouped = processed_df.groupby('ticker')

    print("\nAnalyzing sentiments for each ticker...")
    for ticker, group in tqdm(grouped, desc="Processing tickers"):
        ticker_results = []

        for i in range(0, len(group), batch_size):
            batch = group.iloc[i:i + batch_size]

            for _, row in batch.iterrows():
                text = row['processed']
                sentiment_score = sentiment_model.get_sentiment(text)

                ticker_results.append({
                    'date': row['date'],
                    'ticker': ticker,
                    'text': text,
                    'sentiment': sentiment_score,
                    'original_sentiment': row['senti_label'],
                    'industry': row['industry']
                })

        ticker_df = pd.DataFrame(ticker_results)
        ticker_df.to_csv(f"data/{ticker}_sentiment_analysis.csv", index=False)
        results.extend(ticker_results)

    return pd.DataFrame(results)

def main():
    print("Initializing stock data download...")
    manager = StockDataManager()
    stock_data = manager.download_stock_data(start_date='2020-01-01',end_date='2020-12-31')
    print("\nLoading StockEmo data...")
    processed_df = pd.read_csv("data/processed_stockemo.csv")

    sentiment_model = SentimentModel()
    results_df = analyze_sentiments_by_ticker(processed_df, sentiment_model)
    results_df.to_csv("data/all_tickers_sentiment_analysis.csv", index=False)
    print("\nSaved sentiment analysis results to data/all_tickers_sentiment_analysis.csv")

    return sentiment_model, results_df, stock_data

if __name__ == "__main__":
    sentiment_model, sentiment_results, stock_data = main()

    print("\nAnalysis Summary:")
    print(f"Number of tickers analyzed: {len(sentiment_results['ticker'].unique())}")
    print("\nSentiment distribution:")
    print(sentiment_results['sentiment'].describe())

Initializing stock data download...
Created/verified directory: data/
Created/verified directory: models/
Created/verified directory: results/
Found 39 unique tickers in StockEmo data

Downloading stock data...


Downloading stock data:  15%|█▌        | 6/39 [00:00<00:04,  7.80it/s]ERROR:yfinance:$FB: possibly delisted; no timezone found
Downloading stock data:  18%|█▊        | 7/39 [00:01<00:09,  3.54it/s]


No data received for FB


Downloading stock data:  82%|████████▏ | 32/39 [00:02<00:00, 24.42it/s]ERROR:yfinance:$BRK.A: possibly delisted; no timezone found
ERROR:yfinance:$BRK.B: possibly delisted; no timezone found
Downloading stock data: 100%|██████████| 39/39 [00:02<00:00, 13.34it/s]



No data received for BRK.A

No data received for BRK.B

Successfully downloaded data for 36 tickers

Loading StockEmo data...
Initializing FinBERT model on cuda


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model loaded successfully!

Analyzing sentiments for each ticker...


Processing tickers:   0%|          | 0/39 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Processing tickers: 100%|██████████| 39/39 [07:46<00:00, 11.96s/it]



Saved sentiment analysis results to data/all_tickers_sentiment_analysis.csv

Analysis Summary:
Number of tickers analyzed: 39

Sentiment distribution:
count    50281.000000
mean         0.662058
std          0.334808
min         -0.941154
25%          0.627737
50%          0.791516
75%          0.857522
max          0.933513
Name: sentiment, dtype: float64


In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
import os

class SentimentLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout=0.2):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x_sentiment, x_price):
        batch_size = x_price.size(0)
        seq_length = x_price.size(1)

        x_sentiment = x_sentiment.unsqueeze(1).repeat(1, seq_length, 1)
        x_combined = torch.cat((x_sentiment, x_price.unsqueeze(-1)), dim=2)

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x_combined.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x_combined.device)

        out, _ = self.lstm(x_combined, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out.squeeze()

class BaselineLSTM(nn.Module):
    def __init__(self, hidden_dim, num_layers, dropout=0.2):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            1,  # Only price input
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x_price):
        batch_size = x_price.size(0)

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x_price.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x_price.device)

        out, _ = self.lstm(x_price.unsqueeze(-1), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out.squeeze()

class TickerDataset(Dataset):
    def __init__(self, price_df, sentiment_df, sequence_length):
        price_df = price_df.copy()
        sentiment_df = sentiment_df.copy()

        # Convert price dates to YYYY-MM-DD
        price_df['Date'] = price_df['Date'].astype(str).str[:10]

        # # Debug print
        # print("\nPrice dates sample:", price_df['Date'].head().tolist())
        # print("Sentiment dates sample:", sentiment_df['date'].head().tolist())
        # print("\nTotal unique price dates:", len(price_df['Date'].unique()))
        # print("Total unique sentiment dates:", len(sentiment_df['date'].unique()))
        # print("\nSample overlap:", len(set(price_df['Date']) & set(sentiment_df['date'])))

        self.price_df = price_df[price_df['Date'].isin(sentiment_df['date'])]
        self.sentiment_df = sentiment_df

        print(f"Matched data points: {len(self.price_df)}")

        if len(self.price_df) < sequence_length + 1:
            raise ValueError(f"Insufficient data points. Need at least {sequence_length + 1} points, got {len(self.price_df)}")

        self.scaler = MinMaxScaler()
        self.prices_scaled = self.scaler.fit_transform(self.price_df['Close'].values.reshape(-1, 1)).flatten()

        self.sequence_length = sequence_length
        self.sentiments = self.sentiment_df['sentiment'].values

    def __len__(self):
        return len(self.prices_scaled) - self.sequence_length - 1

    def __getitem__(self, idx):
        sentiment = torch.tensor([self.sentiments[idx]], dtype=torch.float32)
        price_seq = torch.tensor(self.prices_scaled[idx:idx+self.sequence_length], dtype=torch.float32)
        target = torch.tensor(self.prices_scaled[idx+self.sequence_length], dtype=torch.float32)
        return sentiment, price_seq, target

    def inverse_transform(self, scaled_values):
        return self.scaler.inverse_transform(scaled_values.reshape(-1, 1)).flatten()

class BaselineDataset(Dataset):
    def __init__(self, price_df, sequence_length):
        self.price_df = price_df.copy()
        self.sequence_length = sequence_length

        self.scaler = MinMaxScaler()
        self.prices_scaled = self.scaler.fit_transform(self.price_df['Close'].values.reshape(-1, 1)).flatten()

    def __len__(self):
        return len(self.prices_scaled) - self.sequence_length - 1

    def __getitem__(self, idx):
        price_seq = torch.tensor(self.prices_scaled[idx:idx+self.sequence_length], dtype=torch.float32)
        target = torch.tensor(self.prices_scaled[idx+self.sequence_length], dtype=torch.float32)
        return price_seq, target

    def inverse_transform(self, scaled_values):
        return self.scaler.inverse_transform(scaled_values.reshape(-1, 1)).flatten()


def validate_data_files(tickers: List[str]) -> List[str]:
    valid_tickers = []
    for ticker in tickers:
        sentiment_path = f"data/{ticker}_sentiment_analysis.csv"
        price_path = f"data/{ticker}_prices.csv"

        if os.path.exists(sentiment_path) and os.path.exists(price_path):
            try:
                sentiment_df = pd.read_csv(sentiment_path)
                price_df = pd.read_csv(price_path)
                if len(sentiment_df) > 0 and len(price_df) > 0:
                    valid_tickers.append(ticker)
                else:
                    print(f"Warning: Empty data files for {ticker}")
            except Exception as e:
                print(f"Error reading data for {ticker}: {str(e)}")
        else:
            print(f"Warning: Missing data files for {ticker}")

    print(f"Found {len(valid_tickers)} valid tickers out of {len(tickers)} total")
    return valid_tickers

def train_and_evaluate_ticker(ticker: str, model_params: Dict, train_params: Dict) -> Tuple[List[float], List[float]]:
    price_df = pd.read_csv(f"data/{ticker}_prices.csv")
    sentiment_df = pd.read_csv(f"data/{ticker}_sentiment_analysis.csv")

    dataset = TickerDataset(price_df, sentiment_df, model_params['sequence_length'])

    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=train_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=train_params['batch_size'])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentimentLSTM(
        model_params['input_dim'],
        model_params['hidden_dim'],
        model_params['num_layers']
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params['learning_rate'])

    for epoch in range(train_params['epochs']):
        model.train()
        train_loss = 0
        for sentiment, price_seq, target in train_loader:
            sentiment, price_seq, target = sentiment.to(device), price_seq.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(sentiment, price_seq)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        if epoch % 10 == 0:
            print(f'{ticker} - Epoch {epoch}: Loss = {train_loss/len(train_loader):.4f}')

    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for sentiment, price_seq, target in val_loader:
            sentiment, price_seq = sentiment.to(device), price_seq.to(device)
            output = model(sentiment, price_seq)
            predictions.extend(output.cpu().numpy())
            actuals.extend(target.numpy())

    predictions = dataset.inverse_transform(np.array(predictions))
    actuals = dataset.inverse_transform(np.array(actuals))

    return predictions, actuals

def train_and_evaluate_baseline(ticker: str, model_params: Dict, train_params: Dict) -> Tuple[List[float], List[float]]:
    price_df = pd.read_csv(f"data/{ticker}_prices.csv")
    price_df['Date'] = price_df['Date'].astype(str).str[:10]

    dataset = BaselineDataset(price_df, model_params['sequence_length'])

    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=train_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=train_params['batch_size'])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BaselineLSTM(
        model_params['hidden_dim'],
        model_params['num_layers']
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params['learning_rate'])

    for epoch in range(train_params['epochs']):
        model.train()
        train_loss = 0
        for price_seq, target in train_loader:
            price_seq, target = price_seq.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(price_seq)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        if epoch % 10 == 0:
            print(f'{ticker} Baseline - Epoch {epoch}: Loss = {train_loss/len(train_loader):.4f}')

    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for price_seq, target in val_loader:
            price_seq = price_seq.to(device)
            output = model(price_seq)
            predictions.extend(output.cpu().numpy())
            actuals.extend(target.numpy())

    predictions = dataset.inverse_transform(np.array(predictions))
    actuals = dataset.inverse_transform(np.array(actuals))

    return predictions, actuals

def plot_comparison(ticker: str, sentiment_pred: List[float], baseline_pred: List[float], actuals: List[float]):
    plt.figure(figsize=(15, 10))

    # Price Comparison Plot
    plt.plot(actuals, label='Actual', color='blue', linewidth=2)
    plt.plot(sentiment_pred, label='LSTM with Sentiment', color='red', linestyle='--', linewidth=2)
    plt.plot(baseline_pred, label='Baseline LSTM', color='green', linestyle='--', linewidth=2)

    plt.title(f'Price Predictions Comparison for {ticker}')
    plt.xlabel('Time Steps')
    plt.ylabel('Price')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'results/{ticker}_comparison.png')
    plt.close()


def plot_results(ticker: str, predictions: List[float], actuals: List[float]):
    plt.figure(figsize=(15, 10))

    # Create subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    # Price Comparison Plot
    ax1.plot(actuals, label='Actual', color='blue', linewidth=2)
    ax1.plot(predictions, label='Predicted', color='red', linestyle='--', linewidth=2)
    ax1.set_title(f'Price Predictions vs Actual for {ticker}')
    ax1.set_xlabel('Time Steps')
    ax1.set_ylabel('Price')
    ax1.legend()
    ax1.grid(True)

    # Error Analysis Plot
    errors = np.array(predictions) - np.array(actuals)
    ax2.plot(errors, color='green', label='Prediction Error')
    ax2.axhline(y=0, color='r', linestyle='-', alpha=0.3)
    ax2.set_title('Prediction Error Over Time')
    ax2.set_xlabel('Time Steps')
    ax2.set_ylabel('Error (Predicted - Actual)')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig(f'results/{ticker}_analysis.png')
    plt.close()

def plot_mse_comparison(results_df):
    # Filter successful predictions only
    df = results_df[results_df['status'] == 'success']

    plt.figure(figsize=(15, 8))

    # Create grouped bars for MSE comparison
    tickers = df.index
    x = np.arange(len(tickers))
    width = 0.35

    plt.bar(x - width/2, df['sentiment_mse'], width, label='LSTM with Sentiment', color='blue', alpha=0.7)
    plt.bar(x + width/2, df['baseline_mse'], width, label='Baseline LSTM', color='red', alpha=0.7)

    plt.xlabel('Tickers')
    plt.ylabel('Mean Squared Error (MSE)')
    plt.title('MSE Comparison: Sentiment LSTM vs Baseline LSTM')
    plt.xticks(x, tickers, rotation=45)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    plt.savefig('results/mse_comparison.png')
    plt.close()

    # Calculate and print average improvement
    avg_improvement = df['improvement'].mean()
    print(f"\nAverage MSE Improvement: {avg_improvement:.2f}%")

    # Print individual MSE comparisons
    print("\nMSE Comparison by Ticker:")
    for ticker in df.index:
        print(f"{ticker}:")
        print(f"  Sentiment LSTM MSE: {df.loc[ticker, 'sentiment_mse']:.6f}")
        print(f"  Baseline LSTM MSE: {df.loc[ticker, 'baseline_mse']:.6f}")
        print(f"  Improvement: {df.loc[ticker, 'improvement']:.2f}%")


def main():
    model_params = {
        'input_dim': 2, #2 for sentiment, 1 for baseline
        'hidden_dim': 25,  # Best performing hidden size
        'num_layers': 2,
        'sequence_length': 5  # Best performing window size
    }

    train_params = {
        'batch_size': 32,
        'epochs': 250,  # Best performing epoch count
        'learning_rate': 0.001,
        'train_split': 0.67  # 67% train, 33% test
    }

    all_tickers = [f.split('_')[0] for f in os.listdir('data') if f.endswith('_sentiment_analysis.csv')]
    valid_tickers = validate_data_files(all_tickers)

    os.makedirs('results', exist_ok=True)

    results = {}
    for ticker in valid_tickers:
        print(f"\nProcessing {ticker}")
        try:
            sentiment_pred, sentiment_actuals = train_and_evaluate_ticker(ticker, model_params, train_params)
            baseline_pred, baseline_actuals = train_and_evaluate_baseline(ticker, model_params, train_params)

            sentiment_mse = np.mean((sentiment_pred - sentiment_actuals) ** 2)
            sentiment_mae = np.mean(np.abs(sentiment_pred - sentiment_actuals))
            baseline_mse = np.mean((baseline_pred - baseline_actuals) ** 2)
            baseline_mae = np.mean(np.abs(baseline_pred - baseline_actuals))

            results[ticker] = {
                'sentiment_mse': sentiment_mse,
                'sentiment_mae': sentiment_mae,
                'baseline_mse': baseline_mse,
                'baseline_mae': baseline_mae,
                'improvement': ((baseline_mse - sentiment_mse) / baseline_mse) * 100,
                'status': 'success'
            }

            plot_results(ticker, sentiment_pred, sentiment_actuals)
            plot_comparison(ticker, sentiment_pred, baseline_pred, sentiment_actuals)

        except Exception as e:
            print(f"Error processing {ticker}: {str(e)}")
            results[ticker] = {
                'sentiment_mse': None,
                'sentiment_mae': None,
                'baseline_mse': None,
                'baseline_mae': None,
                'improvement': None,
                'status': f'failed: {str(e)}'
            }

    results_df = pd.DataFrame.from_dict(results, orient='index')
    results_df.to_csv('results/model_comparison_metrics.csv')
    plot_mse_comparison(results_df)


    success_count = sum(1 for v in results.values() if v['status'] == 'success')
    print(f"\nProcessing complete:")
    print(f"Successfully processed: {success_count}/{len(valid_tickers)} tickers")
    print(f"Results saved to: results/model_comparison_metrics.csv")

if __name__ == '__main__':
    main()

Found 36 valid tickers out of 40 total

Processing BA
Matched data points: 218
BA - Epoch 0: Loss = 0.2290
BA - Epoch 10: Loss = 0.0330
BA - Epoch 20: Loss = 0.0069
BA - Epoch 30: Loss = 0.0063
BA - Epoch 40: Loss = 0.0056
BA - Epoch 50: Loss = 0.0081
BA - Epoch 60: Loss = 0.0056
BA - Epoch 70: Loss = 0.0051
BA - Epoch 80: Loss = 0.0050
BA - Epoch 90: Loss = 0.0058
BA - Epoch 100: Loss = 0.0050
BA - Epoch 110: Loss = 0.0055
BA - Epoch 120: Loss = 0.0049
BA - Epoch 130: Loss = 0.0048
BA - Epoch 140: Loss = 0.0049
BA - Epoch 150: Loss = 0.0049
BA - Epoch 160: Loss = 0.0045
BA - Epoch 170: Loss = 0.0043
BA - Epoch 180: Loss = 0.0043
BA - Epoch 190: Loss = 0.0039
BA - Epoch 200: Loss = 0.0043
BA - Epoch 210: Loss = 0.0037
BA - Epoch 220: Loss = 0.0027
BA - Epoch 230: Loss = 0.0035
BA - Epoch 240: Loss = 0.0031
BA Baseline - Epoch 0: Loss = 0.2495
BA Baseline - Epoch 10: Loss = 0.0491
BA Baseline - Epoch 20: Loss = 0.0075
BA Baseline - Epoch 30: Loss = 0.0054
BA Baseline - Epoch 40: Loss = 

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>