In [None]:
import os
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

FEATURE_COLUMNS = [
    "year", "month", "intrinsic_value", "stock_exret", "stock_ticker", "comp_name", "be_me", "ni_me", "fcf_me",
    "betadown_252d", "ni_ar1", "z_score", "ebit_sale", "at_turnover",
    "market_equity", "roic", "bvps", "prev_bvps", "bvps_change", "prev_at_turnover",
    "at_turnover_change", "prev_ni_me", "ni_me_change", "prev_fcf_me", "fcf_me_change",
    "sin_month", "cos_month"
] # 27 - 4 = 23
# ignore stock_ticker, comp_name, year, month, ni_me_change, fcf_me_change

SEQ_LENGTH = 12

In [2]:
def process_time_features(df):
    """ Remove `year`, and encode `month` cyclically """
    if "year" not in df.columns or "month" not in df.columns:
        raise KeyError("The dataset does not contain 'year' or 'month' columns.")

    df["sin_month"] = np.sin(2 * np.pi * df["month"] / 12)
    df["cos_month"] = np.cos(2 * np.pi * df["month"] / 12)
    return df
    # return df.drop(columns=["year", "month"])  # Remove original time columns

In [49]:
def normalize_features(df):
    print(df)
    feature_columns = [col for col in df.columns if col not in ["year", "month", "stock_exret"]]

    # Identify non-numeric columns
    # non_numeric_cols = df[feature_columns].select_dtypes(exclude=[np.number]).columns.tolist()
    # if non_numeric_cols:
    #     print("Dropping non-numeric columns:", non_numeric_cols)
    #     df = df.drop(columns=non_numeric_cols)

    # Replace NaN and infinite values
    # df[feature_columns] = df[feature_columns].replace([np.inf, -np.inf], np.nan)  # Convert inf to NaN
    # # Check for infinity values
    # print("Has Inf values:\n", df.isin([np.inf, -np.inf]).sum())

    # # Check if any values are too large
    # print("Max values:\n", df.max())

    # # Check if any column has extreme variance
    # print("Std deviation:\n", df.std())

    df[feature_columns] = df[feature_columns].fillna(0)  # Replace NaN with 0

    # Normalize using MinMaxScaler
    scaler = MinMaxScaler()
    df[feature_columns] = scaler.fit_transform(df[feature_columns])

    return df

In [47]:
def load_company_data(company):
    file_path = f"data/{company}.csv"

    if not os.path.exists(file_path):
        print(f"Skipping {company} (No data found)")
        return None, None

    df = pd.read_csv(file_path)

    df = df.fillna(0)  # Replace NaNs with 0
    
    df = df.drop(columns=["stock_ticker", "comp_name", "ni_me_change", "fcf_me_change",
    "prev_intrinsic_value","next_intrinsic_value","prev_stock_exret","next_stock_exret","prev_be_me",
    "next_be_me","prev_ni_me","next_ni_me","prev_fcf_me","next_fcf_me","prev_betadown_252d",
    "next_betadown_252d","prev_ni_ar1","next_ni_ar1","prev_z_score","next_z_score","prev_ebit_sale",
    "next_ebit_sale","prev_at_turnover","next_at_turnover","prev_market_equity","next_market_equity"
    ,"prev_bvps","next_bvps"
    ], errors="ignore")
    
    # Normalize feature columns (excluding year, month, and stock_exret)
    df = normalize_features(df)

    
    if "year" not in df.columns or "month" not in df.columns:
        print(f"Skipping {company} (Missing 'year' or 'month' column)")
        return None, None

    # df = df.sort_values(by=["year", "month"]) # already sorted
    df_train = df[(df["year"] < 2023) | ((df["year"] == 2023) & (df["month"] < 12))].copy()
    df_test = df[(df["year"] == 2023) & (df["month"] == 12)].copy()

    df_train = process_time_features(df_train)
    df_test = process_time_features(df_test)

    return df_train, df_test

In [48]:
def create_lstm_sequences(df, seq_length, company_id):
    """
    Converts dataframe into LSTM sequences for training only.
    Uses a rolling window approach: each sequence consists of `seq_length` months,
    and the next month's value is the target.
    """
    X_train, Y_train, company_ids_train = [], [], []

    # df = df[df['year'] == 2022]
    # df = df[df['year'].isin([2022, 2023])]

    df = df.drop(columns=['year', 'month'])

    print(df)

    df_values = df.values  # Convert DataFrame to numpy array
    num_samples = len(df_values)

    for i in range(num_samples - seq_length):
        x_seq = df_values[i:i + seq_length]  # Past `seq_length` months
        y_target = df_values[i + seq_length][0]  # Predict next month's `stock_exret`

        X_train.append(x_seq)
        Y_train.append(y_target)
        company_ids_train.append(company_id)

    return np.array(X_train), np.array(Y_train), np.array(company_ids_train)

In [53]:
class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, num_companies, embedding_dim):
        super(StockLSTM, self).__init__()
        
        self.input_size = input_size

        # Company embedding layer
        self.embedding = nn.Embedding(num_companies, embedding_dim)

        # LSTM
        self.lstm = nn.LSTM(input_size + embedding_dim, hidden_size, num_layers, batch_first=True)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, company_ids):
        # Convert company IDs to embeddings
        company_embedding = self.embedding(company_ids).unsqueeze(1)  # (batch, 1, embedding_dim)
        company_embedding = company_embedding.expand(-1, x.shape[1], -1)  # Repeat across time steps

        # Concatenate embeddings with financial data
        x = torch.cat((x, company_embedding), dim=2)
        # print(f"Model forward input shape: {x.shape}")  # (batch, seq_length, input_size)

        # Pass through LSTM
        lstm_out, _ = self.lstm(x)

        # Use last time step's output for prediction
        output = self.fc(lstm_out[:, -1, :])

        return output

In [59]:
company_files = [f.split(".csv")[0] for f in os.listdir("data") if f.endswith(".csv")]  # 2501 companies
print(len(company_files))
company_to_id = {comp: idx for idx, comp in enumerate(company_files)}
print(company_to_id)

# Hyperparameters
num_epochs = 200
batch_size = 64

# Model parameters
feature_input_size = len(FEATURE_COLUMNS)  # This should match actual features before embedding
embedding_dim = 24  # sqrt(num_companies) 547 companies
input_size = 17  # feature_input_size(25) + embedding_dim(50) = 75 17

hidden_size = 64
num_layers = 2
output_size = 1

547
{'CSCO': 0, 'PRI': 1, 'UAL': 2, 'TROW': 3, 'ISRG': 4, 'NVR': 5, 'LECO': 6, 'TPR': 7, 'DVN': 8, 'CE': 9, 'NBIX': 10, 'BA': 11, 'VRTX': 12, 'AFG': 13, 'BRK.B': 14, 'GILD': 15, 'MDU': 16, 'MKL': 17, 'HUN': 18, 'TER': 19, 'PODD': 20, 'V': 21, 'QRVO': 22, 'CIEN': 23, 'A': 24, 'MO': 25, 'SWKS': 26, 'MCHP': 27, 'CDNS': 28, 'MSCI': 29, 'CHTR': 30, 'EIX': 31, 'BBY': 32, 'PEGA': 33, 'WBA': 34, 'LVS': 35, 'HCA': 36, 'AJG': 37, 'DTE': 38, 'AN': 39, 'C': 40, 'GWRE': 41, 'BRK.A': 42, 'FAF': 43, 'T': 44, 'CF': 45, 'MGM': 46, 'JLL': 47, 'HUM': 48, 'OSK': 49, 'DAR': 50, 'DLB': 51, 'WU': 52, 'APH': 53, 'ARW': 54, 'SYY': 55, 'MSI': 56, 'BC': 57, 'FCX': 58, 'ADM': 59, 'LH': 60, 'GGG': 61, 'WLK': 62, 'LNT': 63, 'LNC': 64, 'PSX': 65, 'PPC': 66, 'GPN': 67, 'HUBB': 68, 'PPG': 69, 'TECH': 70, 'IQV': 71, 'LNG': 72, 'NOV': 73, 'TNET': 74, 'HAL': 75, 'STZ': 76, 'FLS': 77, 'DXC': 78, 'MSM': 79, 'ADI': 80, 'F': 81, 'HOG': 82, 'ADBE': 83, 'CHH': 84, 'DCI': 85, 'STLD': 86, 'CPRT': 87, 'TDG': 88, 'TFX': 89, 'XPO':

In [None]:
# Combine data from all companies
all_X_train = []
all_Y_train = []
all_company_ids = []

for company in company_files:
    company_id = company_to_id[company]
    print(company_id, ": ", company)

    df_train, df_test = load_company_data(company)

    if df_train is None:
        continue

    X_train_tensor, Y_train_tensor, company_train_tensor = create_lstm_sequences(df_train, 5, company_id)

    all_X_train.append(torch.tensor(X_train_tensor, dtype=torch.float32))
    all_Y_train.append(torch.tensor(Y_train_tensor, dtype=torch.float32).unsqueeze(1))
    all_company_ids.append(torch.tensor(company_train_tensor, dtype=torch.long))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = StockLSTM(input_size, hidden_size, num_layers, output_size, len(company_files), embedding_dim).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Concatenate all data
X_train_tensor = torch.cat(all_X_train, dim=0)
Y_train_tensor = torch.cat(all_Y_train, dim=0)
company_train_tensor = torch.cat(all_company_ids, dim=0)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, company_train_tensor, Y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training
train_losses = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_company, batch_Y in train_dataloader:
        batch_X, batch_company, batch_Y = batch_X.to(device), batch_company.to(device), batch_Y.to(device)

        optimizer.zero_grad()
        predictions = model(batch_X, batch_company).squeeze()
        loss = criterion(predictions, batch_Y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.6f}")

# Plot Loss Curve
plt.figure(figsize=(8, 5))
plt.plot(range(1, num_epochs + 1), train_losses, marker='o', linestyle='-')
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.title("Training Loss Over Epochs")
plt.grid(True)
plt.show()


In [None]:
# CSCO
torch.save(model.state_dict(), "trained_stock_lstm.pth")
print("\nModel saved!")

# Testing (Predict Dec 2023 & Compare to Actual)

test_company = input("\nEnter company name for testing: ")
df_train, df_test = load_company_data(test_company)

print(df_test)

# df_test is already processed because of load_company_data

if df_test is None or df_test.empty:
    print(f" No test data available for {test_company}.")
else:
    latest_financials = df_test.iloc[-1].to_dict()

    input_df = pd.DataFrame([latest_financials], columns=FEATURE_COLUMNS)
    print(input_df)

    input_df = input_df.drop(columns=["stock_ticker", "comp_name", "ni_me_change", "fcf_me_change",
    "prev_intrinsic_value","next_intrinsic_value","prev_stock_exret","next_stock_exret","prev_be_me",
    "next_be_me","prev_ni_me","next_ni_me","prev_fcf_me","next_fcf_me","prev_betadown_252d",
    "next_betadown_252d","prev_ni_ar1","next_ni_ar1","prev_z_score","next_z_score","prev_ebit_sale",
    "next_ebit_sale","prev_at_turnover","next_at_turnover","prev_market_equity","next_market_equity"
    ,"prev_bvps","next_bvps"
    ], errors="ignore")
    
    input_df = input_df.fillna(0)  # in case
    # input_df = normalize_features(input_df)
    print(input_df)
    
    input_tensor = torch.tensor(input_df.values, dtype=torch.float32).unsqueeze(0).to(device)

    print(input_tensor.shape)
    
    test_company_id = torch.tensor([company_to_id[test_company]], dtype=torch.long).to(device)
    
    model.eval()
    with torch.no_grad():
        predicted_return = model(input_tensor, test_company_id).item()

    actual_return = df_test["stock_exret"].values[0]

    print(f"\nPredicted: {predicted_return:.4f} | Actual: {actual_return:.4f}")