In [1]:
import itertools

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Read the four CSV files into DataFrames; the target names are referenced by later cells.
df, prices, sec, psa = (
    pd.read_csv(csv_name)
    for csv_name in (
        'fundamentals.csv',
        'prices.csv',
        'securities.csv',
        'prices-split-adjusted.csv',
    )
)

In [3]:
# Pair each display name with its DataFrame so the tables can be iterated together.
datasets = dict(
    zip(
        ('fundamentals', 'prices', 'securities', 'prices-split-adjusted'),
        (df, prices, sec, psa),
    )
)

In [4]:
# Function to explore datasets
def explore_data(datasets):
    """Print a short exploration report for every table in ``datasets``.

    Args:
        datasets: mapping of display name -> table object. Each table must
            provide ``shape``, ``head()``, ``isnull().sum()`` and
            ``describe()`` (pandas DataFrames do).

    Returns:
        None. Everything is written to stdout, one report per mapping entry,
        in the mapping's iteration order.
    """
    for label, frame in datasets.items():
        # Banner and dimensions come first.
        print(f"\n=== Exploring {label} Dataset ===\n")
        print(f"Shape: {frame.shape}")

        # Each section is a heading plus a zero-argument callable; the callable
        # is only evaluated after its heading has been printed.
        report_sections = (
            ("\nFirst 5 rows:", frame.head),
            ("\nMissing Values:", lambda: frame.isnull().sum()),
            ("\nSummary Statistics:", frame.describe),
        )
        for heading, compute in report_sections:
            print(heading)
            print(compute())

# Print the exploration report for every loaded dataset (captured output follows this cell)
explore_data(datasets)


=== Exploring fundamentals Dataset ===

Shape: (1781, 79)

First 5 rows:
   Unnamed: 0 Ticker Symbol Period Ending  Accounts Payable  \
0           0           AAL    2012-12-31      3.068000e+09   
1           1           AAL    2013-12-31      4.975000e+09   
2           2           AAL    2014-12-31      4.668000e+09   
3           3           AAL    2015-12-31      5.102000e+09   
4           4           AAP    2012-12-29      2.409453e+09   

   Accounts Receivable  Add'l income/expense items  After Tax ROE  \
0         -222000000.0               -1.961000e+09           23.0   
1          -93000000.0               -2.723000e+09           67.0   
2         -160000000.0               -1.500000e+08          143.0   
3          352000000.0               -7.080000e+08          135.0   
4          -89482000.0                6.000000e+05           32.0   

   Capital Expenditures  Capital Surplus  Cash Ratio  ...  \
0         -1.888000e+09     4.695000e+09        53.0  ...   
1         

In [5]:
# Show the column index of each table, in the order: fundamentals, prices,
# split-adjusted prices, securities.
for frame in (df, prices, psa, sec):
    print(frame.columns)

Index(['Unnamed: 0', 'Ticker Symbol', 'Period Ending', 'Accounts Payable',
       'Accounts Receivable', 'Add'l income/expense items', 'After Tax ROE',
       'Capital Expenditures', 'Capital Surplus', 'Cash Ratio',
       'Cash and Cash Equivalents', 'Changes in Inventories', 'Common Stocks',
       'Cost of Revenue', 'Current Ratio', 'Deferred Asset Charges',
       'Deferred Liability Charges', 'Depreciation',
       'Earnings Before Interest and Tax', 'Earnings Before Tax',
       'Effect of Exchange Rate',
       'Equity Earnings/Loss Unconsolidated Subsidiary', 'Fixed Assets',
       'Goodwill', 'Gross Margin', 'Gross Profit', 'Income Tax',
       'Intangible Assets', 'Interest Expense', 'Inventory', 'Investments',
       'Liabilities', 'Long-Term Debt', 'Long-Term Investments',
       'Minority Interest', 'Misc. Stocks', 'Net Borrowings', 'Net Cash Flow',
       'Net Cash Flow-Operating', 'Net Cash Flows-Financing',
       'Net Cash Flows-Investing', 'Net Income', 'Net Income Ad

In [8]:
# Define the DNN model
class SimpleDNN(nn.Module):
    """Fully connected regression network with two ReLU hidden layers.

    Args:
        input_size: number of input features per sample.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2):
        super().__init__()
        # Layers are created in forward order; the attribute names are also
        # the prefixes of the parameter keys in ``state_dict()``.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, 1)  # single-value regression head

    def forward(self, x):
        """Map ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 and return the result."""
        hidden1 = self.relu1(self.fc1(x))
        hidden2 = self.relu2(self.fc2(hidden1))
        return self.fc3(hidden2)

# Example usage (input_size must equal the number of feature columns):
# input_size = 4  # number of features
# hidden_size1 = 64
# hidden_size2 = 32
# model = SimpleDNN(input_size, hidden_size1, hidden_size2)
# print(model)  # prints the architecture

In [9]:
# Select features and target variable
features = [
    'Accounts Payable',
    'Accounts Receivable',
    'Capital Expenditures',
    'Cash Ratio',
    'Current Ratio',
    'Gross Margin',
    'Inventory',
    'Operating Income',
]  # Example features
target = 'Net Income'  # Example target

# Drop rows with missing values in the selected features or target.
# NOTE: this rebinds `df`, so every later cell sees the filtered frame rather
# than the raw fundamentals table loaded at the top of the notebook.
df = df.dropna(subset=features + [target])

X = df[features]
y = df[target]

# Split BEFORE scaling. Fitting the scaler on all rows would let the test
# rows influence the mean/std used to transform the training rows (leakage).
# test_size and random_state are the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole-dataset views, kept under their original names for any later cell
# that uses them. They are now scaled with the training-set statistics.
X_scaled = pd.DataFrame(scaler.transform(X.values), columns=features)
X_np = X_scaled.values
y_np = y.values

# NOTE(review): the target is left in its raw units, so MSE values are in
# squared target units and may be very large - consider scaling `y` as well
# (TODO confirm with the notebook owner).

# Convert to PyTorch tensors; targets become column vectors of shape (n, 1)
# to match the model's single output.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)


In [10]:
# 2. --- Grid Search Implementation ---

# Define the DNN model (a duplicate of the SimpleDNN class from cell In [8], repeated so this section is self-contained)
class SimpleDNN(nn.Module):
    """Fully connected regression network with two ReLU hidden layers.

    Args:
        input_size: number of input features per sample.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2):
        super().__init__()
        # Layers are created in forward order; the attribute names are also
        # the prefixes of the parameter keys in ``state_dict()``.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, 1)  # single-value regression head

    def forward(self, x):
        """Map ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 and return the result."""
        hidden1 = self.relu1(self.fc1(x))
        hidden2 = self.relu2(self.fc2(hidden1))
        return self.fc3(hidden2)

In [12]:
def train_and_evaluate(model, optimizer, loss_fn, X_train, y_train, X_test, y_test, num_epochs=10, batch_size=32, verbose=True):
    """Train ``model`` on sequential mini-batches, then return its loss on the test data.

    Batches are taken in storage order (no shuffling): rows ``0..batch_size-1``,
    then the next ``batch_size`` rows, and so on; the final batch may be shorter.

    Args:
        model: ``nn.Module`` to optimise in place.
        optimizer: optimizer already bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        X_train, y_train: training inputs/targets; must support ``len()`` and
            slicing (this notebook passes float32 tensors).
        X_test, y_test: evaluation inputs/targets, passed to the model in one go.
        num_epochs: number of full passes over the training data.
        batch_size: maximum number of rows per optimisation step; must be >= 1.
        verbose: when True (default) print the start/epoch/finish progress lines.

    Returns:
        float: ``loss_fn`` evaluated on the test data after training.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.

    Side effects:
        Updates ``model``'s parameters and leaves the model in eval mode.
    """
    if batch_size < 1:
        # 0 used to raise ZeroDivisionError and negative values silently
        # skipped training; fail loudly with a clear message instead.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if verbose:
        print("Starting train_and_evaluate")

    # Ensure training-mode behaviour (dropout, batch-norm) even when the caller
    # hands over a model that was previously switched to eval mode.
    model.train()

    num_samples = len(X_train)  # loop-invariant, computed once
    for epoch in range(num_epochs):
        if verbose:
            print(f"Epoch: {epoch}")
        for start in range(0, num_samples, batch_size):
            # Slicing past the end is clamped, so the last batch is just shorter.
            end = start + batch_size
            X_batch = X_train[start:end]
            y_batch = y_train[start:end]

            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)

            # Clear stale gradients, backpropagate, and take one optimiser step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    if verbose:
        print("Finishing train_and_evaluate")

    # Evaluation: inference mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        test_loss = loss_fn(test_outputs, y_test).item()
    return test_loss

# Define the hyperparameter grid
param_grid = {
    'hidden_size1': [32, 64],
    'hidden_size2': [16, 32],
    'learning_rate': [0.001, 0.01]
}

# Determine input size (must be done AFTER data loading)
input_size = X_train.shape[1]

# Model selection must not look at the test set, otherwise the reported loss
# is optimistically biased. Carve a validation slice out of the training
# tensors and keep X_test / y_test for one final measurement.
SEED = 42
VAL_FRACTION = 0.2
split_rng = torch.Generator().manual_seed(SEED)
shuffled_idx = torch.randperm(len(X_train), generator=split_rng)
n_val = max(1, int(len(X_train) * VAL_FRACTION))
val_idx, fit_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]
X_fit, y_fit = X_train[fit_idx], y_train[fit_idx]
X_val, y_val = X_train[val_idx], y_train[val_idx]

best_loss = float('inf')
best_params = None
best_model = None
loss_fn = nn.MSELoss()  # stateless, so one instance serves every run

# Grid search loop: itertools.product visits the combinations in the same
# order as three nested loops (learning_rate varies fastest).
for hidden_size1, hidden_size2, learning_rate in itertools.product(
    param_grid['hidden_size1'],
    param_grid['hidden_size2'],
    param_grid['learning_rate'],
):
    print(f"Training with hidden_size1={hidden_size1}, hidden_size2 = {hidden_size2} learning_rate={learning_rate}")

    # Re-seed before building each model so weight initialisation is
    # reproducible across notebook runs.
    torch.manual_seed(SEED)
    model = SimpleDNN(input_size, hidden_size1, hidden_size2)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train on the fit split and score on the validation split.
    val_loss = train_and_evaluate(model, optimizer, loss_fn, X_fit, y_fit, X_val, y_val)

    # Check if current model is the best
    if val_loss < best_loss:
        best_loss = val_loss
        best_params = {'hidden_size1': hidden_size1, 'hidden_size2': hidden_size2, 'learning_rate': learning_rate}
        best_model = model

print(f"Best parameters: {best_params}, Best validation loss: {best_loss}")

# Touch the test set exactly once, with the configuration chosen above.
# best_model stays None only if no run produced a loss below infinity
# (e.g. every loss was NaN).
if best_model is not None:
    best_model.eval()
    with torch.no_grad():
        test_loss = loss_fn(best_model(X_test), y_test).item()
    print(f"Test loss of best model: {test_loss}")

Training with hidden_size1=32, hidden_size2 = 16 learning_rate=0.001
Starting train_and_evaluate
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Finishing train_and_evaluate
Training with hidden_size1=32, hidden_size2 = 16 learning_rate=0.01
Starting train_and_evaluate
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Finishing train_and_evaluate
Training with hidden_size1=32, hidden_size2 = 32 learning_rate=0.001
Starting train_and_evaluate
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Finishing train_and_evaluate
Training with hidden_size1=32, hidden_size2 = 32 learning_rate=0.01
Starting train_and_evaluate
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Finishing train_and_evaluate
Training with hidden_size1=64, hidden_size2 = 16 learning_rate=0.001
Starting train_and_evaluate
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch