In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load each CSV file into its own DataFrame, in the order the names are listed.
# The paths are relative, so the files are looked up in the working directory.
df, prices, sec, psa = (
    pd.read_csv(csv_path)
    for csv_path in (
        'fundamentals.csv',
        'prices.csv',
        'securities.csv',
        'prices-split-adjusted.csv',
    )
)

In [3]:
# Label -> DataFrame lookup so all loaded tables can be processed in one loop.
datasets = {
    'fundamentals': df,
    'prices': prices,
    'securities': sec,
    'prices-split-adjusted': psa,
}

In [4]:
# Exploration helper for a collection of DataFrames
def explore_data(datasets):
    """Print a quick text overview of every DataFrame in ``datasets``.

    For each ``label -> DataFrame`` pair the report contains, in order: a
    banner with the label, the shape, the first five rows, the per-column
    count of missing values, and the ``describe()`` summary statistics.

    Args:
        datasets: Mapping from a display label to a pandas DataFrame.

    Returns:
        None. Everything is written to standard output.
    """
    for label, frame in datasets.items():
        print(f"\n=== Exploring {label} Dataset ===\n")
        print(f"Shape: {frame.shape}")

        # Each section is a heading plus a deferred computation, so nothing is
        # evaluated before its heading has been printed.
        sections = (
            ("\nFirst 5 rows:", lambda: frame.head()),
            ("\nMissing Values:", lambda: frame.isnull().sum()),
            ("\nSummary Statistics:", lambda: frame.describe()),
        )
        for heading, compute in sections:
            print(heading)
            print(compute())

# Print the exploration report (shape, head, missing values, summary stats) for every entry in `datasets`
explore_data(datasets)


=== Exploring fundamentals Dataset ===

Shape: (1781, 79)

First 5 rows:
   Unnamed: 0 Ticker Symbol Period Ending  Accounts Payable  \
0           0           AAL    2012-12-31      3.068000e+09   
1           1           AAL    2013-12-31      4.975000e+09   
2           2           AAL    2014-12-31      4.668000e+09   
3           3           AAL    2015-12-31      5.102000e+09   
4           4           AAP    2012-12-29      2.409453e+09   

   Accounts Receivable  Add'l income/expense items  After Tax ROE  \
0         -222000000.0               -1.961000e+09           23.0   
1          -93000000.0               -2.723000e+09           67.0   
2         -160000000.0               -1.500000e+08          143.0   
3          352000000.0               -7.080000e+08          135.0   
4          -89482000.0                6.000000e+05           32.0   

   Capital Expenditures  Capital Surplus  Cash Ratio  ...  \
0         -1.888000e+09     4.695000e+09        53.0  ...   
1         

In [5]:
# Show the column index of each table, one per line, in the order:
# fundamentals, prices, split-adjusted prices, securities.
print(df.columns, prices.columns, psa.columns, sec.columns, sep="\n")

Index(['Unnamed: 0', 'Ticker Symbol', 'Period Ending', 'Accounts Payable',
       'Accounts Receivable', 'Add'l income/expense items', 'After Tax ROE',
       'Capital Expenditures', 'Capital Surplus', 'Cash Ratio',
       'Cash and Cash Equivalents', 'Changes in Inventories', 'Common Stocks',
       'Cost of Revenue', 'Current Ratio', 'Deferred Asset Charges',
       'Deferred Liability Charges', 'Depreciation',
       'Earnings Before Interest and Tax', 'Earnings Before Tax',
       'Effect of Exchange Rate',
       'Equity Earnings/Loss Unconsolidated Subsidiary', 'Fixed Assets',
       'Goodwill', 'Gross Margin', 'Gross Profit', 'Income Tax',
       'Intangible Assets', 'Interest Expense', 'Inventory', 'Investments',
       'Liabilities', 'Long-Term Debt', 'Long-Term Investments',
       'Minority Interest', 'Misc. Stocks', 'Net Borrowings', 'Net Cash Flow',
       'Net Cash Flow-Operating', 'Net Cash Flows-Financing',
       'Net Cash Flows-Investing', 'Net Income', 'Net Income Ad

In [6]:
import torch
import torch.nn as nn

# Feed-forward regression network
class SimpleDNN(nn.Module):
    """Fully connected network with two hidden ReLU layers and a single output.

    Layer widths: ``input_size -> hidden_size1 -> hidden_size2 -> 1``.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2):
        """Create the three linear layers and the two ReLU activations.

        Args:
            input_size: Number of input features of the first linear layer.
            hidden_size1: Width of the first hidden layer.
            hidden_size2: Width of the second hidden layer.
        """
        super().__init__()
        # First hidden block
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        # Second hidden block
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        # Regression head: one output value per sample
        self.fc3 = nn.Linear(hidden_size2, 1)

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 and return the result."""
        hidden1 = self.relu1(self.fc1(x))
        hidden2 = self.relu2(self.fc2(hidden1))
        return self.fc3(hidden2)

# Example usage (uncomment after choosing the sizes below):
# input_size = 4      # number of features
# hidden_size1 = 64
# hidden_size2 = 32
# model = SimpleDNN(input_size, hidden_size1, hidden_size2)
# print(model)        # prints the architecture

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Assuming 'prices' DataFrame is already loaded
# prices = pd.read_csv("prices.csv") # Replace with your actual loading

# 1. Feature Selection and Target Variable Definition
features = ['open', 'low', 'high', 'volume']  # Example: Choose these as features
target = 'close'

X = prices[features]
y = prices[target]

# 2. Data Splitting
# Split row positions BEFORE computing any statistics, so that neither the
# imputation means nor the scaler ever see the held-out rows (no train/test leakage).
row_positions = np.arange(len(X))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)  # Adjust test_size

# 3. Data Preprocessing
# Handle missing values: fill with the column means of the TRAINING rows only
train_means = X.iloc[train_idx].mean()
X = X.fillna(train_means)  # Or use other imputation methods

# Scale the features: fit on the training rows only, then apply the same
# transformation to every row.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = pd.DataFrame(scaler.transform(X), columns=features)

# Convert to NumPy arrays for PyTorch
X_np = X_scaled.values
y_np = y.values

# Materialise the split chosen above
X_train, X_test = X_np[train_idx], X_np[test_idx]
y_train, y_test = y_np[train_idx], y_np[test_idx]

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)  # Reshape for single output
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)


# 4. Model Definition (PyTorch)
# Same architecture as the SimpleDNN defined in an earlier cell; it was moved
# here from question 2 to avoid global declaration issues.
class SimpleDNN(nn.Module):
    """Two-hidden-layer ReLU network that maps ``input_size`` features to one value."""

    def __init__(self, input_size, hidden_size1, hidden_size2):
        """Build the layers ``input_size -> hidden_size1 -> hidden_size2 -> 1``."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        # Output layer (regression): a single value per sample
        self.fc3 = nn.Linear(hidden_size2, 1)

    def forward(self, x):
        """Apply the layers in definition order and return the final activation."""
        out = x
        for layer in (self.fc1, self.relu1, self.fc2, self.relu2, self.fc3):
            out = layer(out)
        return out

# 5. Grid Search Implementation (Manual)
def train_and_evaluate(model, optimizer, loss_fn, X_train, y_train, X_test, y_test, num_epochs=50, batch_size = 32): #Reduced epochs for GridSearch
    """Train ``model`` with sequential mini-batches and track the loss per epoch.

    Batches are taken in order (no shuffling) by slicing ``X_train`` / ``y_train``
    along the first dimension. After every epoch the model is evaluated on the
    full ``X_test`` / ``y_test`` without gradient tracking.

    Args:
        model: Callable network whose parameters are managed by ``optimizer``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
            The epoch average assumes it returns the *mean* loss of the batch
            (the default reduction of ``nn.MSELoss``).
        X_train, y_train: Training inputs and targets.
        X_test, y_test: Held-out inputs and targets.
        num_epochs: Number of passes over the training data.
        batch_size: Number of samples per mini-batch; must be positive.

    Returns:
        Tuple ``(train_losses, test_losses, final_test_loss)``.
        ``train_losses[k]`` is the sample-weighted mean of the mini-batch losses
        seen during epoch ``k`` (``nan`` if the training set is empty),
        ``test_losses[k]`` is the test loss after epoch ``k``, and
        ``final_test_loss`` is the last test loss (the loss of the untouched
        model when ``num_epochs`` is 0).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    def _test_loss():
        # Evaluation only: no gradients needed.
        with torch.no_grad():
            return loss_fn(model(X_test), y_test).item()

    n_train = len(X_train)
    train_losses = []
    test_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0  # sum of per-sample losses over this epoch

        for i in range(0, n_train, batch_size):
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            # Forward pass
            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by the batch length so a short final batch does not dominate;
            # recording only the last batch's loss gives a noisy, misleading curve.
            running_loss += loss.item() * len(X_batch)

        train_losses.append(running_loss / n_train if n_train else float('nan'))

        # Evaluate on test set
        test_losses.append(_test_loss())

    # With zero epochs there is no recorded test loss yet, so evaluate once.
    final_test_loss = test_losses[-1] if test_losses else _test_loss()
    return train_losses, test_losses, final_test_loss # Return test_loss


input_size = X_train.shape[1]  # Number of features

# Candidate values; every combination (3 x 3 x 3 = 27) is trained from scratch.
param_grid = {
    'hidden_size1': [32, 64, 128],
    'hidden_size2': [16,32, 64],
    'learning_rate': [0.001, 0.01, 0.1],
}

best_loss = float('inf')
best_params = None
best_model = None
# Loss curves of the winning run, kept so they can be inspected or plotted
# later without training again.
best_train_losses = None
best_test_losses = None

# NOTE(review): candidates are ranked by their loss on X_test / y_test, so the
# "validation loss" reported below is really a test loss and is optimistically
# biased as an estimate of generalisation. Carve a separate validation split
# out of the training data if an unbiased test score is needed.

# Outer loop: width of the first hidden layer
for hidden_size1 in param_grid['hidden_size1']:

    # Middle loop: width of the second hidden layer
    for hidden_size2 in param_grid['hidden_size2']:

        # Inner loop: learning rate
        for learning_rate in param_grid['learning_rate']:

            print(f"Training with hidden_size1={hidden_size1}, hidden_size2 = {hidden_size2}, learning_rate={learning_rate}")

            # Re-seed before each run so weight initialisation (and therefore the
            # whole search) is reproducible and independent of the loop order.
            torch.manual_seed(42)

            # Create a new model for each hyperparameter combination
            model = SimpleDNN(input_size, hidden_size1, hidden_size2)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            loss_fn = nn.MSELoss()

            # Train and evaluate the model
            train_losses, test_losses, test_loss = train_and_evaluate(model, optimizer, loss_fn, X_train, y_train, X_test, y_test)

            # A NaN loss (diverged run) never compares as smaller, so it is skipped.
            if test_loss < best_loss:
                best_loss = test_loss
                best_params = {'hidden_size1': hidden_size1, 'hidden_size2' : hidden_size2, 'learning_rate': learning_rate}
                best_model = model # Save the best model
                best_train_losses = train_losses
                best_test_losses = test_losses


print(f"Best parameters: {best_params}, Best validation loss: {best_loss}")

In [None]:
import matplotlib.pyplot as plt

# Visualization (Loss/Epochs) - requires best_model / best_params from Question 3

if best_model is not None:
    # Train a FRESH network with the winning hyperparameters to obtain its loss
    # curves. Two things are deliberately avoided here:
    #   * redefining train_and_evaluate with a different return shape, which
    #     would shadow the grid-search helper and break a re-run of that cell;
    #   * training best_model further, which would change the selected model so
    #     that it no longer corresponds to best_loss.
    input_size = X_train.shape[1]  # Number of features

    torch.manual_seed(42)  # fixed seed so the plotted curves are reproducible
    curve_model = SimpleDNN(input_size, best_params['hidden_size1'], best_params['hidden_size2'])
    optimizer = optim.Adam(curve_model.parameters(), lr=best_params['learning_rate'])
    loss_fn = nn.MSELoss()

    # Train and evaluate with the shared helper; it returns
    # (train_losses, test_losses, final_test_loss).
    train_losses, test_losses, final_test_loss = train_and_evaluate(
        curve_model, optimizer, loss_fn, X_train, y_train, X_test, y_test
    )

    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Test Loss (Best Model)')
    plt.legend()
    plt.show()
else:
    print("No best model found. Run Question 3 first.")