In [None]:
import pandas as pd

In [None]:
# Load the dataset from a CSV file addressed relative to the working directory.
df = pd.read_csv('sp500-expanded.csv')
# Show the column labels of the loaded frame as the cell output.
df.columns

In [None]:
# --- Preprocessing -----------------------------------------------------------
# Drop the index/identifier columns and every row containing a missing value.
# `df` is re-bound instead of mutated in place so the frame object loaded by
# the previous cell is left untouched.
df = df.drop(columns=['Unnamed: 0', 'ticker']).dropna()

# One-hot encode the categorical `activity_domain` column and swap it for the
# resulting indicator columns.
one_hot = pd.get_dummies(df['activity_domain'], prefix='activity_domain')
df = pd.concat([df.drop(columns='activity_domain'), one_hot], axis=1)
columns_to_convert = [col for col in df.columns if 'activity_domain' in col]

# Monetary columns. Only needed if these columns are ever scaled differently
# from the rest (e.g. z-score); the min-max scaling below does not use it.
normalization_columns = [
    'market_cap',
    'net_revenue_y1',
    'net_revenue_y2',
    'net_revenue_y3',
    'net_revenue_y4',
    'net_income_y1',
    'net_income_y2',
    'net_income_y3',
    'net_income_y4',
    'free_cash_flow',
    'stock_price',
]

# get_dummies may emit boolean indicators; store them as 0/1 integers.
for column in columns_to_convert:
    df[column] = df[column].astype(int)

# Large cap = stocks with 10B$ or higher market cap.
# The threshold is 10 * 10**9 with an inclusive comparison so that the filter
# matches this definition (it was previously `> 10**9`, i.e. 1B$, strict).
# NOTE(review): assumes `market_cap` is expressed in dollars - confirm.
LARGE_CAP_THRESHOLD = 10 * 10**9
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[df['market_cap'] >= LARGE_CAP_THRESHOLD].copy()

# Min-max scale every non-indicator column.
indicator_columns = set(columns_to_convert)
for column in df.columns:
    if column in indicator_columns:
        continue
    column_min = df[column].min()
    column_range = df[column].max() - column_min
    if column_range == 0:
        # A constant column would divide by zero and become all-NaN; map it
        # to 0 instead so it cannot poison the training loss.
        df[column] = 0.0
    else:
        df[column] = (df[column] - column_min) / column_range

In [None]:
# Row count of `df` at this point in the notebook (shown as the cell output).
len(df)

# Look at the data distributions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_column_distributions(df, bins=50, xlim=(-1, 1)):
    """
    Plot the distribution of each column in a dataframe, one subplot per column.

    Columns whose dtype kind is bool, int, float or complex are drawn as
    histograms; every other column is treated as categorical and drawn as a
    bar chart of its value counts.

    :param df: Input pandas DataFrame
    :param bins: Number of histogram bins used for numeric columns
    :param xlim: ``(left, right)`` x-axis limits applied to the histograms,
        or ``None`` to keep matplotlib's automatic limits. Bar charts always
        keep their automatic limits so that no category is cut off.
    :return: None (the figure is displayed with ``plt.show()``)
    """

    # Number of columns in the dataframe
    num_cols = df.shape[1]
    if num_cols == 0:
        # Nothing to draw, and plt.subplots() rejects nrows=0.
        return

    # squeeze=False always yields a 2-D array of Axes; without it a
    # single-column frame would return one bare Axes that cannot be iterated.
    fig, axes = plt.subplots(nrows=num_cols, squeeze=False, figsize=(8, 4*num_cols))

    # Iterating through each column to plot
    for ax, (col_name, col_data) in zip(axes[:, 0], df.items()):
        # Use the Series yielded by items(): df[col_name] would be a DataFrame
        # (without .dtype) if the frame had duplicate column labels.
        if col_data.dtype.kind in 'bifc':  # Numerical types
            col_data.hist(ax=ax, bins=bins, edgecolor='black')
            ax.set_ylabel('Frequency')
            if xlim is not None:
                ax.set_xlim(left=xlim[0], right=xlim[1])
        else:
            # If non-numeric, we consider it categorical and use a bar plot
            col_data.value_counts().plot(kind='bar', ax=ax)
            ax.set_ylabel('Count')
        ax.set_title(f"Distribution of {col_name}")
        ax.set_xlabel(col_name)
    plt.tight_layout()
    plt.show()

# Plot the distribution of every column of `df`.
plot_column_distributions(df)


# NETWORK

In [None]:
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# NOTE(review): METRICS_COUNT is not referenced anywhere else in the visible
# notebook; presumably the expected number of input features - confirm or remove.
METRICS_COUNT = 51

class Autoencodertemp(torch.nn.Module):
    """Fully connected autoencoder whose layer widths shrink by a fixed factor.

    The encoder is a stack of ``Linear`` + ``ReLU`` layers whose widths are
    ``input_size // reduction**k`` for increasing ``k``, followed by one
    ``Linear`` layer that projects to ``latent_repr_size``. The decoder
    mirrors the encoder's linear layers in reverse order, with a ``ReLU``
    between layers and no activation after the final (output) layer.
    """

    def __init__(self, input_size, latent_repr_size) -> None:
        """Build the encoder and the mirrored decoder.

        :param input_size: Number of input features (>= 1)
        :param latent_repr_size: Width of the latent representation (>= 1)
        :raises ValueError: if either size is smaller than 1
        """
        super().__init__()
        if input_size < 1 or latent_repr_size < 1:
            # A negative latent size would otherwise make the loop below run
            # forever, since the shrinking width never drops under it.
            raise ValueError(
                f"input_size and latent_repr_size must be >= 1, "
                f"got {input_size} and {latent_repr_size}"
            )
        self.reduction = 2

        # Encoder: keep shrinking while the next width is still larger than
        # the latent size, then project straight to the latent size.
        encoder_modules = []
        depth = 0
        while input_size // (self.reduction ** (depth + 1)) > latent_repr_size:
            i_size = input_size // (self.reduction ** depth)
            o_size = input_size // (self.reduction ** (depth + 1))
            encoder_modules.append(torch.nn.Linear(i_size, o_size))
            encoder_modules.append(torch.nn.ReLU())
            depth += 1
        encoder_modules.append(torch.nn.Linear(input_size // (self.reduction ** depth), latent_repr_size))
        self.encoder = torch.nn.Sequential(*encoder_modules)

        encoder_linears = [layer for layer in self.encoder if isinstance(layer, torch.nn.Linear)]
        print([layer.weight.shape for layer in encoder_linears])

        # Decoder: one Linear per encoder Linear, in reverse order and with
        # in/out features swapped. The ReLU is skipped after the last layer so
        # the reconstruction is not clamped to non-negative values. (The old
        # guard compared out_features with input_size and therefore never fired.)
        decoder_modules = []
        last_position = len(encoder_linears) - 1
        for position, layer in enumerate(reversed(encoder_linears)):
            decoder_modules.append(torch.nn.Linear(layer.out_features, layer.in_features))
            if position < last_position:
                decoder_modules.append(torch.nn.ReLU())
        self.decoder = torch.nn.Sequential(*decoder_modules)
        print([layer.weight.shape for layer in self.decoder if isinstance(layer, torch.nn.Linear)])

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def fit(self, data, loss_f, optim, n_epochs=20, batch_size=32):
        """Train the autoencoder to reconstruct its input.

        :param data: Dataset yielding ``(input, target)`` pairs; the target is
            ignored because the input itself is the reconstruction target
        :param loss_f: Callable ``loss_f(reconstructed, original)`` returning a scalar tensor
        :param optim: Optimizer already bound to this model's parameters
        :param n_epochs: Number of passes over ``data``
        :param batch_size: Mini-batch size
        :return: List with the sample-weighted mean loss of each epoch
        """
        data_loader = DataLoader(data, batch_size=batch_size, shuffle=False)
        history = []
        for epoch in range(n_epochs):
            epoch_loss = 0
            for batch_data, _ in data_loader:
                optim.zero_grad()
                reconstructed = self(batch_data)
                loss = loss_f(reconstructed, batch_data)
                loss.backward()
                optim.step()
                # Weight by batch size so a smaller final batch is not over-counted.
                epoch_loss += loss.item() * len(batch_data)
            epoch_loss /= len(data)
            history.append(epoch_loss)
            print(f"Epoch {epoch} loss: {epoch_loss}")
        print("Training finished")
        return history

    def train(self, data=True, loss_f=None, optim=None, n_epochs=20, batch_size=32):
        """Switch train/eval mode, or run the training loop (legacy call form).

        This name overrides ``torch.nn.Module.train(mode)``, which ``eval()``
        and parent modules call with a bool. To keep those calls working:

        * ``train()`` / ``train(True)`` / ``train(False)`` behave exactly like
          ``torch.nn.Module.train`` and return ``self``;
        * ``train(dataset, loss_f, optim, ...)`` runs :meth:`fit` and returns
          ``None``, as before.

        :raises TypeError: if a dataset is given without ``loss_f`` and ``optim``
        """
        if isinstance(data, bool):
            return super().train(data)
        if loss_f is None or optim is None:
            raise TypeError("train() requires loss_f and optim when called with a dataset")
        self.fit(data, loss_f, optim, n_epochs=n_epochs, batch_size=batch_size)

    def encoder_pass(self, data):
        """Return the latent representation of ``data``."""
        return self.encoder(data)

    def test(self, data):
        """Return the reconstruction of ``data`` without tracking gradients."""
        with torch.no_grad():
            reconstructed = self.forward(data)
            return reconstructed

# Prepare data for training/testing

In [None]:
from sklearn.model_selection import train_test_split


# Split the data into training (80%) and validation (20%) sets; the fixed
# random_state makes the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Training

In [None]:
import torch.optim as optim
import torch
import torch.utils.data as data_utils
from hyperopt import hp, fmin, tpe, space_eval, Trials

def hyper_tune(params):
    """Hyperopt objective: train an autoencoder with ``params`` and score it.

    The model is trained on the module-level ``train_data`` dataset and scored
    on the held-out ``val_df`` frame. The validation loss is returned because
    minimising the training loss would systematically favour more epochs and
    weaker regularisation, regardless of how well the model generalises.

    :param params: Dict with keys ``latent_dim``, ``lr``, ``weight_decay``,
        ``loss`` (a callable loss module) and ``epochs``
    :return: Reconstruction loss on the validation set (float)
    """
    # Size the model from the tensors actually trained on rather than from
    # the global dataframe.
    features = train_data.tensors[0]
    model = Autoencodertemp(features.shape[1], params['latent_dim'])
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    loss_f = params['loss']
    data_loader = DataLoader(train_data, batch_size=32, shuffle=False)
    for epoch in range(params['epochs']):
        epoch_loss = 0
        for batch_data, _ in data_loader:
            optimizer.zero_grad()
            reconstructed = model(batch_data)
            loss = loss_f(reconstructed, batch_data)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch is not over-counted.
            epoch_loss += loss.item() * len(batch_data)
        epoch_loss /= len(features)
        print(f"Epoch {epoch} loss: {epoch_loss}")
    print("Training finished")

    # Score on held-out data. torch.no_grad() is enough here: the model has no
    # layers whose behaviour differs between train and eval mode.
    val_tensor = torch.tensor(val_df.values, dtype=torch.float32)
    with torch.no_grad():
        val_loss = loss_f(model(val_tensor), val_tensor).item()
    print(f"Validation loss: {val_loss}")
    return val_loss

# Hyperparameter search space.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
# lr lies in [e^-5, 1] and weight_decay in [e^-8, e^-4].
space = {
    "lr": hp.loguniform("lr", -5, 0),
    "latent_dim": hp.choice("latent_dim", [2, 3, 4, 5, 6, 7, 8, 9, 10]),
    "loss": hp.choice("loss", [nn.MSELoss(), nn.L1Loss()]),
    "weight_decay": hp.loguniform("weight_decay", -8, -4),
    "epochs": hp.choice("epochs", list(range(1, 16))),
}

# The autoencoder reconstructs its input, so the same tensor serves as both
# input and target of the dataset.
train_tensor = torch.tensor(train_df.values, dtype=torch.float32)
train_data = data_utils.TensorDataset(train_tensor, train_tensor)

# Keep the Trials object so the per-trial results can be inspected afterwards.
trials = Trials()
best = fmin(fn=hyper_tune, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# fmin reports hp.choice parameters as indices into their option lists;
# space_eval maps them back to the actual values (latent size, loss, epochs).
best_params = space_eval(space, best)
print(best_params)