In [None]:
import yfinance as yf
import metrics.fundamental_analysis as fa
import pandas as pd
import numpy as np
import importlib
from dataclasses import dataclass, asdict
from pprint import pprint
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
# S&P 500 tickers
sp500_tickers = np.array(pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0].loc[:, 'Symbol'].tolist())
# nasdaq_composite = np.array(pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0].loc[:, 'Symbol'].tolist())
print(len(sp500_tickers))

In [None]:
t1 = yf.Ticker(sp500_tickers[0])
print(t1.financials.loc['Net Income'])
print(t1.balance_sheet.loc['Total Assets'])
print((t1.financials.loc['Net Income'] / t1.balance_sheet.loc['Total Assets']).tolist())
print(t1.financials.loc['Net Income'].tolist())
print(t1.balance_sheet.loc['Total Assets'].tolist())


In [None]:
importlib.reload(fa)
metrics = []
for sptick in sp500_tickers:
    try:
        metrics.append(fa.get_fundamental_analysis_metrics(sptick))
    except Exception as e:
        print(e)
        print(f'Failed to get metrics for {sptick}')
        continue
sp500df = pd.DataFrame.from_records([asdict(s) for s in metrics])
sp500df.head()
sp500df.to_csv('sp500.csv')

In [None]:
for y_delta in range(4):
    roi = 'return_on_investments_y' + str(y_delta + 1)
    related_metrics = [roi, 'return_on_assets_y' + str(y_delta + 1), 'return_on_equity_y' + str(y_delta + 1)]
    related_metrics_corr_df = sp500df[related_metrics].corr()
    num_related = len(related_metrics) - 1
    # tested in cli, works
    sp500df.loc[sp500df[roi] == -1, roi] = sum([related_metrics_corr_df.loc[roi, m] * sp500df.loc[sp500df[roi] == -1, m] for m in related_metrics[1:]]) / num_related

sp500df.to_csv('s_and_p500df.csv')

In [None]:
num_columns = len(sp500df.columns)
clean_df = sp500df[sp500df[sp500df == -1].count(axis=1) < num_columns / 4].reset_index()
clean_df.to_csv('clean_sp500.csv')
very_clean_df = sp500df[sp500df[sp500df == -1].count(axis=1) == 0].reset_index()
very_clean_df.to_csv('very_clean_sp500.csv')

In [None]:
aapl  = yf.Ticker('AAPL')
mmm = yf.Ticker('MMM')
amzn = yf.Ticker('AMZN')
print("========AAPL========")
aapl_keys = list(aapl.balance_sheet['2022-09-30'].keys())

In [None]:
print("========MMM========")
mmm_keys = mmm.balance_sheet['2022-12-31'].keys().tolist()
print(len(mmm_keys))

In [None]:

print("======AMZN======")
amzn_keys = list(amzn.balance_sheet['2022-12-31'].keys())
print(len(amzn_keys))

In [None]:
set(mmm_keys) & set(amzn_keys) & set(aapl_keys)

In [None]:
bal_keys = []
for ticker in sp500_tickers[:100]:
    tick = yf.Ticker(ticker)
    try:
        current_balance_sheet = tick.balance_sheet[tick.balance_sheet.keys()[0]].keys()
    except Exception as e:
        continue
    if len(bal_keys) == 0:
        bal_keys = current_balance_sheet
    else:
        bal_keys = list(set(bal_keys) & set(current_balance_sheet))
pprint(bal_keys)
# See common keys inside list of lists bal_keys


In [None]:
financials_keys = []
for ticker in sp500_tickers[:100]:
    tick = yf.Ticker(ticker)
    try:
        current_financial = tick.financials[tick.financials.keys()[0]].keys()
    except Exception as e:
        continue
    if len(financials_keys) == 0:
        financials_keys = current_financial
    else:
        financials_keys = list(set(financials_keys) & set(current_financial))
pprint(financials_keys)
# See common keys inside list of lists bal_keys

In [None]:
income_stmt_keys = []
for ticker in sp500_tickers[:100]:
    tick = yf.Ticker(ticker)
    try:
        current_income_stmt = tick.income_stmt[tick.income_stmt.keys()[0]].keys()
    except Exception as e:
        continue
    if len(financials_keys) == 0:
        income_stmt_keys = current_income_stmt
    else:
        income_stmt_keys = list(set(income_stmt_keys) & set(current_income_stmt))
pprint(financials_keys)
# See common keys inside list of lists bal_keys

# Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('clean_sp500.csv')

In [None]:
df.columns

In [None]:
#df.isna().sum()
count_minus_ones = df.apply(lambda col: (col == -1).sum())

# Create a new DataFrame with the count of -1 values for each column
count_df = pd.DataFrame({'Column': count_minus_ones.index, 'Count of -1': count_minus_ones.values})
count_df


## Activity Domain

In [None]:
df['activity_domain'].value_counts()

In [None]:
print("Missing values:" + str(df['activity_domain'].isna().sum()))
print(df['ticker'].loc[df['activity_domain'].isna()])

### Activity domain is na for:

Berkeley Class B - should be "Financial Services" (Is there another BRK already?)

Brown-Forman Corporation Class B - should be "Consumer Cyclical" (Is there another BF already?)
 
Caterpillar Inc. - should be "Industrials"

In [None]:
df.at[64, 'activity_domain'] = "Financial Services"
df.at[80, 'activity_domain'] = "Consumer Cyclical"
df.at[92, 'activity_domain'] = "Industrials"

In [None]:
one_hot = pd.get_dummies(df['activity_domain'], prefix='activity_domain')
df = df.drop('activity_domain', axis=1)
df = pd.concat([df, one_hot], axis=1)

In [None]:
columns_to_convert = [col for col in df.columns if 'activity_domain' in col]
for column in columns_to_convert:
    df[column] = df[column].astype(int)

# Market Cap

In [None]:
print('Missing values: ' + str(df['market_cap'].loc[df['market_cap'] == -1].count()))

# Net Revenue

In [None]:
# IGNORE

# Net Income

In [None]:
print('Missing values: ' + str(df['net_income'].isna().sum()))

# P/E

In [None]:
print('Missing values: ' + str(df['price_earnings_ratio'].loc[df['price_earnings_ratio'] == -1].count()))

In [None]:
df = pd.read_csv('s_and_p500metrics.csv')
roe = []

def get_roe(ticker: yf.Ticker) -> float:
    metrics = {}
    fa.check_metric_exists_and_fill_out(ticker, metrics, 'return_on_equity', 
        lambda: (ticker.financials.loc['Net Income'] / ticker.balance_sheet.loc['Stockholders Equity']).tolist() + [ticker.info['returnOnEquity']], [-1])

    return metrics['return_on_equity']


for index, row in df.iterrows():
    print(index)
    ticker = yf.Ticker(row['ticker'])
    roe.append(get_roe(ticker))
df['return_on_equity'] = roe
df.to_csv('s_and_p500_v2.csv')

In [None]:
import yfinance as yf

ticker = yf.Ticker("AAPL")
balance_sheet = ticker.balance_sheet

In [None]:
balance_sheet['2022-09-30'].keys()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

df = df.drop(columns=['return_on_investments_y1', 'return_on_investments_y2', 'return_on_investments_y3', 'return_on_investments_y4'])
df = df[~(df == -1).any(axis=1)]

In [None]:
corr_matrix = df.drop(['ticker', 'Unnamed: 0'], axis=1).corr()
sn.heatmap(corr_matrix, annot=True)

# Very Clean

In [None]:
df = pd.read_csv('very_clean_sp500.csv')
df.head()

In [None]:
one_hot = pd.get_dummies(df['activity_domain'], prefix='activity_domain')
df = df.drop('activity_domain', axis=1)
df = pd.concat([df, one_hot], axis=1)

In [None]:
from sklearn.model_selection import train_test_split


# Split the data into training and test sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [86]:
import matplotlib.pyplot as plt
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

METRICS_COUNT = 51

class Autoencodertemp(torch.nn.Module): 
    def __init__(self, input_size, latent_repr_size) -> None:
        super().__init__()
        self.reduction = 2

        encoder_modules = [] 
        depth = -1
        while True:
            depth += 1
            i_size = input_size // (self.reduction ** depth)
            o_size = input_size // (self.reduction ** (depth + 1))
            if o_size <= latent_repr_size:
                break
            encoder_modules.append(torch.nn.Linear(i_size, o_size))
            encoder_modules.append(torch.nn.ReLU())
        encoder_modules.append(torch.nn.Linear(input_size // (self.reduction ** depth), latent_repr_size))
        self.encoder = torch.nn.Sequential(*encoder_modules)

        encoder_shapes = [layer.weight.shape for idx, layer in enumerate(self.encoder) if idx % 2 == 0]
        print(encoder_shapes)

        decoder_modules = [] 
        for i in range(0, len(self.encoder), 2):
            reversed_shape = self.encoder[len(self.encoder) - i - 1].weight.shape
            decoder_modules.append(torch.nn.Linear(reversed_shape[0], reversed_shape[1]))
            if reversed_shape[0] == input_size:
                break
            decoder_modules.append(torch.nn.ReLU())
        self.decoder = torch.nn.Sequential(*decoder_modules)
        decoder_shapes = [layer.weight.shape for idx, layer in enumerate(self.decoder) if idx % 2 == 0]
        print(decoder_shapes)


    def forward(self, x):
        print(x)
        encoded = self.encoder(x)
        print(encoded)
        decoded = self.decoder(encoded)
        return decoded
    
    
    def train(self, data, loss_f, optim, n_epochs=20, batch_size=32):
        data_loader = DataLoader(data, batch_size=batch_size, shuffle=False)
        for epoch in range(n_epochs):
            epoch_loss = 0
            for batch_data, _ in data_loader:
                batch_data.requires_grad = True
                optim.zero_grad()
                print(batch_data.shape)
                reconstructed = self.forward(batch_data)
                loss = loss_f(reconstructed, batch_data)
                loss.backward()
                optim.step()
                epoch_loss += loss.item() * len(batch_data)
            epoch_loss /= len(data.tensors[0])
            print(f"Epoch {epoch} loss: {epoch_loss}")
        print("Training finished")

    def encoder_pass(self, data):
        return self.encoder(data)

    def test(self, data):
        with torch.no_grad():
            reconstructed = self.forward(data)
            return reconstructed

In [101]:
model.encoder_pass(train_data[0:1][0])
#model.forward(train_data[0:1][0])

tensor([[-0.1402, -0.0558,  0.1962, -0.0318,  0.2077,  0.1507,  0.2515,  0.1689,
          0.1741,  0.0211]], grad_fn=<AddmmBackward0>)

In [85]:
import torch.optim as optim
import torch
import torch.utils.data as data_utils

latent_dim = 10
model = Autoencodertemp(len(df.columns), latent_dim)

train_data = torch.tensor(train_df.values, dtype=torch.float32)
train_data = data_utils.TensorDataset(train_data, train_data)
loss = nn.MSELoss()
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

model.train(train_data, loss, optimizer)

[torch.Size([25, 51]), torch.Size([12, 25]), torch.Size([10, 12])]
[torch.Size([12, 10]), torch.Size([25, 12]), torch.Size([51, 25])]


torch.Size([32, 51])
tensor([[-0.1851, -0.1021, -0.0736,  ...,  0.0000,  0.0000,  0.0000],
        [ 6.8144,  4.0180,  4.2392,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2451, -0.4461, -0.4375,  ...,  0.0000,  0.0000,  1.0000],
        ...,
        [-0.2897,  0.5688,  0.5805,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0436, -0.2519, -0.2435,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0952,  0.1236,  0.1992,  ...,  0.0000,  0.0000,  0.0000]],
       requires_grad=True)
tensor([[-1.4019e-01, -5.5782e-02,  1.9616e-01, -3.1771e-02,  2.0774e-01,
          1.5070e-01,  2.5149e-01,  1.6888e-01,  1.7410e-01,  2.1148e-02],
        [-1.5108e-01,  2.3139e-01,  6.1302e-01, -5.5594e-01, -1.3151e-01,
          2.5421e-01,  3.2023e-01,  4.5227e-01,  1.7759e-01,  5.4352e-01],
        [-1.0638e-01, -7.8966e-02,  2.2418e-01, -5.2787e-02,  2.0488e-01,
          1.4200e-01,  2.3455e-01,  1.5166e-01,  1.8583e-01,  3.4621e-02],
        [-8.8359e-02, -8.7791e-02,  2.3212e-01, -4.4217e-02,  1.7159e-0

KeyboardInterrupt: 