In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
import helper_functions as hf
import Models

# Load Data

In [3]:
# Load the working dataset through the project helper module (imported as `hf`).
# NOTE(review): the data source and schema are decided inside helper_functions.load_data
# and are not visible from this notebook — confirm there before relying on column names.
df = hf.load_data()

# Feature Selection

## Fundamental features

In [4]:
# Add the fundamental feature columns to `df` via the helper.
# NOTE(review): `df` is rebound to the helper's result, so re-running this cell feeds the
# already-augmented frame back into the helper; re-run from the load cell instead.
# NOTE(review): the recorded output echoes the helper's two groupby(...).pct_change() lines,
# which looks like a pandas warning — TODO confirm and resolve it inside helper_functions.
df = hf.create_fundamental_features(df)

  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()
  df["income_growth"] = df.groupby("ticker")["net_income"].pct_change()


## Engineered Features

In [5]:
# Add the engineered feature columns via the helper, then list every column
# as a quick check that the expected feature names are present.
# NOTE(review): `df` is rebound to the helper's result, so this cell is not safe to
# re-run on its own; re-run from the load cell instead.
df = hf.create_engineered_features(df)
df.columns

Index(['index_x', 'ticker', 'simfinid_x', 'date', 'open', 'high', 'low',
       'close', 'adj_close', 'volume', 'dividend', 'shares_outstanding',
       'index_y', 'simfinid_y', 'currency', 'fiscal_year', 'fiscal_period',
       'report_date', 'publish_date', 'restated_date', 'shares_basic',
       'shares_diluted', 'revenue', 'cost_of_revenue', 'gross_profit',
       'operating_expenses', 'selling_general_&_administrative',
       'research_&_development', 'depreciation_&_amortization',
       'operating_income_loss', 'non-operating_income_loss',
       'interest_expense_net', 'pretax_income_loss_adj',
       'abnormal_gains_losses', 'pretax_income_loss',
       'income_tax_expense_benefit_net',
       'income_loss_from_continuing_operations',
       'net_extraordinary_gains_losses', 'net_income', 'net_income_common',
       'ret', 'eps', 'profit_margin', 'revenue_growth', 'income_growth',
       'gross_margin', 'operating_margin', 'sga_ratio', 'rd_ratio',
       'cost_ratio', 'net_in

<h3 style="color:red">Todo: generate more features</h3>

## Select Features

In [6]:
# Column-name groups used to choose model inputs.
# Order matters: it fixes the feature order of every matrix built from these lists.

# Raw return column only.
raw = ['ret']

# Engineered columns, grouped by name prefix.
eng = (
    ['mean_20', 'mean_60']
    + ['vol_20', 'vol_60']
    + ['mom_5', 'mom_20', 'mom_60']
    + ['ema_12', 'ema_26', 'ema_cross']
    + ['skew_20', 'skew_60']
    + ['kurt_20', 'kurt_60']
    + ['vol_z']
)

# Fundamental columns.
fund = [
    'eps', 'profit_margin',
    'revenue_growth', 'income_growth',
    'gross_margin', 'operating_margin',
    'sga_ratio', 'rd_ratio', 'cost_ratio',
    'net_income_per_share', 'tax_burden',
    'nonop_ratio', 'abnormal_ratio',
    'revenue_per_share', 'da_ratio',
    'interest_coverage', 'interest_burden',
]

# Combined set: fundamental columns first, then engineered ones (32 names in total).
fund_eng = [*fund, *eng]

# Return Matrix (Date × Ticker)

In [7]:
# Wide return matrix: one row per date, one column per ticker, cells hold `ret`.
# Like pivot(), this raises if any (date, ticker) pair appears more than once.
rt = df.set_index(['date', 'ticker'])['ret'].unstack('ticker')
rt

ticker,AAPL,ABT,ADBE,AMGN,AMZN,AXP,BAC,BMY,CMCSA,COST,...,RTX,SBUX,T,TSLA,TXN,UNH,V,VZ,WMT,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-23,,,,,,,,,,,...,,,,,,,,,,
2019-12-24,0.000876,-0.000764,0.002098,-0.002851,-0.002119,0.002001,0.001639,-0.000793,0.009595,0.003130,...,-0.001544,0.003367,-0.002459,0.014311,-0.000185,-0.001856,0.002621,-0.002067,0.004106,-0.003761
2019-12-26,0.019974,0.000000,0.004732,-0.001774,0.044489,0.005382,0.008508,0.012691,0.010032,0.005126,...,0.007577,-0.005033,0.004931,0.013404,-0.000369,0.003756,0.008456,0.000230,0.000000,0.001510
2019-12-27,-0.000429,0.001274,-0.001238,-0.001531,0.000535,-0.001813,-0.004867,0.002937,0.002875,-0.005506,...,-0.002916,0.000519,0.001963,-0.001392,0.000739,0.001112,0.001214,0.003911,0.000818,-0.003392
2019-12-30,0.005863,-0.006870,-0.007407,-0.005242,-0.012301,-0.007093,-0.005543,-0.006443,0.001564,0.003530,...,-0.002617,-0.007778,-0.004897,-0.036250,-0.007105,-0.007180,-0.008264,-0.005270,-0.001634,-0.005862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-19,0.001145,-0.001911,0.000200,0.004266,0.014427,0.000035,-0.006572,0.025139,-0.015927,0.011577,...,-0.003655,-0.020354,-0.010909,0.021432,-0.029504,-0.021454,-0.000968,-0.007589,0.029903,-0.013961
2024-11-20,0.003168,-0.010267,-0.000220,0.028309,-0.008455,0.007541,-0.007718,-0.005950,0.015939,-0.002226,...,0.001621,-0.000940,0.004136,-0.011474,-0.014328,0.040732,-0.014306,0.006882,0.006763,0.014246
2024-11-21,-0.002105,0.011429,0.009890,0.007052,-0.022181,0.018412,0.008889,0.005986,0.011827,0.029698,...,0.011582,0.018288,0.006407,-0.006988,0.000052,-0.005003,0.008158,0.006582,0.013899,0.013442
2024-11-22,0.005890,0.004346,0.015284,0.015949,-0.006351,0.028327,0.011454,0.010999,-0.000716,0.008751,...,0.002021,0.024425,0.009095,0.038040,-0.000990,-0.011082,0.000065,0.015342,0.023189,-0.001190


<h3 style="color:red">Todo: save clean data for easy access</h3>

# Binary Labels

In [8]:
# Attach the binary target column for the chosen look-ahead horizon.
# The helper takes the dataframe first, then the horizon.
# NOTE(review): the recorded output shows a SettingWithCopyWarning raised on the helper's
# `df[f'y_{h}'] = (df[f'cumret_{h}'] > 0).astype(int)` line — presumably the helper assigns
# into a filtered slice; TODO fix inside helper_functions (e.g. take an explicit copy first).
LABEL_HORIZON = 1
df = hf.create_binary_labels(df, LABEL_HORIZON)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'y_{h}'] = (df[f'cumret_{h}'] > 0).astype(int)


# Modeling

In [9]:
# Split the frame into train / validation / test sets.
# Helper arguments, in order: dataframe, training fraction, validation fraction,
# and the column the split is made on.
# NOTE(review): presumably the remaining 0.15 becomes the test set and the split is
# chronological on `date` — confirm in helper_functions.time_split.
TRAIN_FRAC = 0.7
VAL_FRAC = 0.15
splits = hf.time_split(df, TRAIN_FRAC, VAL_FRAC, 'date')
train, val, test = splits

## Standardization

In [10]:
# Feature frames for each split, restricted to the `fund_eng` columns.
X_train, X_val, X_test = (part[fund_eng] for part in (train, val, test))

# Fit the scaler on the training split only, so validation/test statistics
# do not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)

In [11]:
# Standardize every split with the scaler that was fitted on the training features.
# Each array is derived from its source frame rather than from the previous value of
# X_*, so re-running this cell cannot standardize already-scaled arrays a second time.
X_train = scaler.transform(train[fund_eng])
X_val   = scaler.transform(val[fund_eng])
X_test  = scaler.transform(test[fund_eng])

## Training

<h3 style="color:red">Note: apply basic logistic regression as a starting point; ideally move to an LSTM later</h3>

# Logistic Regression

In [18]:
# Baseline: logistic regression on the chosen feature set.
TARGET = "y_1"          # label column to predict
FEATURES = fund_eng     # pick your feature set

# The helper receives the three split frames plus the feature/target names and
# returns the fitted model with its validation and test accuracy.
logreg_result = Models.run_logistic_regression(train, val, test, FEATURES, TARGET)
model, val_acc, test_acc = logreg_result


Validation Accuracy: 0.6043010752688172
Test Accuracy: 0.6162162162162163


# LSTM

In [19]:
# LSTM classifier on sliding windows of SEQ_LEN rows over the chosen feature set.
SEQ_LEN = 30
TARGET = "y_1"
FEATURES = fund_eng
SEED = 42

# Seed the Python, NumPy and TensorFlow RNGs before training so that a fresh
# Restart & Run All repeats the same run (assuming Models.run_lstm does not reseed
# internally; some GPU kernels may still be non-deterministic).
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): `model` and `test_acc` overwrite the values produced by the
# logistic-regression cell; keep separate names if both results are needed later.
model, history, test_acc = Models.run_lstm(train, val, test, FEATURES, TARGET, seq_len=SEQ_LEN)


Shapes:
Train: (3060, 30, 32) (3060,)
Val:   (780, 30, 32) (780,)
Test:  (775, 30, 32) (775,)
Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5159 - loss: 0.7007 - val_accuracy: 0.5051 - val_loss: 0.6982
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5322 - loss: 0.6895 - val_accuracy: 0.4897 - val_loss: 0.7001
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5434 - loss: 0.6853 - val_accuracy: 0.4936 - val_loss: 0.7153
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5520 - loss: 0.6911 - val_accuracy: 0.4795 - val_loss: 0.7083
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5326 - loss: 0.6961 - val_accuracy: 0.4846 - val_loss: 0.7111
Epoch 6/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.53