In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras import layers, models

import helper_functions as hf

# Load Data

In [2]:
df = hf.load_data()

# Feature Selection

## Fundamental features

In [3]:
df = hf.create_fundamental_features(df)

  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()


## Engineered Features

In [4]:
df = hf.create_engineered_features(df)
df

Unnamed: 0,index_x,ticker,simfinid_x,date,open,high,low,close,adj_close,volume,...,net_income,net_income_common,ret,eps,profit_margin,revenue_growth,mean_20,mean_60,vol_20,vol_60
0,16033,AAPL,111052,2019-12-23,70.13,71.06,70.09,71.00,68.53,98711532,...,,,,,,,,,,
1,40152,ABT,63877,2019-12-23,86.31,87.46,86.28,87.35,78.56,4067769,...,,,,,,,,,,
2,95854,ADBE,14099,2019-12-23,328.83,329.88,327.26,328.95,328.95,2210706,...,,,,,,,,,,
3,296720,AMGN,65735,2019-12-23,243.98,244.00,241.59,243.03,203.47,1686220,...,,,,,,,,,,
4,333254,AMZN,62747,2019-12-23,89.41,89.65,89.23,89.65,89.65,42749860,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60755,5695307,UNH,367714,2024-11-25,597.58,609.84,595.01,605.83,592.89,5146485,...,6.055000e+09,6.055000e+09,0.025318,6.510753,0.060058,0.0,0.003663,0.000649,0.020149,0.017548
60756,5755996,V,107326,2024-11-25,311.86,313.66,309.86,313.19,310.99,14189302,...,5.318000e+09,5.318000e+09,0.010561,2.550600,0.552979,0.0,0.005051,0.002204,0.013469,0.013145
60757,5926877,VZ,101219,2024-11-25,43.34,44.05,43.31,43.98,41.15,28365883,...,3.306000e+09,3.306000e+09,0.019321,0.782485,0.099190,0.0,0.002830,0.001224,0.011802,0.014616
60758,6024283,WMT,239962,2024-11-25,90.50,90.95,89.06,89.50,88.64,25078633,...,4.577000e+09,4.577000e+09,-0.010383,0.566320,0.026989,0.0,0.003992,0.002519,0.011294,0.010704


<h3 style="color:red">Todo: generate more features</h3>

Select features

In [5]:
df.columns

Index(['index_x', 'ticker', 'simfinid_x', 'date', 'open', 'high', 'low',
       'close', 'adj_close', 'volume', 'dividend', 'shares_outstanding',
       'index_y', 'simfinid_y', 'currency', 'fiscal_year', 'fiscal_period',
       'report_date', 'publish_date', 'restated_date', 'shares_basic',
       'shares_diluted', 'revenue', 'cost_of_revenue', 'gross_profit',
       'operating_expenses', 'selling_general_&_administrative',
       'research_&_development', 'depreciation_&_amortization',
       'operating_income_loss', 'non-operating_income_loss',
       'interest_expense_net', 'pretax_income_loss_adj',
       'abnormal_gains_losses', 'pretax_income_loss',
       'income_tax_expense_benefit_net',
       'income_loss_from_continuing_operations',
       'net_extraordinary_gains_losses', 'net_income', 'net_income_common',
       'ret', 'eps', 'profit_margin', 'revenue_growth', 'mean_20', 'mean_60',
       'vol_20', 'vol_60'],
      dtype='object')

In [6]:
raw = ['ret']
eng = ['mean_20', 'mean_60', 'vol_20', 'vol_60']
fund = ['eps', 'profit_margin', 'revenue_growth']
fund_eng = fund + eng

# Feature Selection

In [7]:
rt = df.pivot(index='date', columns='ticker', values='ret')
rt

ticker,AAPL,ABT,ADBE,AMGN,AMZN,AXP,BAC,BMY,CMCSA,COST,...,RTX,SBUX,T,TSLA,TXN,UNH,V,VZ,WMT,XOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-23,,,,,,,,,,,...,,,,,,,,,,
2019-12-24,0.000876,-0.000764,0.002098,-0.002851,-0.002119,0.002001,0.001639,-0.000793,0.009595,0.003130,...,-0.001544,0.003367,-0.002459,0.014311,-0.000185,-0.001856,0.002621,-0.002067,0.004106,-0.003761
2019-12-26,0.019974,0.000000,0.004732,-0.001774,0.044489,0.005382,0.008508,0.012691,0.010032,0.005126,...,0.007577,-0.005033,0.004931,0.013404,-0.000369,0.003756,0.008456,0.000230,0.000000,0.001510
2019-12-27,-0.000429,0.001274,-0.001238,-0.001531,0.000535,-0.001813,-0.004867,0.002937,0.002875,-0.005506,...,-0.002916,0.000519,0.001963,-0.001392,0.000739,0.001112,0.001214,0.003911,0.000818,-0.003392
2019-12-30,0.005863,-0.006870,-0.007407,-0.005242,-0.012301,-0.007093,-0.005543,-0.006443,0.001564,0.003530,...,-0.002617,-0.007778,-0.004897,-0.036250,-0.007105,-0.007180,-0.008264,-0.005270,-0.001634,-0.005862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-19,0.001145,-0.001911,0.000200,0.004266,0.014427,0.000035,-0.006572,0.025139,-0.015927,0.011577,...,-0.003655,-0.020354,-0.010909,0.021432,-0.029504,-0.021454,-0.000968,-0.007589,0.029903,-0.013961
2024-11-20,0.003168,-0.010267,-0.000220,0.028309,-0.008455,0.007541,-0.007718,-0.005950,0.015939,-0.002226,...,0.001621,-0.000940,0.004136,-0.011474,-0.014328,0.040732,-0.014306,0.006882,0.006763,0.014246
2024-11-21,-0.002105,0.011429,0.009890,0.007052,-0.022181,0.018412,0.008889,0.005986,0.011827,0.029698,...,0.011582,0.018288,0.006407,-0.006988,0.000052,-0.005003,0.008158,0.006582,0.013899,0.013442
2024-11-22,0.005890,0.004346,0.015284,0.015949,-0.006351,0.028327,0.011454,0.010999,-0.000716,0.008751,...,0.002021,0.024425,0.009095,0.038040,-0.000990,-0.011082,0.000065,0.015342,0.023189,-0.001190


<h3 style="color:red">Todo: save clean data for easy access</h3>

# Binary Labels

In [8]:
# input is the dataframe and the horizon we are looking at
df = hf.create_binary_labels(df,1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'y_{h}'] = (df[f'cumret_{h}'] > 0).astype(int)


Unnamed: 0,index_x,ticker,simfinid_x,date,open,high,low,close,adj_close,volume,...,ret,eps,profit_margin,revenue_growth,mean_20,mean_60,vol_20,vol_60,cumret_1,y_1
49,16034,AAPL,111052,2019-12-24,71.17,71.22,70.73,71.07,68.59,48478856,...,0.000876,,,,,,,,0.000876,1
50,40153,ABT,63877,2019-12-24,87.37,87.48,86.98,87.28,78.50,1067679,...,-0.000764,,,,,,,,-0.000764,0
51,95855,ADBE,14099,2019-12-24,329.00,331.54,328.68,329.64,329.64,1066406,...,0.002098,,,,,,,,0.002098,1
52,296721,AMGN,65735,2019-12-24,242.82,243.10,241.72,242.33,202.89,612809,...,-0.002851,,,,,,,,-0.002851,0
53,333255,AMZN,62747,2019-12-24,89.69,89.78,89.38,89.46,89.46,17626740,...,-0.002119,,,,,,,,-0.002119,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60755,5695307,UNH,367714,2024-11-25,597.58,609.84,595.01,605.83,592.89,5146485,...,0.025318,6.510753,0.060058,0.0,0.003663,0.000649,0.020149,0.017548,0.025318,1
60756,5755996,V,107326,2024-11-25,311.86,313.66,309.86,313.19,310.99,14189302,...,0.010561,2.550600,0.552979,0.0,0.005051,0.002204,0.013469,0.013145,0.010561,1
60757,5926877,VZ,101219,2024-11-25,43.34,44.05,43.31,43.98,41.15,28365883,...,0.019321,0.782485,0.099190,0.0,0.002830,0.001224,0.011802,0.014616,0.019321,1
60758,6024283,WMT,239962,2024-11-25,90.50,90.95,89.06,89.50,88.64,25078633,...,-0.010383,0.566320,0.026989,0.0,0.003992,0.002519,0.011294,0.010704,-0.010383,0


# Modeling

In [9]:
# split sets
# inputs are dataframe, fraction in training, fraction in validation, column we're splitting at
train,val,test = hf.time_split(df,0.7,0.15,'date')

## Standarization

In [10]:
X_train = train[fund_eng]
X_val = val[fund_eng]
X_test = test[fund_eng]

scaler = StandardScaler()
scaler.fit(X_train)

In [11]:
X_train = scaler.transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

## Training

<h3 style="color:red">Note: apply basic logistic regression as starting point - ideally do LSTM later</h3>

## LSTM

In [12]:
def make_lstm_sequences(df, feature_cols, target_col, seq_len):
    X_list, y_list = [], []

    # VERY important: group by ticker
    for ticker, tdf in df.groupby("ticker"):
        tdf = tdf.sort_values("date")

        feature_mat = tdf[feature_cols].values
        labels = tdf[target_col].values

        # slide window over each ticker independently
        for i in range(seq_len, len(tdf)):
            X_list.append(feature_mat[i-seq_len:i])
            y_list.append(labels[i])

    X = np.array(X_list)
    y = np.array(y_list)
    return X, y


In [15]:
SEQ_LEN = 30     # LSTM sees the past 30 days
TARGET = "y_1"  # or "y_1", "y_60" depending on the horizon
FEATURES = fund_eng 

In [16]:
X_train_seq, y_train_seq = make_lstm_sequences(train, FEATURES, TARGET, SEQ_LEN)
X_val_seq, y_val_seq     = make_lstm_sequences(val, FEATURES, TARGET, SEQ_LEN)
X_test_seq, y_test_seq   = make_lstm_sequences(test, FEATURES, TARGET, SEQ_LEN)

print(X_train_seq.shape, y_train_seq.shape)
print(X_val_seq.shape, y_val_seq.shape)
print(X_test_seq.shape, y_test_seq.shape)


(41062, 30, 7) (41062,)
(7644, 30, 7) (7644,)
(7595, 30, 7) (7595,)


In [17]:


model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, len(FEATURES))),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [18]:
history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=20,
    batch_size=128,
    shuffle=False
)


Epoch 1/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.4877 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 2/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 3/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 4/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 5/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 6/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 7/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [19]:
test_loss, test_acc = model.evaluate(X_test_seq, y_test_seq)
print("Test accuracy:", test_acc)


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4552 - loss: nan
Test accuracy: 0.4574061930179596


In [20]:
target = "y_1"

y_train = train[target]
y_val   = val[target]
y_test  = test[target]

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)
acc_val = accuracy_score(val[target], model.predict(X_val))
acc_test = accuracy_score(test[target], model.predict(X_test))

print(acc_val, acc_test)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values