In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# import tensorflow as tf
# from tensorflow.keras import layers, models

import helper_functions as hf
import Models

2025-12-07 16:19:55.040897: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Data

In [3]:
df = hf.load_data()
df.head()

Unnamed: 0,index_x,ticker,simfinid_x,date,open,high,low,close,adj_close,volume,...,non-operating_income_loss,interest_expense_net,pretax_income_loss_adj,abnormal_gains_losses,pretax_income_loss,income_tax_expense_benefit_net,income_loss_from_continuing_operations,net_extraordinary_gains_losses,net_income,net_income_common
0,16033,AAPL,111052,2019-12-23,70.13,71.06,70.09,71.0,68.53,98711532,...,,,,,,,,,,
1,40152,ABT,63877,2019-12-23,86.31,87.46,86.28,87.35,78.56,4067769,...,,,,,,,,,,
2,95854,ADBE,14099,2019-12-23,328.83,329.88,327.26,328.95,328.95,2210706,...,,,,,,,,,,
3,296720,AMGN,65735,2019-12-23,243.98,244.0,241.59,243.03,203.47,1686220,...,,,,,,,,,,
4,333254,AMZN,62747,2019-12-23,89.41,89.65,89.23,89.65,89.65,42749860,...,,,,,,,,,,


# Feature Sets

## Raw Features

In [4]:
df = hf.create_raw_features(df)

df[df['ticker'] == "AAPL"].head()[['date', 'ticker', 'adj_close', 'ret']]

Unnamed: 0,date,ticker,adj_close,ret
0,2019-12-23,AAPL,68.53,
49,2019-12-24,AAPL,68.59,0.000876
98,2019-12-26,AAPL,69.96,0.019974
147,2019-12-27,AAPL,69.93,-0.000429
196,2019-12-30,AAPL,70.34,0.005863


## Fundamental Features

In [5]:
df = hf.create_fundamental_features(df)

df[df['ticker'] == "AAPL"].tail()[['date', 'ticker', 'eps', 'profit_margin', 'revenue_growth']]

  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()
  df["income_growth"] = df.groupby("ticker")["net_income"].pct_change()


Unnamed: 0,date,ticker,eps,profit_margin,revenue_growth
60515,2024-11-19,AAPL,0.966748,0.15523,0.0
60564,2024-11-20,AAPL,0.966748,0.15523,0.0
60613,2024-11-21,AAPL,0.966748,0.15523,0.0
60662,2024-11-22,AAPL,0.966748,0.15523,0.0
60711,2024-11-25,AAPL,0.966748,0.15523,0.0


## Engineered Features

In [6]:
df = hf.create_engineered_features(df)

df[df['ticker'] == "AAPL"].head(25)[['date', 'ticker', 'mean_20', 'vol_20', 'ema_cross', 'skew_20', 'kurt_20']]

Unnamed: 0,date,ticker,mean_20,vol_20,ema_cross,skew_20,kurt_20
0,2019-12-23,AAPL,,,,,
49,2019-12-24,AAPL,,,0.0,,
98,2019-12-26,AAPL,,,0.004786,,
147,2019-12-27,AAPL,,,0.117769,,
196,2019-12-30,AAPL,,,0.202554,,
245,2019-12-31,AAPL,,,0.299379,,
294,2020-01-02,AAPL,,,0.413308,,
343,2020-01-03,AAPL,,,0.626292,,
392,2020-01-06,AAPL,,,0.730182,,
441,2020-01-07,AAPL,,,0.848727,,


In [7]:
list(df.columns)

['index_x',
 'ticker',
 'simfinid_x',
 'date',
 'open',
 'high',
 'low',
 'close',
 'adj_close',
 'volume',
 'dividend',
 'shares_outstanding',
 'index_y',
 'simfinid_y',
 'currency',
 'fiscal_year',
 'fiscal_period',
 'report_date',
 'publish_date',
 'restated_date',
 'shares_basic',
 'shares_diluted',
 'revenue',
 'cost_of_revenue',
 'gross_profit',
 'operating_expenses',
 'selling_general_&_administrative',
 'research_&_development',
 'depreciation_&_amortization',
 'operating_income_loss',
 'non-operating_income_loss',
 'interest_expense_net',
 'pretax_income_loss_adj',
 'abnormal_gains_losses',
 'pretax_income_loss',
 'income_tax_expense_benefit_net',
 'income_loss_from_continuing_operations',
 'net_extraordinary_gains_losses',
 'net_income',
 'net_income_common',
 'ret',
 'eps',
 'profit_margin',
 'revenue_growth',
 'income_growth',
 'gross_margin',
 'operating_margin',
 'sga_ratio',
 'rd_ratio',
 'cost_ratio',
 'net_income_per_share',
 'tax_burden',
 'nonop_ratio',
 'abnormal_ra

# Feature Selection

<h5 style="color:red">Check multicolinearity</h5>

In [8]:
raw = ['ret', 'volume']

eng = ['mean_20', 'mean_60', 'vol_20', 'vol_60',
       'mom_20', 'mom_60', 'ema_cross', 'skew_20',
       'skew_60', 'kurt_20', 'kurt_60']

fund = ['eps', 'profit_margin', 'revenue_growth']

fund_eng = fund + eng

# Binary Labels

In [9]:
horizon = 20

df = hf.create_binary_labels(df, horizon)

df[df["ticker"] == "AAPL"].tail(30)[['date', 'ret', f'y_{horizon}']]

Unnamed: 0,date,ret,y_20
59290,2024-10-15,0.011043,0.0
59339,2024-10-16,-0.008858,0.0
59388,2024-10-17,0.001605,0.0
59437,2024-10-18,0.012258,0.0
59486,2024-10-21,0.00629,0.0
59535,2024-10-22,-0.002594,0.0
59584,2024-10-23,-0.021615,0.0
59633,2024-10-24,-0.000828,0.0
59682,2024-10-25,0.00362,0.0
59731,2024-10-28,0.008604,0.0


### `NOTE: we can use sklearn train-test split`

In [10]:
train, val, test = hf.time_split(df)
len(train), len(val), len(test)

(42581, 9114, 9065)

# Modeling: Logistic Regression

## Horizon 20 days

In [11]:
TARGET = "y_20"

### Raw Features

In [15]:
x = Models.run_logistic_regression(train, val, test, raw, TARGET)
x.columns


hi


Index(['index_x', 'ticker', 'simfinid_x', 'date', 'open', 'high', 'low',
       'close', 'adj_close', 'volume', 'dividend', 'shares_outstanding',
       'index_y', 'simfinid_y', 'currency', 'fiscal_year', 'fiscal_period',
       'report_date', 'publish_date', 'restated_date', 'shares_basic',
       'shares_diluted', 'revenue', 'cost_of_revenue', 'gross_profit',
       'operating_expenses', 'selling_general_&_administrative',
       'research_&_development', 'depreciation_&_amortization',
       'operating_income_loss', 'non-operating_income_loss',
       'interest_expense_net', 'pretax_income_loss_adj',
       'abnormal_gains_losses', 'pretax_income_loss',
       'income_tax_expense_benefit_net',
       'income_loss_from_continuing_operations',
       'net_extraordinary_gains_losses', 'net_income', 'net_income_common',
       'ret', 'eps', 'profit_margin', 'revenue_growth', 'income_growth',
       'gross_margin', 'operating_margin', 'sga_ratio', 'rd_ratio',
       'cost_ratio', 'net_in

## Standarization

In [22]:
m = [TARGET] + raw
f = test.dropna(subset=m)
f[m]



Unnamed: 0,y_20,ret,volume
51695,0.0,-0.028440,95132355
51696,0.0,-0.013015,3502340
51697,0.0,-0.040673,4286834
51698,0.0,-0.009792,3064004
51699,1.0,-0.019484,37228343
...,...,...,...
59775,1.0,0.001213,1826559
59776,1.0,0.008703,4306240
59777,1.0,0.005942,13574976
59778,1.0,0.002937,7990186


In [10]:
X_train = train[fund_eng]
X_val = val[fund_eng]
X_test = test[fund_eng]

scaler = StandardScaler()
scaler.fit(X_train)

In [11]:
X_train = scaler.transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

## Training

<h3 style="color:red">Note: apply basic logistic regression as starting point - ideally do LSTM later</h3>

## LSTM

In [36]:
def make_lstm_sequences(df, feature_cols, target_col, seq_len):
    X_list, y_list = [], []

    # VERY important: group by ticker
    for ticker, tdf in df.groupby("ticker"):
        tdf = tdf.sort_values("date")

        feature_mat = tdf[feature_cols].values
        labels = tdf[target_col].values

        # slide window over each ticker independently
        for i in range(seq_len, len(tdf)):
            X_list.append(feature_mat[i-seq_len:i])
            y_list.append(labels[i])

    X = np.array(X_list)
    y = np.array(y_list)
    return X, y


In [37]:
SEQ_LEN = 30     # LSTM sees the past 30 days
TARGET = "y_1"  # or "y_1", "y_60" depending on the horizon
FEATURES = fund_eng 

In [16]:
X_train_seq, y_train_seq = make_lstm_sequences(train, FEATURES, TARGET, SEQ_LEN)
X_val_seq, y_val_seq     = make_lstm_sequences(val, FEATURES, TARGET, SEQ_LEN)
X_test_seq, y_test_seq   = make_lstm_sequences(test, FEATURES, TARGET, SEQ_LEN)

print(X_train_seq.shape, y_train_seq.shape)
print(X_val_seq.shape, y_val_seq.shape)
print(X_test_seq.shape, y_test_seq.shape)


(41062, 30, 7) (41062,)
(7644, 30, 7) (7644,)
(7595, 30, 7) (7595,)


In [38]:
X_train_seq

NameError: name 'X_train_seq' is not defined

In [17]:


model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, len(FEATURES))),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [18]:
history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=20,
    batch_size=128,
    shuffle=False
)


Epoch 1/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.4877 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 2/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 3/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 4/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 5/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 6/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4874 - loss: nan - val_accuracy: 0.4731 - val_loss: nan
Epoch 7/20
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [19]:
test_loss, test_acc = model.evaluate(X_test_seq, y_test_seq)
print("Test accuracy:", test_acc)


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4552 - loss: nan
Test accuracy: 0.4574061930179596


In [20]:
target = "y_1"

y_train = train[target]
y_val   = val[target]
y_test  = test[target]

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)
acc_val = accuracy_score(val[target], model.predict(X_val))
acc_test = accuracy_score(test[target], model.predict(X_test))

print(acc_val, acc_test)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values