In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from itertools import product

import helper_functions as hf
import Models

# Load Data

In [42]:
df = hf.load_data()
df.head()

Unnamed: 0,index_x,ticker,simfinid_x,date,open,high,low,close,adj_close,volume,...,non-operating_income_loss,interest_expense_net,pretax_income_loss_adj,abnormal_gains_losses,pretax_income_loss,income_tax_expense_benefit_net,income_loss_from_continuing_operations,net_extraordinary_gains_losses,net_income,net_income_common
0,16033,AAPL,111052,2019-12-23,70.13,71.06,70.09,71.0,68.53,98711532,...,,,,,,,,,,
1,40152,ABT,63877,2019-12-23,86.31,87.46,86.28,87.35,78.56,4067769,...,,,,,,,,,,
2,95854,ADBE,14099,2019-12-23,328.83,329.88,327.26,328.95,328.95,2210706,...,,,,,,,,,,
3,296720,AMGN,65735,2019-12-23,243.98,244.0,241.59,243.03,203.47,1686220,...,,,,,,,,,,
4,333254,AMZN,62747,2019-12-23,89.41,89.65,89.23,89.65,89.65,42749860,...,,,,,,,,,,


# Feature Sets

## Raw Features

In [43]:
df = hf.create_raw_features(df)

df[df['ticker'] == "AAPL"].head()[['date', 'ticker', 'adj_close', 'ret']]

Unnamed: 0,date,ticker,adj_close,ret
0,2019-12-23,AAPL,68.53,
49,2019-12-24,AAPL,68.59,0.000876
98,2019-12-26,AAPL,69.96,0.019974
147,2019-12-27,AAPL,69.93,-0.000429
196,2019-12-30,AAPL,70.34,0.005863


## Fundamental Features

In [44]:
df = hf.create_fundamental_features(df)

df[df['ticker'] == "AAPL"].tail()[['date', 'ticker', 'eps', 'profit_margin', 'revenue_growth']]

  from sklearn.preprocessing import StandardScaler
  from sklearn.linear_model import LogisticRegression


Unnamed: 0,date,ticker,eps,profit_margin,revenue_growth
60515,2024-11-19,AAPL,0.966748,0.15523,0.0
60564,2024-11-20,AAPL,0.966748,0.15523,0.0
60613,2024-11-21,AAPL,0.966748,0.15523,0.0
60662,2024-11-22,AAPL,0.966748,0.15523,0.0
60711,2024-11-25,AAPL,0.966748,0.15523,0.0


## Engineered Features

In [45]:
df = hf.create_engineered_features(df)

df[df['ticker'] == "AAPL"].head(25)[['date', 'ticker', 'mean_20', 'vol_20', 'ema_cross', 'skew_20', 'kurt_20']]

Unnamed: 0,date,ticker,mean_20,vol_20,ema_cross,skew_20,kurt_20
0,2019-12-23,AAPL,,,,,
49,2019-12-24,AAPL,,,0.0,,
98,2019-12-26,AAPL,,,0.004786,,
147,2019-12-27,AAPL,,,0.117769,,
196,2019-12-30,AAPL,,,0.202554,,
245,2019-12-31,AAPL,,,0.299379,,
294,2020-01-02,AAPL,,,0.413308,,
343,2020-01-03,AAPL,,,0.626292,,
392,2020-01-06,AAPL,,,0.730182,,
441,2020-01-07,AAPL,,,0.848727,,


In [46]:
list(df.columns)

['index_x',
 'ticker',
 'simfinid_x',
 'date',
 'open',
 'high',
 'low',
 'close',
 'adj_close',
 'volume',
 'dividend',
 'shares_outstanding',
 'index_y',
 'simfinid_y',
 'currency',
 'fiscal_year',
 'fiscal_period',
 'report_date',
 'publish_date',
 'restated_date',
 'shares_basic',
 'shares_diluted',
 'revenue',
 'cost_of_revenue',
 'gross_profit',
 'operating_expenses',
 'selling_general_&_administrative',
 'research_&_development',
 'depreciation_&_amortization',
 'operating_income_loss',
 'non-operating_income_loss',
 'interest_expense_net',
 'pretax_income_loss_adj',
 'abnormal_gains_losses',
 'pretax_income_loss',
 'income_tax_expense_benefit_net',
 'income_loss_from_continuing_operations',
 'net_extraordinary_gains_losses',
 'net_income',
 'net_income_common',
 'ret',
 'eps',
 'profit_margin',
 'revenue_growth',
 'income_growth',
 'gross_margin',
 'operating_margin',
 'sga_ratio',
 'rd_ratio',
 'cost_ratio',
 'net_income_per_share',
 'tax_burden',
 'nonop_ratio',
 'abnormal_ra

# Feature Selection

<h5 style="color:red">Check multicolinearity</h5>

In [73]:
raw = ['ret','volume']

eng = [
    'mean_20', 'mean_60',
    'vol_20', 'vol_60',
    'mom_20', 'mom_60',
    'ema_12', 'ema_26', 'ema_cross',
    'skew_20', 'skew_60',
    'kurt_20', 'kurt_60',
    'vol_z'
]

fund = [
    'eps',
    'profit_margin',
    'revenue_growth',
    'income_growth',
    'gross_margin',
    'operating_margin',
    'sga_ratio',
    'rd_ratio',
    'cost_ratio',
    'net_income_per_share',
    'tax_burden',
    'nonop_ratio',
    'abnormal_ratio',
    'revenue_per_share',
    'da_ratio',
    'interest_coverage',
    'interest_burden'
]

fund_eng = fund + eng

# Binary Labels

In [48]:
horizons = [1, 20, 60]

horizon_cols = [f'y_{h}' for h in horizons]

df = hf.create_binary_labels(df, horizons)

df[df["ticker"] == "AAPL"].tail(30)[['date', 'ret'] + horizon_cols]

Unnamed: 0,date,ret,y_1,y_20,y_60
59290,2024-10-15,0.011043,0.0,0.0,
59339,2024-10-16,-0.008858,1.0,0.0,
59388,2024-10-17,0.001605,1.0,0.0,
59437,2024-10-18,0.012258,1.0,0.0,
59486,2024-10-21,0.00629,0.0,0.0,
59535,2024-10-22,-0.002594,0.0,0.0,
59584,2024-10-23,-0.021615,0.0,0.0,
59633,2024-10-24,-0.000828,1.0,0.0,
59682,2024-10-25,0.00362,1.0,0.0,
59731,2024-10-28,0.008604,1.0,0.0,


### `NOTE: we can use sklearn train-test split`

In [49]:
train, val, test = hf.time_split(df)
len(train), len(val), len(test)

(42581, 9114, 9065)

# Modeling: Logistic Regression

## Horizon 1 days

In [50]:
TARGET = "y_1"

### Raw Features

In [51]:
data = hf.prune(df, raw, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, raw, TARGET)



(0.4923070078208573, 0.4985415036119433)

### Features Eng

In [52]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)



(0.49965201317710106, 0.5042357262457207)

### Features Fund

In [53]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)



(0.5093368795320723, 0.5039061800930102)

### Features Fund & Eng

In [54]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)



(0.5067229253207568, 0.5028364861844177)

## Horizon 20 days

In [55]:
TARGET = "y_20"

### Features Raw

In [56]:
data = hf.prune(df, raw, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, raw, TARGET)



(0.4979015938672902, 0.5279288362438861)

### Features Eng

In [57]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)



(0.4910048156287877, 0.5230469538782763)

### Features Fund

In [58]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)



(0.5285954153269234, 0.4742475486393481)

### Features Fund & Eng

In [59]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)



(0.5025293255131965, 0.5147096701169483)

## Horizon 60 days

In [60]:
TARGET = "y_60"

### Features Raw

In [61]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)



(0.4983610356720456, 0.5155989360767208)

### Features Eng

In [62]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)



(0.4983610356720456, 0.5155989360767208)

### Features Fund

In [63]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)



(0.46037978450658146, 0.47889826190038687)

### Features Fund & Eng

In [64]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)



(0.48361722766838544, 0.5006284541505793)

# Results

In [74]:
features_map = {
    "raw": raw,
    "eng": eng,
    "fund": fund,
    "fund_eng": fund_eng
}

In [75]:
features_sets = ['raw', 'eng', 'fund', 'fund_eng']
targets = ['y_1', 'y_20', 'y_60']
res = []

for f, t in product(features_sets, targets):
    print(f, t, type(f), type(t))
    features = features_map[f]
    data = hf.prune(df, features, t)

    train, val, test = hf.time_split(data)
    print(f, t)
    val_acc, test_acc = Models.run_logistic_regression(train, val, test, features, t)

    res.append({
        "Horizon": t,
        "Features": f,
        "Val Accuracy": val_acc,
        "Test Accuracy": test_acc,
    })

res

raw y_1 <class 'str'> <class 'str'>
raw y_1
raw y_20 <class 'str'> <class 'str'>




raw y_20
raw y_60 <class 'str'> <class 'str'>




raw y_60
eng y_1 <class 'str'> <class 'str'>




eng y_1




eng y_20 <class 'str'> <class 'str'>
eng y_20




eng y_60 <class 'str'> <class 'str'>
eng y_60




fund y_1 <class 'str'> <class 'str'>
fund y_1
fund y_20 <class 'str'> <class 'str'>
fund y_20




fund y_60 <class 'str'> <class 'str'>
fund y_60
fund_eng y_1 <class 'str'> <class 'str'>
fund_eng y_1




fund_eng y_20 <class 'str'> <class 'str'>
fund_eng y_20




fund_eng y_60 <class 'str'> <class 'str'>
fund_eng y_60




[{'Horizon': 'y_1',
  'Features': 'raw',
  'Val Accuracy': 0.4923070078208573,
  'Test Accuracy': 0.4985415036119433},
 {'Horizon': 'y_20',
  'Features': 'raw',
  'Val Accuracy': 0.4979015938672902,
  'Test Accuracy': 0.5279288362438861},
 {'Horizon': 'y_60',
  'Features': 'raw',
  'Val Accuracy': 0.5024241822488832,
  'Test Accuracy': 0.5045861626797885},
 {'Horizon': 'y_1',
  'Features': 'eng',
  'Val Accuracy': 0.4993089328350766,
  'Test Accuracy': 0.5067686224404312},
 {'Horizon': 'y_20',
  'Features': 'eng',
  'Val Accuracy': 0.4715113895816627,
  'Test Accuracy': 0.5230466047225638},
 {'Horizon': 'y_60',
  'Features': 'eng',
  'Val Accuracy': 0.4404432627345193,
  'Test Accuracy': 0.5138195597407563},
 {'Horizon': 'y_1',
  'Features': 'fund',
  'Val Accuracy': 0.5284216102688157,
  'Test Accuracy': 0.5253763163919414},
 {'Horizon': 'y_20',
  'Features': 'fund',
  'Val Accuracy': 0.48633032488334327,
  'Test Accuracy': 0.5719727682632598},
 {'Horizon': 'y_60',
  'Features': 'fund

In [78]:
res_df = pd.DataFrame(res)
val_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Val Accuracy")
val_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.499309,0.471511,0.440443
fund,0.528422,0.48633,0.462684
fund_eng,0.508679,0.449265,0.47227
raw,0.492307,0.497902,0.502424


In [79]:
test_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Test Accuracy")
test_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.506769,0.523047,0.51382
fund,0.525376,0.571973,0.628289
fund_eng,0.523082,0.619222,0.63709
raw,0.498542,0.527929,0.504586


In [92]:
val_df = val_df.rename(columns={
    "y_1": "1-day horizon",
    "y_20": "20-day horizon",
    "y_60": "60-day horizon"
})

val_df = val_df.rename(index={
    "raw": "Raw",
    "eng": "Engineered",
    "fund": "Fundamental",
    "fund_eng": "Fund + Eng"
})

val_st = (
    val_df.style
      .set_caption("Table 1: Summary Validation AUC")
      .format("{:.4f}")              # numeric formatting
      .set_table_styles([
          {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
          {"selector": "th, td", "props": "padding:10px;"},
          {"selector": "th", "props": "font-size:12pt;"},
          {"selector": "td", "props": "font-size:11pt;"},
      ])
      .set_properties(**{"min-width": "120px"})
)
val_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.4993,0.4715,0.4404
Fundamental,0.5284,0.4863,0.4627
Fund + Eng,0.5087,0.4493,0.4723
Raw,0.4923,0.4979,0.5024


In [94]:
test_df = test_df.rename(columns={
    "y_1": "1-day horizon",
    "y_20": "20-day horizon",
    "y_60": "60-day horizon"
})

test_df = test_df.rename(index={
    "raw": "Raw",
    "eng": "Engineered",
    "fund": "Fundamental",
    "fund_eng": "Fund + Eng"
})

test_st = (
    test_df.style
      .set_caption("Table 1: Summary Testing AUC")
      .format("{:.4f}")              # numeric formatting
      .set_table_styles([
          {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
          {"selector": "th, td", "props": "padding:10px;"},
          {"selector": "th", "props": "font-size:12pt;"},
          {"selector": "td", "props": "font-size:11pt;"},
      ])
      .set_properties(**{"min-width": "120px"})
)
test_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.5068,0.523,0.5138
Fundamental,0.5254,0.572,0.6283
Fund + Eng,0.5231,0.6192,0.6371
Raw,0.4985,0.5279,0.5046


In [95]:
import dataframe_image as dfi
# dfi.export(styled, "table1.png")
dfi.export(test_st, "table1.png", table_conversion="matplotlib")
dfi.export(val_st, "table2.png", table_conversion="matplotlib")