In [132]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [160]:
from itertools import product

import helper_functions as hf
import Models

# Load Data

In [134]:
df = hf.load_data()
df.head()

Unnamed: 0,index_x,ticker,simfinid_x,date,open,high,low,close,adj_close,volume,...,non-operating_income_loss,interest_expense_net,pretax_income_loss_adj,abnormal_gains_losses,pretax_income_loss,income_tax_expense_benefit_net,income_loss_from_continuing_operations,net_extraordinary_gains_losses,net_income,net_income_common
0,16033,AAPL,111052,2019-12-23,70.13,71.06,70.09,71.0,68.53,98711532,...,,,,,,,,,,
1,40152,ABT,63877,2019-12-23,86.31,87.46,86.28,87.35,78.56,4067769,...,,,,,,,,,,
2,95854,ADBE,14099,2019-12-23,328.83,329.88,327.26,328.95,328.95,2210706,...,,,,,,,,,,
3,296720,AMGN,65735,2019-12-23,243.98,244.0,241.59,243.03,203.47,1686220,...,,,,,,,,,,
4,333254,AMZN,62747,2019-12-23,89.41,89.65,89.23,89.65,89.65,42749860,...,,,,,,,,,,


# Feature Sets

## Raw Features

In [135]:
df = hf.create_raw_features(df)

df[df['ticker'] == "AAPL"].head()[['date', 'ticker', 'adj_close', 'ret']]

Unnamed: 0,date,ticker,adj_close,ret
0,2019-12-23,AAPL,68.53,
49,2019-12-24,AAPL,68.59,0.000876
98,2019-12-26,AAPL,69.96,0.019974
147,2019-12-27,AAPL,69.93,-0.000429
196,2019-12-30,AAPL,70.34,0.005863


## Fundamental Features

In [136]:
df = hf.create_fundamental_features(df)

df[df['ticker'] == "AAPL"].tail()[['date', 'ticker', 'eps', 'profit_margin', 'revenue_growth']]

  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()
  df["income_growth"] = df.groupby("ticker")["net_income"].pct_change()


Unnamed: 0,date,ticker,eps,profit_margin,revenue_growth
60515,2024-11-19,AAPL,0.966748,0.15523,0.0
60564,2024-11-20,AAPL,0.966748,0.15523,0.0
60613,2024-11-21,AAPL,0.966748,0.15523,0.0
60662,2024-11-22,AAPL,0.966748,0.15523,0.0
60711,2024-11-25,AAPL,0.966748,0.15523,0.0


## Engineered Features

In [137]:
df = hf.create_engineered_features(df)

df[df['ticker'] == "AAPL"].head(25)[['date', 'ticker', 'mean_20', 'vol_20', 'ema_cross', 'skew_20', 'kurt_20']]

Unnamed: 0,date,ticker,mean_20,vol_20,ema_cross,skew_20,kurt_20
0,2019-12-23,AAPL,,,,,
49,2019-12-24,AAPL,,,0.0,,
98,2019-12-26,AAPL,,,0.004786,,
147,2019-12-27,AAPL,,,0.117769,,
196,2019-12-30,AAPL,,,0.202554,,
245,2019-12-31,AAPL,,,0.299379,,
294,2020-01-02,AAPL,,,0.413308,,
343,2020-01-03,AAPL,,,0.626292,,
392,2020-01-06,AAPL,,,0.730182,,
441,2020-01-07,AAPL,,,0.848727,,


In [138]:
list(df.columns)

['index_x',
 'ticker',
 'simfinid_x',
 'date',
 'open',
 'high',
 'low',
 'close',
 'adj_close',
 'volume',
 'dividend',
 'shares_outstanding',
 'index_y',
 'simfinid_y',
 'currency',
 'fiscal_year',
 'fiscal_period',
 'report_date',
 'publish_date',
 'restated_date',
 'shares_basic',
 'shares_diluted',
 'revenue',
 'cost_of_revenue',
 'gross_profit',
 'operating_expenses',
 'selling_general_&_administrative',
 'research_&_development',
 'depreciation_&_amortization',
 'operating_income_loss',
 'non-operating_income_loss',
 'interest_expense_net',
 'pretax_income_loss_adj',
 'abnormal_gains_losses',
 'pretax_income_loss',
 'income_tax_expense_benefit_net',
 'income_loss_from_continuing_operations',
 'net_extraordinary_gains_losses',
 'net_income',
 'net_income_common',
 'ret',
 'eps',
 'profit_margin',
 'revenue_growth',
 'income_growth',
 'gross_margin',
 'operating_margin',
 'sga_ratio',
 'rd_ratio',
 'cost_ratio',
 'net_income_per_share',
 'tax_burden',
 'nonop_ratio',
 'abnormal_ra

# Feature Selection

<h5 style="color:red">Check multicolinearity</h5>

In [139]:
raw = ['ret', 'volume']

eng = ['mean_20', 'mean_60', 'vol_20', 'vol_60',
       'mom_20', 'mom_60', 'ema_cross', 'skew_20',
       'skew_60', 'kurt_20', 'kurt_60']

fund = ['eps', 'profit_margin', 'revenue_growth']

fund_eng = fund + eng

# Binary Labels

In [140]:
horizons = [1, 20, 60]

horizon_cols = [f'y_{h}' for h in horizons]

df = hf.create_binary_labels(df, horizons)

df[df["ticker"] == "AAPL"].tail(30)[['date', 'ret'] + horizon_cols]

Unnamed: 0,date,ret,y_1,y_20,y_60
59290,2024-10-15,0.011043,0.0,0.0,
59339,2024-10-16,-0.008858,1.0,0.0,
59388,2024-10-17,0.001605,1.0,0.0,
59437,2024-10-18,0.012258,1.0,0.0,
59486,2024-10-21,0.00629,0.0,0.0,
59535,2024-10-22,-0.002594,0.0,0.0,
59584,2024-10-23,-0.021615,0.0,0.0,
59633,2024-10-24,-0.000828,1.0,0.0,
59682,2024-10-25,0.00362,1.0,0.0,
59731,2024-10-28,0.008604,1.0,0.0,


### `NOTE: we can use sklearn train-test split`

In [141]:
train, val, test = hf.time_split(df)
len(train), len(val), len(test)

(42581, 9114, 9065)

# Modeling: Logistic Regression

## Horizon 1 days

In [169]:
TARGET = "y_1"

### Raw Features

In [171]:
data = hf.prune(df, raw, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, raw, TARGET)



Test AUC: 0.4985415036119433
Test accuracy: 0.5343629343629344
              precision    recall  f1-score   support

         0.0       0.51      0.07      0.13      4232
         1.0       0.54      0.94      0.68      4833

    accuracy                           0.53      9065
   macro avg       0.52      0.51      0.41      9065
weighted avg       0.52      0.53      0.42      9065

Coefficients:  [-0.0665677   0.00937894]
done




ValueError: Found input variables with inconsistent numbers of samples: [9065, 51597]

### Features Eng

In [144]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)

Test AUC: 0.5042326455210047
Test accuracy: 0.5257421150278293
              precision    recall  f1-score   support

         0.0       0.44      0.07      0.12      4010
         1.0       0.53      0.92      0.67      4614

    accuracy                           0.53      8624
   macro avg       0.49      0.50      0.40      8624
weighted avg       0.49      0.53      0.42      8624



### Features Fund

In [145]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)

Test AUC: 0.5039061800930102
Test accuracy: 0.5332406471183013
              precision    recall  f1-score   support

         0.0       0.58      0.02      0.03      3711
         1.0       0.53      0.99      0.69      4201

    accuracy                           0.53      7912
   macro avg       0.56      0.50      0.36      7912
weighted avg       0.56      0.53      0.38      7912



### Features Fund & Eng

In [146]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)

Test AUC: 0.5028364861844177
Test accuracy: 0.5046247357293869
              precision    recall  f1-score   support

         0.0       0.48      0.58      0.53      3546
         1.0       0.54      0.43      0.48      4022

    accuracy                           0.50      7568
   macro avg       0.51      0.51      0.50      7568
weighted avg       0.51      0.50      0.50      7568



## Horizon 20 days

In [147]:
TARGET = "y_20"

### Features Raw

In [148]:
data = hf.prune(df, raw, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, raw, TARGET)

Test AUC: 0.5279288362438861
Test accuracy: 0.4392240412648576
              precision    recall  f1-score   support

         0.0       0.40      0.84      0.54      3480
         1.0       0.64      0.19      0.29      5438

    accuracy                           0.44      8918
   macro avg       0.52      0.51      0.41      8918
weighted avg       0.54      0.44      0.39      8918



### Features Eng

In [149]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)

Test AUC: 0.5230507945911125
Test accuracy: 0.6059926860917777
              precision    recall  f1-score   support

         0.0       0.55      0.02      0.05      3355
         1.0       0.61      0.99      0.75      5122

    accuracy                           0.61      8477
   macro avg       0.58      0.51      0.40      8477
weighted avg       0.58      0.61      0.47      8477



### Features Fund

In [150]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)

Test AUC: 0.4742475486393481
Test accuracy: 0.5888065422949144
              precision    recall  f1-score   support

         0.0       0.60      0.02      0.03      3234
         1.0       0.59      0.99      0.74      4592

    accuracy                           0.59      7826
   macro avg       0.59      0.50      0.38      7826
weighted avg       0.59      0.59      0.45      7826



### Features Fund & Eng

In [151]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)

Test AUC: 0.5147096701169483
Test accuracy: 0.490926199758032
              precision    recall  f1-score   support

         0.0       0.43      0.67      0.52      3111
         1.0       0.60      0.37      0.46      4328

    accuracy                           0.49      7439
   macro avg       0.52      0.52      0.49      7439
weighted avg       0.53      0.49      0.48      7439



## Horizon 60 days

In [152]:
TARGET = "y_60"

### Features Raw

In [153]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)

Test AUC: 0.5155989360767208
Test accuracy: 0.41158499327874865
              precision    recall  f1-score   support

         0.0       0.32      0.82      0.46      2534
         1.0       0.74      0.23      0.35      5649

    accuracy                           0.41      8183
   macro avg       0.53      0.52      0.41      8183
weighted avg       0.61      0.41      0.38      8183



### Features Eng

In [154]:
data = hf.prune(df, eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, eng, TARGET)

Test AUC: 0.5155989360767208
Test accuracy: 0.41158499327874865
              precision    recall  f1-score   support

         0.0       0.32      0.82      0.46      2534
         1.0       0.74      0.23      0.35      5649

    accuracy                           0.41      8183
   macro avg       0.53      0.52      0.41      8183
weighted avg       0.61      0.41      0.38      8183



### Features Fund

In [155]:
data = hf.prune(df, fund, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund, TARGET)

Test AUC: 0.47889826190038687
Test accuracy: 0.6671511627906976
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      2519
         1.0       0.67      1.00      0.80      5049

    accuracy                           0.67      7568
   macro avg       0.33      0.50      0.40      7568
weighted avg       0.45      0.67      0.53      7568



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Features Fund & Eng

In [156]:
data = hf.prune(df, fund_eng, TARGET)

train, val, test = hf.time_split(data)

Models.run_logistic_regression(train, val, test, fund_eng, TARGET)

Test AUC: 0.5006284541505793
Test accuracy: 0.4386575685837627
              precision    recall  f1-score   support

         0.0       0.35      0.78      0.48      2412
         1.0       0.70      0.27      0.39      4769

    accuracy                           0.44      7181
   macro avg       0.53      0.52      0.43      7181
weighted avg       0.58      0.44      0.42      7181



# Results

In [180]:
features_map = {
    "raw": raw,
    "eng": eng,
    "fund": fund,
    "fund_eng": fund_eng
}

In [181]:
features_sets = ['raw', 'eng', 'fund', 'fund_eng']
targets = ['y_1', 'y_20', 'y_60']
res = []

for f, t in product(features_sets, targets):
    print(f, t, type(f), type(t))
    features = features_map[f]
    data = hf.prune(df, features, t)

    train, val, test = hf.time_split(data)
    print(f, t)
    val_acc, test_acc = Models.run_logistic_regression(train, val, test, features, t)

    res.append({
        "Horizon": t,
        "Features": f,
        "Val Accuracy": val_acc,
        "Test Accuracy": test_acc,
    })

res

raw y_1 <class 'str'> <class 'str'>
raw y_1




raw y_20 <class 'str'> <class 'str'>
raw y_20




raw y_60 <class 'str'> <class 'str'>
raw y_60




eng y_1 <class 'str'> <class 'str'>
eng y_1




eng y_20 <class 'str'> <class 'str'>
eng y_20




eng y_60 <class 'str'> <class 'str'>
eng y_60




fund y_1 <class 'str'> <class 'str'>
fund y_1




fund y_20 <class 'str'> <class 'str'>
fund y_20




fund y_60 <class 'str'> <class 'str'>
fund y_60




fund_eng y_1 <class 'str'> <class 'str'>
fund_eng y_1




fund_eng y_20 <class 'str'> <class 'str'>
fund_eng y_20




fund_eng y_60 <class 'str'> <class 'str'>
fund_eng y_60




[{'Horizon': 'y_1',
  'Features': 'raw',
  'Val Accuracy': 0.4923070078208573,
  'Test Accuracy': 0.4985415036119433},
 {'Horizon': 'y_20',
  'Features': 'raw',
  'Val Accuracy': 0.4979015938672902,
  'Test Accuracy': 0.5279288362438861},
 {'Horizon': 'y_60',
  'Features': 'raw',
  'Val Accuracy': 0.5024241822488832,
  'Test Accuracy': 0.5045861626797885},
 {'Horizon': 'y_1',
  'Features': 'eng',
  'Val Accuracy': 0.49965201317710106,
  'Test Accuracy': 0.5042357262457207},
 {'Horizon': 'y_20',
  'Features': 'eng',
  'Val Accuracy': 0.4910048156287877,
  'Test Accuracy': 0.5230469538782763},
 {'Horizon': 'y_60',
  'Features': 'eng',
  'Val Accuracy': 0.4983610356720456,
  'Test Accuracy': 0.5155989360767208},
 {'Horizon': 'y_1',
  'Features': 'fund',
  'Val Accuracy': 0.5093368795320723,
  'Test Accuracy': 0.5039061800930102},
 {'Horizon': 'y_20',
  'Features': 'fund',
  'Val Accuracy': 0.5285954153269234,
  'Test Accuracy': 0.4742475486393481},
 {'Horizon': 'y_60',
  'Features': 'fund

In [183]:
res_df = pd.DataFrame(res)
val_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Val Accuracy")
val_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.499652,0.491005,0.498361
fund,0.509337,0.528595,0.46038
fund_eng,0.506723,0.502529,0.483617
raw,0.492307,0.497902,0.502424


In [184]:
test_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Test Accuracy")
test_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.504236,0.523047,0.515599
fund,0.503906,0.474248,0.478898
fund_eng,0.502836,0.51471,0.500628
raw,0.498542,0.527929,0.504586
