# Random Predictor Baseline

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
import helper_functions as hf
import RandomPrediction as rp
import dataframe_image as dfi
from itertools import product

In [11]:
# Build the modelling frame through the helper_functions pipeline:
# load -> engineered + fundamental features -> binary labels.
# NOTE(review): the horizons below presumably yield the y_1 / y_20 / y_60
# target columns used by the experiment loop — confirm in helper_functions.
label_horizons = [1, 20, 60]
df = hf.create_binary_labels(
    hf.create_engineered_plus_fundamental_features(hf.load_data()),
    label_horizons,
)

# Feature groups, expressed as lists of column names.

# Unprocessed inputs.
raw = ['ret', 'volume']

# Engineered features. The windowed statistics all come in a 20 and a 60
# variant, so those names are generated; the rest are listed explicitly.
_WINDOWS = (20, 60)
eng = (
    [f'{stat}_{window}' for stat in ('mean', 'vol', 'mom') for window in _WINDOWS]
    + ['ema_12', 'ema_26', 'ema_cross']
    + [f'{stat}_{window}' for stat in ('skew', 'kurt') for window in _WINDOWS]
    + ['vol_z']
)

# Fundamental features.
fund = [
    'eps', 'profit_margin', 'revenue_growth', 'income_growth',
    'gross_margin', 'operating_margin', 'sga_ratio', 'rd_ratio',
    'cost_ratio', 'net_income_per_share', 'tax_burden', 'nonop_ratio',
    'abnormal_ratio', 'revenue_per_share', 'da_ratio',
    'interest_coverage', 'interest_burden',
]

# Combined set: fundamentals first, then engineered features.
fund_eng = [*fund, *eng]

#rt = df.pivot(index='date', columns='ticker', values='ret')


  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()
  df["income_growth"] = df.groupby("ticker")["net_income"].pct_change()


## Run the random baseline for every combination of feature set and horizon

In [12]:
# Lookup from feature-set name to its list of column names.
features_map = dict(raw=raw, eng=eng, fund=fund, fund_eng=fund_eng)

In [14]:
# Display the feature-set mapping as a sanity check.
features_map

{'raw': ['ret', 'volume'],
 'eng': ['mean_20',
  'mean_60',
  'vol_20',
  'vol_60',
  'mom_20',
  'mom_60',
  'ema_12',
  'ema_26',
  'ema_cross',
  'skew_20',
  'skew_60',
  'kurt_20',
  'kurt_60',
  'vol_z'],
 'fund': ['eps',
  'profit_margin',
  'revenue_growth',
  'income_growth',
  'gross_margin',
  'operating_margin',
  'sga_ratio',
  'rd_ratio',
  'cost_ratio',
  'net_income_per_share',
  'tax_burden',
  'nonop_ratio',
  'abnormal_ratio',
  'revenue_per_share',
  'da_ratio',
  'interest_coverage',
  'interest_burden'],
 'fund_eng': ['eps',
  'profit_margin',
  'revenue_growth',
  'income_growth',
  'gross_margin',
  'operating_margin',
  'sga_ratio',
  'rd_ratio',
  'cost_ratio',
  'net_income_per_share',
  'tax_burden',
  'nonop_ratio',
  'abnormal_ratio',
  'revenue_per_share',
  'da_ratio',
  'interest_coverage',
  'interest_burden',
  'mean_20',
  'mean_60',
  'vol_20',
  'vol_60',
  'mom_20',
  'mom_60',
  'ema_12',
  'ema_26',
  'ema_cross',
  'skew_20',
  'skew_60',
  'ku

In [15]:
features_sets = ['raw', 'eng', 'fund', 'fund_eng']
targets = ['y_1', 'y_20', 'y_60']

# One record per (feature set, target) combination; turned into a DataFrame
# by the reporting cells.
res = []

# NOTE(review): a random baseline will give different numbers on every run
# unless rp.run_random_baseline seeds its RNG — confirm in RandomPrediction.
for feature_set, target in product(features_sets, targets):
    # Single progress line per combination.
    print(feature_set, target)

    # features_map values are already lists; list() takes a shallow copy so
    # the helpers below receive their own list object.
    features = list(features_map[feature_set])

    # NOTE(review): hf.prune presumably restricts df to the rows/columns
    # usable for this feature/target pair — confirm in helper_functions.
    data = hf.prune(df, features, target)

    # time_split returns three parts, unpacked as train / validation / test.
    train, val, test = hf.time_split(data)

    val_auc, val_acc, test_auc, test_acc = rp.run_random_baseline(
        train, val, test, features, target
    )

    res.append({
        "Horizon": target,
        "Features": feature_set,
        "Val AUC": val_auc,
        "Val Accuracy": val_acc,
        "Test AUC": test_auc,
        "Test Accuracy": test_acc
    })

res


raw y_1 <class 'str'> <class 'str'>
raw y_1
raw y_20 <class 'str'> <class 'str'>
raw y_20
raw y_60 <class 'str'> <class 'str'>
raw y_60
eng y_1 <class 'str'> <class 'str'>
eng y_1
eng y_20 <class 'str'> <class 'str'>
eng y_20
eng y_60 <class 'str'> <class 'str'>
eng y_60
fund y_1 <class 'str'> <class 'str'>
fund y_1
fund y_20 <class 'str'> <class 'str'>
fund y_20
fund y_60 <class 'str'> <class 'str'>
fund y_60
fund_eng y_1 <class 'str'> <class 'str'>
fund_eng y_1
fund_eng y_20 <class 'str'> <class 'str'>
fund_eng y_20
fund_eng y_60 <class 'str'> <class 'str'>
fund_eng y_60


[{'Horizon': 'y_1',
  'Features': 'raw',
  'Val AUC': 0.5049348487574257,
  'Val Accuracy': 0.5026333113890717,
  'Test AUC': 0.5013900476286025,
  'Test Accuracy': 0.499834528405957},
 {'Horizon': 'y_20',
  'Features': 'raw',
  'Val AUC': 0.5013404591545769,
  'Val Accuracy': 0.49871751979480317,
  'Test AUC': 0.4970457994614314,
  'Test Accuracy': 0.4934962996187486},
 {'Horizon': 'y_60',
  'Features': 'raw',
  'Val AUC': 0.49509227774296466,
  'Val Accuracy': 0.4921019255159691,
  'Test AUC': 0.5097128437063754,
  'Test Accuracy': 0.5041743970315399},
 {'Horizon': 'y_1',
  'Features': 'eng',
  'Val AUC': 0.5024319079117537,
  'Val Accuracy': 0.5096275798454976,
  'Test AUC': 0.49530940745232716,
  'Test Accuracy': 0.49907235621521334},
 {'Horizon': 'y_20',
  'Features': 'eng',
  'Val AUC': 0.4904787854531263,
  'Val Accuracy': 0.4910860896082571,
  'Test AUC': 0.5206939353398536,
  'Test Accuracy': 0.5166922260233573},
 {'Horizon': 'y_60',
  'Features': 'eng',
  'Val AUC': 0.5029426

In [16]:
# Validation accuracy reshaped to one row per feature set and one column per
# horizon, with human-readable row/column labels.
res_df = pd.DataFrame(res)
val_acc_df = res_df.pivot(
    index="Features", columns="Horizon", values="Val Accuracy"
).rename(
    columns={
        "y_1": "1-day horizon",
        "y_20": "20-day horizon",
        "y_60": "60-day horizon",
    },
    index={
        "raw": "Raw",
        "eng": "Engineered",
        "fund": "Fundamental",
        "fund_eng": "Fund + Eng",
    },
)

# Build the Styler step by step: caption, 4-decimal formatting, CSS rules,
# and a minimum cell width.
val_acc_st = val_acc_df.style.set_caption("Table 1: Random Predictor - Summary Validation Accuracy")
val_acc_st = val_acc_st.format("{:.4f}")
val_acc_st = val_acc_st.set_table_styles([
    {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
    {"selector": "th, td", "props": "padding:10px;"},
    {"selector": "th", "props": "font-size:12pt;"},
    {"selector": "td", "props": "font-size:11pt;"},
])
val_acc_st = val_acc_st.set_properties(**{"min-width": "120px"})

# Save the styled table as a PNG using the matplotlib conversion.
dfi.export(val_acc_st, "random_predictor_table_validation_accuracy.png", table_conversion="matplotlib")

In [17]:
# Render the styled validation-accuracy table inline.
val_acc_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.5096,0.4911,0.4984
Fundamental,0.4886,0.5132,0.5159
Fund + Eng,0.478,0.5057,0.4964
Raw,0.5026,0.4987,0.4921


In [18]:
# Validation AUC reshaped to one row per feature set and one column per
# horizon, with human-readable row/column labels.
res_df = pd.DataFrame(res)
val_auc_df = res_df.pivot(
    index="Features", columns="Horizon", values="Val AUC"
).rename(
    columns={
        "y_1": "1-day horizon",
        "y_20": "20-day horizon",
        "y_60": "60-day horizon",
    },
    index={
        "raw": "Raw",
        "eng": "Engineered",
        "fund": "Fundamental",
        "fund_eng": "Fund + Eng",
    },
)

# Build the Styler step by step: caption, 4-decimal formatting, CSS rules,
# and a minimum cell width.
val_auc_st = val_auc_df.style.set_caption("Table 1: Random Predictor - Summary Validation AUC")
val_auc_st = val_auc_st.format("{:.4f}")
val_auc_st = val_auc_st.set_table_styles([
    {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
    {"selector": "th, td", "props": "padding:10px;"},
    {"selector": "th", "props": "font-size:12pt;"},
    {"selector": "td", "props": "font-size:11pt;"},
])
val_auc_st = val_auc_st.set_properties(**{"min-width": "120px"})

# Save the styled table as a PNG using the matplotlib conversion.
dfi.export(val_auc_st, "random_predictor_table_validation_AUC.png", table_conversion="matplotlib")

In [19]:
# Render the styled validation-AUC table inline.
val_auc_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.5024,0.4905,0.5029
Fundamental,0.4925,0.5094,0.513
Fund + Eng,0.4856,0.4975,0.4962
Raw,0.5049,0.5013,0.4951


In [20]:
# Test accuracy reshaped to one row per feature set and one column per
# horizon, with human-readable row/column labels. Reuses res_df from the
# earlier reporting cell.
test_acc_df = res_df.pivot(
    index="Features", columns="Horizon", values="Test Accuracy"
).rename(
    columns={
        "y_1": "1-day horizon",
        "y_20": "20-day horizon",
        "y_60": "60-day horizon",
    },
    index={
        "raw": "Raw",
        "eng": "Engineered",
        "fund": "Fundamental",
        "fund_eng": "Fund + Eng",
    },
)

# Build the Styler step by step: caption, 4-decimal formatting, CSS rules,
# and a minimum cell width.
test_acc_st = test_acc_df.style.set_caption("Table 2: Random Predictor - Summary Testing Accuracy")
test_acc_st = test_acc_st.format("{:.4f}")
test_acc_st = test_acc_st.set_table_styles([
    {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
    {"selector": "th, td", "props": "padding:10px;"},
    {"selector": "th", "props": "font-size:12pt;"},
    {"selector": "td", "props": "font-size:11pt;"},
])
test_acc_st = test_acc_st.set_properties(**{"min-width": "120px"})

# Save the styled table as a PNG using the matplotlib conversion.
dfi.export(test_acc_st, "random_predictor_table_test_accuracy.png", table_conversion="matplotlib")

In [21]:
# Render the styled test-accuracy table inline.
test_acc_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.4991,0.5167,0.5035
Fundamental,0.4924,0.5033,0.4955
Fund + Eng,0.475,0.4983,0.5042
Raw,0.4998,0.4935,0.5042


In [22]:
# Test AUC reshaped to one row per feature set and one column per horizon,
# with human-readable row/column labels. Reuses res_df from the earlier
# reporting cell.
test_auc_df = res_df.pivot(
    index="Features", columns="Horizon", values="Test AUC"
).rename(
    columns={
        "y_1": "1-day horizon",
        "y_20": "20-day horizon",
        "y_60": "60-day horizon",
    },
    index={
        "raw": "Raw",
        "eng": "Engineered",
        "fund": "Fundamental",
        "fund_eng": "Fund + Eng",
    },
)

# Build the Styler step by step: caption, 4-decimal formatting, CSS rules,
# and a minimum cell width.
test_auc_st = test_auc_df.style.set_caption("Table 2: Random Predictor - Summary Testing AUC")
test_auc_st = test_auc_st.format("{:.4f}")
test_auc_st = test_auc_st.set_table_styles([
    {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
    {"selector": "th, td", "props": "padding:10px;"},
    {"selector": "th", "props": "font-size:12pt;"},
    {"selector": "td", "props": "font-size:11pt;"},
])
test_auc_st = test_auc_st.set_properties(**{"min-width": "120px"})

# Save the styled table as a PNG using the matplotlib conversion.
dfi.export(test_auc_st, "random_predictor_table_test_AUC.png", table_conversion="matplotlib")

In [23]:
# Render the styled test-AUC table inline.
test_auc_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.4953,0.5207,0.5102
Fundamental,0.482,0.4957,0.495
Fund + Eng,0.4758,0.4972,0.5049
Raw,0.5014,0.497,0.5097
