# Random Forest Model

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras import layers, models

In [8]:
import helper_functions as hf
import Models as models
import MLP_Model as mlp
import Random_Forest_Model as rf

In [9]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from itertools import product
from sklearn.metrics import classification_report

In [10]:
df = hf.load_data()
df = hf.create_raw_features(df)
df = hf.create_fundamental_features(df)
df = hf.create_engineered_features(df)
df = hf.create_binary_labels(df,[1,20,60])

raw = ['ret','volume']
eng = [
    'mean_20', 'mean_60',
    'vol_20', 'vol_60',
    'mom_20', 'mom_60',
    'ema_12', 'ema_26', 'ema_cross',
    'skew_20', 'skew_60',
    'kurt_20', 'kurt_60',
    'vol_z'
]
fund = [
    'eps',
    'profit_margin',
    'revenue_growth',
    'income_growth',
    'gross_margin',
    'operating_margin',
    'sga_ratio',
    'rd_ratio',
    'cost_ratio',
    'net_income_per_share',
    'tax_burden',
    'nonop_ratio',
    'abnormal_ratio',
    'revenue_per_share',
    'da_ratio',
    'interest_coverage',
    'interest_burden'
]
fund_eng = fund + eng

#rt = df.pivot(index='date', columns='ticker', values='ret')


  df['revenue_growth'] = df.groupby('ticker')['revenue'].pct_change()
  df["income_growth"] = df.groupby("ticker")["net_income"].pct_change()


## Run all the models with variations in Features and Horizons

In [11]:
features_map = {
    "raw": raw,
    "eng": eng,
    "fund": fund,
    "fund_eng": fund_eng
}

In [13]:
features_sets = ['raw', 'eng', 'fund', 'fund_eng']
targets = ['y_1', 'y_20', 'y_60']
#features_sets = ['raw']
#targets = ['y_1']
res = []

for f, t in product(features_sets, targets):
    print(f, t, type(f), type(t))
    features = features_map[f]
    data = hf.prune(df, features, t)

    train, val, test = hf.time_split(data)
    print(f, t)
    ## not returning accuracy right now
    ### FIX
    #val_acc, test_acc = rf.run_optimize_eval_RF(train, val, test, features, t)
    val_auc, val_acc, test_auc, test_acc = rf.run_optimize_eval_RF(train, val, test, features, t)


    res.append({
        "Horizon": t,
        "Features": f,
        "Val AUC": val_auc,
        "Val Accuracy": val_acc,
        "Test AUC": test_auc,
        "Test Accuracy": test_acc
    })


res

raw y_1 <class 'str'> <class 'str'>
raw y_1
              precision    recall  f1-score   support

           0       0.47      0.41      0.44      4232
           1       0.53      0.59      0.56      4833

    accuracy                           0.51      9065
   macro avg       0.50      0.50      0.50      9065
weighted avg       0.50      0.51      0.50      9065

OOB Score: 0.506366649223792

Top 10 Feature Importances:
ret       0.508936
volume    0.491064
dtype: float64
raw y_20 <class 'str'> <class 'str'>
raw y_20
              precision    recall  f1-score   support

           0       0.40      0.74      0.52      3480
           1       0.63      0.28      0.39      5438

    accuracy                           0.46      8918
   macro avg       0.51      0.51      0.45      8918
weighted avg       0.54      0.46      0.44      8918

OOB Score: 0.5000098400015744

Top 10 Feature Importances:
volume    0.508437
ret       0.491563
dtype: float64
raw y_60 <class 'str'> <class 'st

[{'Horizon': 'y_1',
  'Features': 'raw',
  'Val AUC': 0.5053005990896623,
  'Val Accuracy': 0.5061443932411674,
  'Test AUC': 0.498329214673693,
  'Test Accuracy': 0.505019305019305},
 {'Horizon': 'y_20',
  'Features': 'raw',
  'Val AUC': 0.5133946082968286,
  'Val Accuracy': 0.46180439388870304,
  'Test AUC': 0.5076126703106704,
  'Test Accuracy': 0.4610899304776856},
 {'Horizon': 'y_60',
  'Features': 'raw',
  'Val AUC': 0.5150323667352983,
  'Val Accuracy': 0.5043237634036666,
  'Test AUC': 0.5138342882048625,
  'Test Accuracy': 0.4902597402597403},
 {'Horizon': 'y_1',
  'Features': 'eng',
  'Val AUC': 0.49794541113376506,
  'Val Accuracy': 0.49613743802605786,
  'Test AUC': 0.4982897654001105,
  'Test Accuracy': 0.4967532467532468},
 {'Horizon': 'y_20',
  'Features': 'eng',
  'Val AUC': 0.4898147385872731,
  'Val Accuracy': 0.49132066619751347,
  'Test AUC': 0.47768973557855976,
  'Test Accuracy': 0.480712516220361},
 {'Horizon': 'y_60',
  'Features': 'eng',
  'Val AUC': 0.47879026

In [14]:
res_df = pd.DataFrame(res)
val_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Val Accuracy")
val_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.496137,0.491321,0.558309
fund,0.522162,0.5,0.594318
fund_eng,0.525424,0.565517,0.636905
raw,0.506144,0.461804,0.504324


In [15]:
test_df = res_df.pivot(index = "Features", columns = "Horizon", values = "Test Accuracy")
test_df

Horizon,y_1,y_20,y_60
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,0.496753,0.480713,0.56874
fund,0.513043,0.505495,0.485227
fund_eng,0.472727,0.526012,0.535329
raw,0.505019,0.46109,0.49026


In [16]:
val_df = val_df.rename(columns={
    "y_1": "1-day horizon",
    "y_20": "20-day horizon",
    "y_60": "60-day horizon"
})

val_df = val_df.rename(index={
    "raw": "Raw",
    "eng": "Engineered",
    "fund": "Fundamental",
    "fund_eng": "Fund + Eng"
})

val_st = (
    val_df.style
      .set_caption("Table 1: Summary Validation AUC")
      .format("{:.4f}")              # numeric formatting
      .set_table_styles([
          {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
          {"selector": "th, td", "props": "padding:10px;"},
          {"selector": "th", "props": "font-size:12pt;"},
          {"selector": "td", "props": "font-size:11pt;"},
      ])
      .set_properties(**{"min-width": "120px"})
)
val_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.4961,0.4913,0.5583
Fundamental,0.5222,0.5,0.5943
Fund + Eng,0.5254,0.5655,0.6369
Raw,0.5061,0.4618,0.5043


In [17]:
test_df = test_df.rename(columns={
    "y_1": "1-day horizon",
    "y_20": "20-day horizon",
    "y_60": "60-day horizon"
})

test_df = test_df.rename(index={
    "raw": "Raw",
    "eng": "Engineered",
    "fund": "Fundamental",
    "fund_eng": "Fund + Eng"
})

test_st = (
    test_df.style
      .set_caption("Table 2: Summary Testing AUC")
      .format("{:.4f}")              # numeric formatting
      .set_table_styles([
          {"selector": "table", "props": "width:100%; border-collapse:separate; border-spacing:10px;"},
          {"selector": "th, td", "props": "padding:10px;"},
          {"selector": "th", "props": "font-size:12pt;"},
          {"selector": "td", "props": "font-size:11pt;"},
      ])
      .set_properties(**{"min-width": "120px"})
)
test_st

Horizon,1-day horizon,20-day horizon,60-day horizon
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engineered,0.4968,0.4807,0.5687
Fundamental,0.513,0.5055,0.4852
Fund + Eng,0.4727,0.526,0.5353
Raw,0.505,0.4611,0.4903


In [18]:
import dataframe_image as dfi
# dfi.export(styled, "table1.png")
dfi.export(test_st, "random_forest_table1.png", table_conversion="matplotlib")
dfi.export(val_st, "random_forest_table2.png", table_conversion="matplotlib")

ModuleNotFoundError: No module named 'dataframe_image'