In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from analytics.machine_learning.price_prediction_with_fundamentals import utils

# Random Forrest

In [2]:
from sklearn.ensemble import RandomForestRegressor

dataset = utils.get_features(features_categories=['arctan_pct_change'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    remainder='passthrough'
)

random_forrest = RandomForestRegressor(n_estimators=100)
random_forrest_regressor = make_pipeline(
    column_transformer,
    random_forrest
)

random_forrest_regressor.fit(X_train, y_train)
y_pred_random_forrest = random_forrest_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_random_forrest))

Mean absolute error: 0.02


In [27]:
random_forrest_regressor.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder', OneHotEncoder(), ['sector'])]),
 'randomforestregressor': RandomForestRegressor()}

In [28]:
utils.get_feature_importance_sorted(
    feature_importance_scores=random_forrest_regressor.named_steps['randomforestregressor'].feature_importances_,
    feature_names=random_forrest_regressor.named_steps['columntransformer'].get_feature_names_out()
)

[('remainder__price_avg_pct_change_three_months', 0.20945009427850544),
 ('remainder__price_volatility_three_months', 0.13415223412601202),
 ('remainder__gross_profit_arctan_pct_change', 0.04793211263613009),
 ('remainder__payments_for_repurchase_of_equity_arctan_pct_change',
  0.041327137760405454),
 ('remainder__operating_cashflow_arctan_pct_change', 0.0382234404450389),
 ('remainder__cashflow_from_investment_arctan_pct_change',
  0.03522573261405776),
 ('remainder__cashflow_from_financing_arctan_pct_change',
  0.029562971378163307),
 ('remainder__property_plant_equipment_arctan_pct_change',
  0.027551993416335643),
 ('remainder__total_assets_arctan_pct_change', 0.026958764258755853),
 ('remainder__capital_expenditures_arctan_pct_change', 0.02645145927180378),
 ('remainder__avg_treasury_yield', 0.02639081423152141),
 ('remainder__avg_interest_rate', 0.026312321986542635),
 ('remainder__proceeds_from_issuance_of_long_term_debt_and_capital_securities_net_arctan_pct_change',
  0.0241841

# Lasso Regression

In [2]:
from sklearn.linear_model import Lasso

dataset = utils.get_features(features_categories=['arctan_pct_change'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    remainder='passthrough'
)

lasso_regressor = make_pipeline(
    column_transformer,
    Lasso()
)

lasso_regressor.fit(X_train, y_train)
y_pred_lasso = lasso_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_lasso))

Mean absolute error: 0.02


# XGBoost

In [2]:
import xgboost as xgb

dataset = utils.get_features(features_categories=['arctan_pct_change'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    remainder='passthrough'
)

xgb_regressor = make_pipeline(
    column_transformer,
    xgb.XGBRegressor(
        objective = 'reg:absoluteerror',
        n_estimators = 100,
        learning_rate = 0.1,
        # booster='dart',
    )
)

xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_xgb))

Mean absolute error: 0.02
