In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from analytics.machine_learning.price_prediction_with_fundamentals import utils

# Random Forest

In [2]:
from sklearn.ensemble import RandomForestRegressor

# Train and evaluate a random-forest regressor on the 'arctan_pct_change'
# feature category, predicting the 'avg_next_three_months_price' column.
target_col = 'avg_next_three_months_price'

dataset = utils.get_features(features_categories=['arctan_pct_change'], target=target_col)

# Split around the 2023-06-01 cutoff; the exact boundary handling lives in
# utils.split_data_to_train_and_test.
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023, 6, 1)
)

# Columns excluded from the feature matrix (the target itself must never be a feature).
cols_to_drop = ['symbol', 'fiscal_date_ending', target_col]

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and pass every other column through unchanged.
# handle_unknown='ignore': a sector value seen only in the test split is encoded
# as all zeros instead of making predict() raise a ValueError.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'), ['sector']
    ),
    remainder='passthrough'
)

# Fixed random_state so the metrics and feature importances below are
# reproducible under Restart & Run All (bootstrap sampling is otherwise random).
random_forrest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forrest_regressor = make_pipeline(
    column_transformer,
    random_forrest
)

random_forrest_regressor.fit(X_train, y_train)
y_pred_random_forrest = random_forrest_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_random_forrest))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred_random_forrest))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test, y_pred_random_forrest))

Mean absolute error: 6.01
Coefficient of determination: 0.98
Mean absolute pct error: 0.13


In [3]:
# Pair each post-transformation feature name with the importance score the
# fitted forest assigned to it. Step names are the lowercase class names that
# make_pipeline generates automatically.
fitted_forest = random_forrest_regressor.named_steps['randomforestregressor']
fitted_transformer = random_forrest_regressor.named_steps['columntransformer']

utils.get_feature_importance_sorted(
    feature_importance_scores=fitted_forest.feature_importances_,
    feature_names=fitted_transformer.get_feature_names_out(),
)

[('remainder__avg_three_months_price', 0.9733832082597829),
 ('remainder__price_volatility_three_months', 0.0038380903835893393),
 ('remainder__avg_natural_gas_price', 0.002119791604677019),
 ('remainder__cashflow_from_investment_arctan_pct_change',
  0.001301497045028639),
 ('remainder__avg_oil_price', 0.001231882734403403),
 ('remainder__capital_expenditures_arctan_pct_change', 0.0011500059204233),
 ('remainder__total_assets_arctan_pct_change', 0.0010191249237280627),
 ('remainder__property_plant_equipment_arctan_pct_change',
  0.0010142223554987359),
 ('remainder__total_shareholder_equity_arctan_pct_change',
  0.0009927523267590724),
 ('remainder__avg_treasury_yield', 0.0009089359259042175),
 ('remainder__total_current_liabilities_arctan_pct_change',
  0.0008701669464561718),
 ('remainder__inventory_arctan_pct_change', 0.0008165697774997011),
 ('remainder__total_liabilities_arctan_pct_change', 0.0008125340612381433),
 ('remainder__net_interest_income_arctan_pct_change', 0.0007620787

# Lasso Regression

In [3]:
from sklearn.linear_model import Lasso

# Train and evaluate a Lasso (L1-regularised linear) regressor on the
# 'arctan_pct_change' feature category, predicting 'avg_next_three_months_price'.
target_col = 'avg_next_three_months_price'

dataset = utils.get_features(features_categories=['arctan_pct_change'], target=target_col)

# Split around the 2023-06-01 cutoff; the exact boundary handling lives in
# utils.split_data_to_train_and_test.
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023, 6, 1)
)

# Columns excluded from the feature matrix. Unlike the other model cells, the
# commodity-index / oil / natural-gas price columns are dropped here as well.
# NOTE(review): the reason for excluding them only for Lasso is not recorded — confirm.
cols_to_drop = [
    'symbol',
    'fiscal_date_ending',
    target_col,
    'avg_global_commodities_index_value',
    'avg_oil_price',
    'avg_natural_gas_price',
]

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and pass every other column through unchanged.
# handle_unknown='ignore': a sector value seen only in the test split is encoded
# as all zeros instead of making predict() raise a ValueError.
# NOTE(review): the passthrough features are not standardised, and the L1 penalty
# is scale-sensitive; consider remainder=StandardScaler() (already imported) —
# left unchanged here so the recorded metrics stay comparable.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'), ['sector']
    ),
    remainder='passthrough'
)

# Lasso with scikit-learn defaults.
lasso_regressor = make_pipeline(
    column_transformer,
    Lasso()
)

lasso_regressor.fit(X_train, y_train)
y_pred_lasso = lasso_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_lasso))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred_lasso))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test, y_pred_lasso))

Mean absolute error: 5.79
Coefficient of determination: 0.99
Mean absolute pct error: 0.12


# XGBoost

In [2]:
import xgboost as xgb

# Train and evaluate an XGBoost regressor on the 'arctan_pct_change' feature
# category, predicting 'avg_next_three_months_price'.
target_col = 'avg_next_three_months_price'

dataset = utils.get_features(features_categories=['arctan_pct_change'], target=target_col)

# Split around the 2023-06-01 cutoff; the exact boundary handling lives in
# utils.split_data_to_train_and_test.
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023, 6, 1)
)

# Columns excluded from the feature matrix (the target itself must never be a feature).
cols_to_drop = ['symbol', 'fiscal_date_ending', target_col]

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and pass every other column through unchanged.
# handle_unknown='ignore': a sector value seen only in the test split is encoded
# as all zeros instead of making predict() raise a ValueError.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'), ['sector']
    ),
    remainder='passthrough'
)

# Gradient-boosted trees trained with the absolute-error objective.
# Alternative left disabled by the author: booster='dart'.
xgb_regressor = make_pipeline(
    column_transformer,
    xgb.XGBRegressor(
        objective='reg:absoluteerror',
        n_estimators=100,
        learning_rate=0.1,
    )
)

xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_xgb))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred_xgb))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test, y_pred_xgb))

Mean absolute error: 8.02
Coefficient of determination: 0.72
Mean absolute pct error: 0.13
