In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [2]:
dataset = utils.get_features(features_categories=['arctan_pct_change'])

In [5]:
dataset['price_avg_pct_change_next_three_months'].describe()

count    52877.000000
mean         0.003427
std          0.052282
min         -0.168366
25%         -0.008170
50%          0.001418
75%          0.011758
max          5.064149
Name: price_avg_pct_change_next_three_months, dtype: float64

In [7]:
dataset[dataset['price_avg_pct_change_next_three_months'] > 0.04]

Unnamed: 0,symbol,sector,fiscal_date_ending,avg_interest_rate,avg_treasury_yield,avg_unemployment_rate,inflation,price_avg_pct_change_three_months,price_volatility_three_months,price_avg_pct_change_next_three_months,...,operating_income_arctan_pct_change,payments_for_repurchase_of_equity_arctan_pct_change,proceeds_from_issuance_of_long_term_debt_and_capital_securities_net_arctan_pct_change,property_plant_equipment_arctan_pct_change,total_assets_arctan_pct_change,total_current_assets_arctan_pct_change,total_current_liabilities_arctan_pct_change,total_liabilities_arctan_pct_change,total_revenue_arctan_pct_change,total_shareholder_equity_arctan_pct_change
0,NEGG,LIFE SCIENCES,2011-12-31,0.103333,2.036667,8.266667,3.156842,-0.010763,0.068063,0.052330,...,-0.860448,0.000000,0.0,-0.005268,-0.207480,-0.223584,-0.590515,-0.588085,-0.070946,0.004828
180,JKS,MANUFACTURING,2018-12-31,2.403333,2.653333,3.866667,2.442583,0.008197,0.120519,0.050504,...,0.256635,0.000000,0.0,-0.700760,0.014928,0.005050,-0.003766,0.011696,0.151999,0.014667
222,STNG,ENERGY & TRANSPORTATION,2018-12-31,2.403333,2.653333,3.866667,2.442583,-0.005378,0.125385,0.801848,...,-1.161759,0.000000,0.0,-0.010555,0.067756,0.759401,0.000566,0.008346,0.384342,0.177445
225,HIVE,TECHNOLOGY,2018-12-31,2.403333,2.653333,3.866667,2.442583,-0.058758,0.109008,0.048459,...,0.453670,0.000000,0.0,0.000000,-0.253244,-0.308488,0.552548,0.524650,-1.067935,-0.275433
228,ASND,LIFE SCIENCES,2018-12-31,2.403333,2.653333,3.866667,2.442583,-0.005283,0.084671,0.066674,...,-0.063487,0.000000,0.0,0.368714,-0.048483,-0.103844,0.248971,0.248971,1.568891,-0.079237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52496,TIGR,FINANCE,2023-06-30,5.260000,4.150000,3.700000,4.642850,-0.006801,0.087137,0.064439,...,-0.783827,0.000000,0.0,-0.089005,-0.014084,-0.013020,-0.022976,-0.023647,0.034446,0.043755
52555,CSPI,TECHNOLOGY,2023-06-30,5.260000,4.150000,3.700000,4.642850,-0.011833,0.075174,0.042579,...,0.959277,1.570795,0.0,-0.082007,0.023911,-0.046884,-0.012091,-0.048789,0.322835,0.066948
52579,DLO,TRADE & SERVICES,2023-06-30,5.260000,4.150000,3.700000,4.642850,-0.005027,0.080443,0.053280,...,0.208957,0.000000,0.0,0.054344,0.217359,0.228734,0.424829,0.425001,0.172014,-0.026821
52700,SKIL,REAL ESTATE & CONSTRUCTION,2023-07-31,5.330000,4.450000,3.833333,4.642850,0.014760,0.133175,1.212449,...,-0.277643,-0.719097,0.0,0.046787,-0.040415,-0.094640,-0.082512,-0.034253,0.044396,-0.053644


# Random Forrest

In [2]:
from sklearn.ensemble import RandomForestRegressor

dataset = utils.get_features(features_categories=['arctan_pct_change'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    remainder='passthrough'
)

random_forrest = RandomForestRegressor(n_estimators=100)
random_forrest_regressor = make_pipeline(
    column_transformer,
    random_forrest
)

random_forrest_regressor.fit(X_train, y_train)
y_pred_random_forrest = random_forrest_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_random_forrest))

Mean absolute error: 0.02


In [27]:
random_forrest_regressor.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder', OneHotEncoder(), ['sector'])]),
 'randomforestregressor': RandomForestRegressor()}

In [28]:
utils.get_feature_importance_sorted(
    feature_importance_scores=random_forrest_regressor.named_steps['randomforestregressor'].feature_importances_,
    feature_names=random_forrest_regressor.named_steps['columntransformer'].get_feature_names_out()
)

[('remainder__price_avg_pct_change_three_months', 0.20945009427850544),
 ('remainder__price_volatility_three_months', 0.13415223412601202),
 ('remainder__gross_profit_arctan_pct_change', 0.04793211263613009),
 ('remainder__payments_for_repurchase_of_equity_arctan_pct_change',
  0.041327137760405454),
 ('remainder__operating_cashflow_arctan_pct_change', 0.0382234404450389),
 ('remainder__cashflow_from_investment_arctan_pct_change',
  0.03522573261405776),
 ('remainder__cashflow_from_financing_arctan_pct_change',
  0.029562971378163307),
 ('remainder__property_plant_equipment_arctan_pct_change',
  0.027551993416335643),
 ('remainder__total_assets_arctan_pct_change', 0.026958764258755853),
 ('remainder__capital_expenditures_arctan_pct_change', 0.02645145927180378),
 ('remainder__avg_treasury_yield', 0.02639081423152141),
 ('remainder__avg_interest_rate', 0.026312321986542635),
 ('remainder__proceeds_from_issuance_of_long_term_debt_and_capital_securities_net_arctan_pct_change',
  0.0241841

# Lasso Regression

In [4]:
from sklearn.linear_model import Lasso

dataset = utils.get_features(features_categories=['arctan_ratio'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

lasso_regressor = make_pipeline(
    column_transformer,
    Lasso()
)

lasso_regressor.fit(X_train, y_train)
y_pred_lasso = lasso_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_lasso))
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred_lasso))

Mean absolute error: 0.02
Coefficient of determination: -0.01


# XGBoost

In [2]:
import xgboost as xgb

dataset = utils.get_features(features_categories=['arctan_pct_change'])

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'price_avg_pct_change_next_three_months']
target_col = 'price_avg_pct_change_next_three_months'

y_train = train_set[target_col]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[target_col]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    remainder='passthrough'
)

xgb_regressor = make_pipeline(
    column_transformer,
    xgb.XGBRegressor(
        objective = 'reg:absoluteerror',
        n_estimators = 100,
        learning_rate = 0.1,
        # booster='dart',
    )
)

xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred_xgb))

Mean absolute error: 0.02
