In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [2]:
# Load the modelling dataset through the project helper.
# NOTE(review): only_value_change_columns=True presumably keeps only the
# "*_value_change" variants of the fundamentals — confirm in utils.get_dataset.
dataset = utils.get_dataset(only_value_change_columns=True)

In [3]:
# Summary statistics of the regression target (the column the models below are fit on).
dataset['avg_next_three_months_price'].describe()

count    52633.000000
mean        68.113406
std        153.437300
min          0.113538
25%         15.354615
50%         33.767692
75%         72.607500
max       6211.739167
Name: avg_next_three_months_price, dtype: float64

In [5]:
# Inspect which columns the loaded dataset exposes.
dataset.columns


Index(['change_in_cash_and_cash_equivalents', 'change_in_exchange_rate',
       'accumulated_depreciation_amortization_ppe_value_change',
       'capital_expenditures_value_change',
       'capital_lease_obligations_value_change',
       'cash_and_cash_equivalents_at_carrying_value_value_change',
       'cash_and_short_term_investments_value_change',
       'cashflow_from_financing_value_change',
       'cashflow_from_investment_value_change',
       'change_in_inventory_value_change',
       'change_in_operating_assets_value_change',
       'change_in_operating_liabilities_value_change',
       'change_in_receivables_value_change', 'common_stock_value_change',
       'common_stock_shares_outstanding_value_change',
       'comprehensive_income_net_of_tax_value_change',
       'cost_of_goods_and_services_sold_value_change',
       'cost_of_revenue_value_change', 'current_accounts_payable_value_change',
       'current_debt_value_change', 'current_long_term_debt_value_change',
       'cu

In [3]:
import datetime as dt

# Split the dataset into a train and a test partition around a fixed date.
# NOTE(review): presumably rows before the cutoff go to train and the rest to
# test — confirm in utils.split_data_to_train_and_test.
split_cutoff = dt.datetime(2023, 6, 1)
train_set, test_set = utils.split_data_to_train_and_test(df=dataset, cutoff_date=split_cutoff)

In [9]:
# Check row count, dtypes and non-null counts of the training partition.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49444 entries, 0 to 49443
Data columns (total 97 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   change_in_cash_and_cash_equivalents                                               49444 non-null  float64
 1   change_in_exchange_rate                                                           49444 non-null  float64
 2   accumulated_depreciation_amortization_ppe_value_change                            49444 non-null  float64
 3   capital_expenditures_value_change                                                 49444 non-null  float64
 4   capital_lease_obligations_value_change                                            49444 non-null  float64
 5   cash_and_cash_equivalents_at_carrying_value_value_change                          49444 non-null  float64
 6 

In [10]:
# Check row count, dtypes and non-null counts of the test partition.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3189 entries, 0 to 3188
Data columns (total 97 columns):
 #   Column                                                                            Non-Null Count  Dtype  
---  ------                                                                            --------------  -----  
 0   change_in_cash_and_cash_equivalents                                               3189 non-null   float64
 1   change_in_exchange_rate                                                           3189 non-null   float64
 2   accumulated_depreciation_amortization_ppe_value_change                            3189 non-null   float64
 3   capital_expenditures_value_change                                                 3189 non-null   float64
 4   capital_lease_obligations_value_change                                            3189 non-null   float64
 5   cash_and_cash_equivalents_at_carrying_value_value_change                          3189 non-null   float64
 6   

In [4]:
# Features: every column except the target price ('sector' stays in X as an input).
X_train = train_set.drop(columns=['avg_next_three_months_price'])
X_test = test_set.drop(columns=['avg_next_three_months_price'])

# Targets: the price to predict, kept together with 'sector' so that errors can
# later be grouped by sector. Only the price column is passed to the models.
y_train = train_set.loc[:, ['avg_next_three_months_price', 'sector']]
y_test = test_set.loc[:, ['avg_next_three_months_price', 'sector']]

In [5]:
from sklearn.preprocessing import OneHotEncoder

# A single encoder instance is shared by both calls: fit=True for the training
# features, fit=False for the test features.
# NOTE(review): presumably fit=False reuses the categories learned from the
# training data — confirm in utils.transform_input.
one_hot_encoder = OneHotEncoder()

X_train_transformed = utils.transform_input(X=X_train, one_hot_encoder=one_hot_encoder, fit=True)
X_test_transformed = utils.transform_input(X=X_test, one_hot_encoder=one_hot_encoder, fit=False)

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest's bootstrap sampling is repeatable and a
# "Restart & Run All" yields the same model and metrics every time.
random_forest_regr = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit on the price column only; y_train also carries 'sector' for later grouping.
random_forest_regr.fit(X_train_transformed, y_train['avg_next_three_months_price'])

# Make predictions using the testing set. Reuse y_test's index so the predictions
# stay row-aligned with the actual prices and sector labels in pandas operations.
y_pred_rf = pd.Series(random_forest_regr.predict(X_test_transformed), index=y_test.index)

In [7]:
# Score the random forest on the test partition.
# The metric functions are already imported in the notebook's first cell.
actual_prices = y_test['avg_next_three_months_price']

# Mean absolute error, in the same units as the target price
print("Mean absolute error: %.2f" % mean_absolute_error(actual_prices, y_pred_rf))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(actual_prices, y_pred_rf))
# MAPE is reported by scikit-learn as a fraction (0.10 means 10%), not scaled to 0-100
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(actual_prices, y_pred_rf))

Mean absolute error: 6.37
Coefficient of determination: 0.97
Mean absolute pct error: 0.13


In [11]:
# Break the random-forest percentage error down by sector.
# NOTE(review): a percentage loss is presumably not symmetric in y_pred / y_actual,
# so predictions go in y_pred and observed prices in y_actual — confirm in utils.
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_pred_rf,
    y_actual=y_test['avg_next_three_months_price'],
    sector_series=y_test['sector']
)

{'MANUFACTURING': 0.12164958654503585,
 'LIFE SCIENCES': 0.19791071353856526,
 'TRADE & SERVICES': 0.1475747442936356,
 'TECHNOLOGY': 0.11318599876649903,
 'FINANCE': 0.08687217920728653,
 'ENERGY & TRANSPORTATION': 0.1150310468238148,
 'REAL ESTATE & CONSTRUCTION': 0.10173205224725565}

In [10]:
# Rank the fitted forest's features by importance, pairing each importance score
# with the corresponding feature name recorded by the estimator at fit time.
utils.get_feature_importance_sorted(
    feature_importance_scores=random_forest_regr.feature_importances_,
    feature_names=random_forest_regr.feature_names_in_
)

[('avg_three_months_price', 0.9715616744357627),
 ('common_stock_shares_outstanding_value_change', 0.0036088518806861023),
 ('other_current_liabilities_value_change', 0.0015974911440723842),
 ('short_term_investments_value_change', 0.0013337330145527773),
 ('avg_oil_price', 0.0013276948839551416),
 ('investments_value_change', 0.0009601592620496372),
 ('avg_natural_gas_price', 0.0005785545865728654),
 ('capital_lease_obligations_value_change', 0.0005107163355619238),
 ('avg_treasury_yield', 0.00048681878119731444),
 ('avg_global_commodities_index_value', 0.00045724397803616624),
 ('total_current_assets_value_change', 0.00042925508151854243),
 ('change_in_operating_assets_value_change', 0.00042866856951121347),
 ('sector_REAL ESTATE & CONSTRUCTION', 0.0004206700119974058),
 ('total_revenue_value_change', 0.0004156352683536201),
 ('retained_earnings_value_change', 0.0004156238362974992),
 ('depreciation_depletion_and_amortization_value_change',
  0.0003773763680451207),
 ('capital_expend

# XGBoost

In [11]:
import xgboost as xgb

# Gradient-boosted trees trained with squared-error loss. The seed is pinned so
# re-running the notebook produces the same model.
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Fit on the price column only; y_train also carries 'sector' for later grouping.
xgb_regressor.fit(X_train_transformed, y_train['avg_next_three_months_price'])

# Make predictions using the testing set. Reuse y_test's index so the predictions
# stay row-aligned with the actual prices and sector labels in pandas operations.
y_pred_xgb = pd.Series(xgb_regressor.predict(X_test_transformed), index=y_test.index)


In [12]:
# Score the XGBoost model on the test partition.
# The metric functions are already imported in the notebook's first cell.
actual_prices = y_test['avg_next_three_months_price']

# Mean absolute error, in the same units as the target price
print("Mean absolute error: %.2f" % mean_absolute_error(actual_prices, y_pred_xgb))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(actual_prices, y_pred_xgb))
# MAPE is reported by scikit-learn as a fraction (0.10 means 10%), not scaled to 0-100
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(actual_prices, y_pred_xgb))

Mean absolute error: 7.49
Coefficient of determination: 0.95
Mean absolute pct error: 0.16
