In [38]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from analytics.machine_learning.price_prediction_with_fundamentals import utils

# Random Forrest

In [33]:
from sklearn.ensemble import RandomForestRegressor

dataset = utils.get_dataset()

value_change_cols = ['change_in_cash_and_cash_equivalents', 'change_in_exchange_rate']
for col_name in dataset.columns:
    if '_value_change' in col_name:
        value_change_cols.append(col_name)

economic_indicators_avg_value_cols = [
    'avg_interest_rate',
    'avg_treasury_yield',
    'avg_natural_gas_price',
    'avg_oil_price',
    'avg_unemployment_rate',
    'avg_global_commodities_index_value',
    'inflation'
]
columnss_to_keep = value_change_cols + economic_indicators_avg_value_cols  + ['symbol', 'reported_currency', 'fiscal_date_ending', 'sector', 'industry', 'avg_three_months_price', 'avg_next_three_months_price']
dataset = dataset[columnss_to_keep]
# Keep only the top 20 industry categories and replace the rest with Other
dataset['industry'].fillna('Other', inplace=True)
top_categories = dataset['industry'].value_counts().nlargest(30).index
dataset.loc[~dataset['industry'].isin(top_categories), 'industry'] = 'Other'

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

world_indices_cols = ['s_p_500_index_value_change', 'dow_jones_index_value_change', 'nasdaq_index_value_change', 'nyse_index_value_change']
economic_indicators_cols = [
    'interest_rate_value_change',
    'treasury_yield_value_change',
    'natural_gas_price_value_change',
    'oil_price_value_change',
    'unemployment_rate_value_change',
    'global_commodities_index_value_change',
]
cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price'] + world_indices_cols + economic_indicators_cols


y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector', 'industry']
    ),
    remainder='passthrough'
)

random_forrest_regressor = make_pipeline(
    column_transformer,
    RandomForestRegressor(n_estimators=100)
)

random_forrest_regressor.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_random_forrest = random_forrest_regressor.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_random_forrest))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_random_forrest))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_random_forrest))


Mean absolute error: 6.54
Coefficient of determination: 0.97
Mean absolute pct error: 0.16


# Lasso Regression

In [41]:
from sklearn.linear_model import Lasso

dataset = utils.get_dataset(only_value_change_columns=True)

# value_change_cols = ['change_in_cash_and_cash_equivalents', 'change_in_exchange_rate']
# for col_name in dataset.columns:
#     if '_value_change' in col_name:
#         value_change_cols.append(col_name)

# economic_indicators_avg_value_cols = [
#     'avg_interest_rate',
#     'avg_treasury_yield',
#     'avg_natural_gas_price',
#     'avg_oil_price',
#     'avg_unemployment_rate',
#     'avg_global_commodities_index_value',
#     'inflation'
# ]
# columnss_to_keep = value_change_cols + economic_indicators_avg_value_cols  + ['symbol', 'reported_currency', 'fiscal_date_ending', 'sector', 'industry', 'avg_three_months_price', 'avg_next_three_months_price']
# dataset = dataset[columnss_to_keep]
# Keep only the top 20 industry categories and replace the rest with Other
dataset['industry'].fillna('Other', inplace=True)
top_categories = dataset['industry'].value_counts().nlargest(30).index
dataset.loc[~dataset['industry'].isin(top_categories), 'industry'] = 'Other'

# train_set, test_set = utils.split_data_to_train_and_test(
#     df=dataset,
#     cutoff_date=dt.datetime(2023,6,1)
# )

world_indices_cols = ['s_p_500_index_value_change', 'dow_jones_index_value_change', 'nasdaq_index_value_change', 'nyse_index_value_change']
# economic_indicators_cols = [
#     'interest_rate_value_change',
#     'treasury_yield_value_change',
#     'natural_gas_price_value_change',
#     'oil_price_value_change',
#     'unemployment_rate_value_change',
#     'global_commodities_index_value_change',
# ]
# cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price'] + world_indices_cols + economic_indicators_cols

cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price'] + world_indices_cols

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)


column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector', 'industry']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector', 'industry'])
    ),
    remainder='passthrough'
)

lasso_reg = make_pipeline(
    column_transformer,
    Lasso(max_iter=3000, alpha=1)
)

lasso_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_lasso_reg =  lasso_reg.predict(X_test)

print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))

Mean absolute error: 5.91
Coefficient of determination: 0.99
Mean absolute pct error: 0.14
