In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from analytics.machine_learning.price_prediction_with_fundamentals import utils

# Random Forest

In [2]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed for the forest's internal randomness (bootstrap / feature
# sampling) so the metrics printed at the end of this cell are reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

dataset = utils.get_dataset()

# Split the dataset around the cutoff date.
# NOTE(review): presumably rows before the cutoff become the training set and
# the rest the test set -- confirm against utils.split_data_to_train_and_test.
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

# Columns removed from the feature matrices: two identifier/date columns plus
# the regression target itself.
cols_to_drop = ['symbol', 'fiscal_date_ending', 'avg_next_three_months_price']

# The target frames also carry 'sector'; only the price column is used for
# fitting and scoring in this cell.
y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and standardise every other feature column.
# The boolean mask selects all columns except 'sector', so together the two
# selectors cover every column and remainder='passthrough' has nothing left
# to pass through.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

# Preprocessing and model are chained so the transformers are fitted on the
# training data only and re-applied unchanged at predict time.
random_forrest_regressor = make_pipeline(
    column_transformer,
    RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
)

random_forrest_regressor.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_random_forrest = random_forrest_regressor.predict(X_test)

# Report hold-out metrics on the test set.
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_random_forrest))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_random_forrest))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_random_forrest))


Mean absolute error: 6.26
Coefficient of determination: 0.97
Mean absolute pct error: 0.13


# Lasso Regression

In [2]:
from sklearn.linear_model import Lasso

# Reload the data and redo the date-based train/test split so this cell does
# not rely on variables left over from an earlier cell.
dataset = utils.get_dataset()
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset, cutoff_date=dt.datetime(2023,6,1)
)

# Identifier/date columns and the target are excluded from the features.
cols_to_drop = ['symbol', 'fiscal_date_ending', 'avg_next_three_months_price']
_target_and_sector = ['avg_next_three_months_price', 'sector']

X_train = train_set.drop(columns=cols_to_drop)
y_train = train_set[_target_and_sector]

X_test = test_set.drop(columns=cols_to_drop)
y_test = test_set[_target_and_sector]

# Boolean mask over the feature columns: True for everything except 'sector'.
_non_sector_mask = ~X_train.columns.isin(['sector'])

# 'sector' is one-hot encoded; all remaining columns are standardised.
column_transformer = make_column_transformer(
    (OneHotEncoder(), ['sector']),
    (StandardScaler(), _non_sector_mask),
    remainder='passthrough',
)

# Preprocessing followed by an L1-regularised linear model.
lasso_reg = make_pipeline(column_transformer, Lasso(max_iter=3000, alpha=1))

lasso_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_lasso_reg = lasso_reg.predict(X_test)

# Print the same three hold-out metrics, in the same order and format.
for _template, _metric in (
    ("Mean absolute error: %.2f", mean_absolute_error),
    ("Coefficient of determination: %.2f", r2_score),
    ("Mean absolute pct error: %.2f", mean_absolute_percentage_error),
):
    print(_template % _metric(y_test['avg_next_three_months_price'], y_pred_lasso_reg))

Mean absolute error: 6.01
Coefficient of determination: 0.99
Mean absolute pct error: 0.12
