In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [2]:
dataset = utils.get_dataset(only_value_change_columns=True)

In [3]:
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

In [5]:
cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price']

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

In [13]:
categorical_features = ['sector']
numerical_features = ~X_train.columns.isin(categorical_features)

preprocessor = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (StandardScaler(), ['total_revenue_value_change', 'total_liabilities_value_change']),
    remainder='passthrough'
)


In [11]:
X_train[['sector', 'total_revenue_value_change', 'total_liabilities_value_change']]

Unnamed: 0,sector,total_revenue_value_change,total_liabilities_value_change
0,LIFE SCIENCES,-4.042930e+05,-9493442.0
1,LIFE SCIENCES,-1.968819e+06,-1271845.0
2,LIFE SCIENCES,3.667197e+06,527701.0
3,LIFE SCIENCES,-1.454008e+06,318380.0
4,LIFE SCIENCES,1.302600e+04,-103763.0
...,...,...,...
48764,MANUFACTURING,3.193900e+09,-77400000.0
48765,TECHNOLOGY,4.840000e+05,1136000.0
48766,TECHNOLOGY,1.510000e+07,-64844000.0
48767,TRADE & SERVICES,-4.186700e+07,1393000.0


In [14]:
preprocessor.fit_transform(X_train[['sector', 'total_revenue_value_change', 'total_liabilities_value_change']])[0]

array([ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.00743135, -0.00641379])

In [6]:
from sklearn.pipeline import make_pipeline


neigh_regressor = make_pipeline(
    OneHotEncoder(categories=['sector']),
    StandardScaler(),
    KNeighborsRegressor(
        n_neighbors=10,
        weights='distance'
    )
)


neigh_regressor.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_neigh_regressor = pd.Series(neigh_regressor.predict(X_test))

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).

In [7]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))

Mean absolute error: 53.22
Coefficient of determination: 0.11
Mean absolute pct error: 2.32


In [8]:
from sklearn.neighbors import RadiusNeighborsRegressor

radius_regressor = RadiusNeighborsRegressor(radius=1.0, weights='distance')

radius_regressor.fit(X_train_transformed, y_train['avg_next_three_months_price'])
y_pred_radius_regressor = pd.Series(neigh_regressor.predict(X_test_transformed))

In [9]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_radius_regressor))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_radius_regressor))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_radius_regressor))

Mean absolute error: 53.22
Coefficient of determination: 0.11
Mean absolute pct error: 2.32


In [42]:
from sklearn.svm import LinearSVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


sv_regr = make_pipeline(
    StandardScaler(),
    LinearSVR(
        dual="auto",
        random_state=0, 
        tol=1e-20,
        C=0.1,
        max_iter=20000,
        #loss="squared_epsilon_insensitive"
        epsilon=0.5
    )
)
sv_regr.fit(X_train_transformed, y_train['avg_next_three_months_price'])

y_pred_sv_regr =  sv_regr.predict(X_test_transformed)



In [43]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_sv_regr))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_sv_regr))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_sv_regr))

Mean absolute error: 6.54
Coefficient of determination: 0.99
Mean absolute pct error: 0.16


In [14]:
from sklearn.svm import SVR

svr = make_pipeline(
    StandardScaler(),
    SVR(kernel='linear')
)

svr.fit(X_train_transformed, y_train['avg_next_three_months_price'])

y_pred_svr =  svr.predict(X_test_transformed)

In [15]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_svr))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_svr))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_svr))

Mean absolute error: 6.25
Coefficient of determination: 0.99
Mean absolute pct error: 0.17


Mean absolute error: 6.54
Coefficient of determination: 0.99
Mean absolute pct error: 0.16

In [16]:
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_test['avg_next_three_months_price'],
    y_actual=y_pred_svr,
    sector_series=y_test['sector']
)

{'MANUFACTURING': 15.246270784732788,
 'LIFE SCIENCES': 23.35996424172644,
 'TRADE & SERVICES': 16.32714793982479,
 'TECHNOLOGY': 16.270693934188213,
 'FINANCE': 13.240326329463228,
 'ENERGY & TRANSPORTATION': 14.727300861959604,
 'REAL ESTATE & CONSTRUCTION': 14.98303908659845}

In [7]:
from sklearn.linear_model import SGDRegressor

from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sgd_reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000, tol=1e-3)
)

sgd_reg.fit(X_train_transformed, y_train['avg_next_three_months_price'])
y_pred_sgd_reg =  sgd_reg.predict(X_test_transformed)

In [8]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_sgd_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_sgd_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_sgd_reg))

Mean absolute error: 13407242157.96
Coefficient of determination: -3142959443303778816.00
Mean absolute pct error: 1613277416.36


In [20]:
from sklearn.linear_model import Ridge

ridge_reg = make_pipeline(
    StandardScaler(),
    Ridge()
)

ridge_reg.fit(X_train_transformed, y_train['avg_next_three_months_price'])
y_pred_ridge_reg =  ridge_reg.predict(X_test_transformed)

In [21]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_ridge_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_ridge_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_ridge_reg))

Mean absolute error: 7.29
Coefficient of determination: 0.99
Mean absolute pct error: 0.24


In [6]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline


# lasso_reg = make_pipeline(
#     StandardScaler(),
#     Lasso()
# )

lasso_reg = Lasso()

lasso_reg.fit(X_train_transformed, y_train['avg_next_three_months_price'])
y_pred_lasso_reg =  lasso_reg.predict(X_test_transformed)

In [7]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))

Mean absolute error: 9.96
Coefficient of determination: 0.98
Mean absolute pct error: 0.61


In [12]:
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_pred_lasso_reg,
    y_actual=y_test['avg_next_three_months_price'],
    sector_series=y_test['sector']
)

{'MANUFACTURING': 0.11182686833966238,
 'LIFE SCIENCES': 0.18491570527086126,
 'TRADE & SERVICES': 0.12684532627844944,
 'TECHNOLOGY': 0.10898162102013316,
 'FINANCE': 0.08500590219486488,
 'ENERGY & TRANSPORTATION': 0.1094204096043076,
 'REAL ESTATE & CONSTRUCTION': 0.08584409993532244}