In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [2]:
dataset = utils.get_dataset(only_value_change_columns=True)

In [4]:
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

In [5]:
cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price']

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

In [6]:
categorical_features = ['sector']
numerical_features = ~X_train.columns.isin(categorical_features)

preprocessor = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (StandardScaler(), ['total_revenue_value_change', 'total_liabilities_value_change']),
    remainder='passthrough'
)


In [7]:
neigh_regressor = make_pipeline(
    preprocessor,
    KNeighborsRegressor(
        n_neighbors=10,
        weights='distance'
    )
)

neigh_regressor.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_neigh_regressor = pd.Series(neigh_regressor.predict(X_test))

In [8]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_neigh_regressor))

Mean absolute error: 53.28
Coefficient of determination: 0.10
Mean absolute pct error: 2.32


In [9]:
from sklearn.neighbors import RadiusNeighborsRegressor

radius_regressor = make_pipeline(
    preprocessor,
    RadiusNeighborsRegressor(radius=1.0, weights='distance')
)

radius_regressor.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_radius_regressor = pd.Series(neigh_regressor.predict(X_test))

In [10]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_radius_regressor))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_radius_regressor))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_radius_regressor))

Mean absolute error: 53.28
Coefficient of determination: 0.10
Mean absolute pct error: 2.32


In [11]:
from sklearn.svm import LinearSVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


sv_regr = make_pipeline(
    preprocessor,
    LinearSVR(
        dual="auto",
        random_state=0, 
        tol=1e-20,
        C=0.1,
        max_iter=20000,
        #loss="squared_epsilon_insensitive"
        epsilon=0.5
    )
)
sv_regr.fit(X_train, y_train['avg_next_three_months_price'])

y_pred_sv_regr =  sv_regr.predict(X_test)



In [12]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_sv_regr))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_sv_regr))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_sv_regr))

Mean absolute error: 58029067.61
Coefficient of determination: -339955457742053.81
Mean absolute pct error: 12706191.84


In [13]:
from sklearn.svm import SVR

svr = make_pipeline(
    preprocessor,
    SVR(kernel='linear')
)

svr.fit(X_train, y_train['avg_next_three_months_price'])

y_pred_svr =  svr.predict(X_test)

In [15]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_svr))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_svr))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_svr))

Mean absolute error: 6.25
Coefficient of determination: 0.99
Mean absolute pct error: 0.17


In [16]:
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_test['avg_next_three_months_price'],
    y_actual=y_pred_svr,
    sector_series=y_test['sector']
)

{'MANUFACTURING': 15.246270784732788,
 'LIFE SCIENCES': 23.35996424172644,
 'TRADE & SERVICES': 16.32714793982479,
 'TECHNOLOGY': 16.270693934188213,
 'FINANCE': 13.240326329463228,
 'ENERGY & TRANSPORTATION': 14.727300861959604,
 'REAL ESTATE & CONSTRUCTION': 14.98303908659845}

In [7]:
from sklearn.linear_model import SGDRegressor

sgd_reg = make_pipeline(
    preprocessor,
    SGDRegressor()
)

sgd_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_sgd_reg =  sgd_reg.predict(X_test)

In [8]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_sgd_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_sgd_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_sgd_reg))

Mean absolute error: 24081442819020007879184411412477771776.00
Coefficient of determination: -25533806818964250673175774928207193793647420877945504572789582984915189760.00
Mean absolute pct error: 4064884997415303453223940650396811264.00


In [9]:
from sklearn.linear_model import Ridge

ridge_reg = make_pipeline(
    preprocessor,
    Ridge()
)

ridge_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_ridge_reg =  ridge_reg.predict(X_test)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [10]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_ridge_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_ridge_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_ridge_reg))

Mean absolute error: 7.28
Coefficient of determination: 0.99
Mean absolute pct error: 0.24


In [19]:
from sklearn.linear_model import Lasso


lasso_reg = make_pipeline(
    preprocessor,
    Lasso(max_iter=3000, alpha=1)
)

lasso_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_lasso_reg =  lasso_reg.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


In [20]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_lasso_reg))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg))

Mean absolute error: 6.37
Coefficient of determination: 0.99
Mean absolute pct error: 0.15


In [12]:
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_pred_lasso_reg,
    y_actual=y_test['avg_next_three_months_price'],
    sector_series=y_test['sector']
)

{'MANUFACTURING': 0.11182686833966238,
 'LIFE SCIENCES': 0.18491570527086126,
 'TRADE & SERVICES': 0.12684532627844944,
 'TECHNOLOGY': 0.10898162102013316,
 'FINANCE': 0.08500590219486488,
 'ENERGY & TRANSPORTATION': 0.1094204096043076,
 'REAL ESTATE & CONSTRUCTION': 0.08584409993532244}

In [24]:
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(['avg_next_three_months_price'], axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(['avg_next_three_months_price'], axis=1)

In [25]:
one_hot_encoder = OneHotEncoder()

X_train_transformed = utils.transform_input(
    X=X_train,
    one_hot_encoder=one_hot_encoder,
    fit=True
)

X_test_transformed = utils.transform_input(
    X=X_test,
    one_hot_encoder=one_hot_encoder,
    fit=False
)

In [26]:
lasso_reg_v2 = make_pipeline(
    StandardScaler(),
    Lasso(max_iter=3000, alpha=1)
)

lasso_reg_v2.fit(X_train_transformed, y_train['avg_next_three_months_price'])
y_pred_lasso_reg_v2 =  lasso_reg_v2.predict(X_test_transformed)

In [27]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v2))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v2))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v2))

Mean absolute error: 6.06
Coefficient of determination: 0.99
Mean absolute pct error: 0.12


# THE ONE BELOW IS THE EXAMPLE TO WATCH

In [2]:
dataset = utils.get_dataset(only_value_change_columns=True)

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price']

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)


column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

lasso_reg_v3 = make_pipeline(
    column_transformer,
    Lasso(max_iter=3000, alpha=1)
)

lasso_reg_v3.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_lasso_reg_v3 =  lasso_reg_v3.predict(X_test)

In [3]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v3))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v3))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_lasso_reg_v3))

Mean absolute error: 6.06
Coefficient of determination: 0.99
Mean absolute pct error: 0.12


## Stohastic Gradient Descent

In [2]:
from sklearn.linear_model import Ridge


dataset = utils.get_dataset(only_value_change_columns=True)

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'reported_currency', 'avg_next_three_months_price']

y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)


column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

ridge_reg = make_pipeline(
    column_transformer,
    Ridge()
)

ridge_reg.fit(X_train, y_train['avg_next_three_months_price'])
y_pred_ridge =  ridge_reg.predict(X_test)

In [3]:
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['avg_next_three_months_price'], y_pred_ridge))
print("Coefficient of determination: %.2f" % r2_score(y_test['avg_next_three_months_price'], y_pred_ridge))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['avg_next_three_months_price'], y_pred_ridge))

Mean absolute error: 7.29
Coefficient of determination: 0.99
Mean absolute pct error: 0.24
