In [1]:
import pandas as pd

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [2]:
# Load the modelling dataset through the project helper.
# NOTE(review): `only_value_change_columns=True` presumably restricts the fundamentals to
# their `*_value_change` variants — confirm against `utils.get_dataset`.
dataset = utils.get_dataset(only_value_change_columns=True)

In [3]:
import datetime as dt

# Split the dataset into train and test portions around a fixed cutoff of 1 June 2023.
# NOTE(review): which side of the cutoff each row lands on is decided inside
# `utils.split_data_to_train_and_test` — presumably train = before, test = after; confirm.
train_set, test_set = utils.split_data_to_train_and_test(
    cutoff_date=dt.datetime(year=2023, month=6, day=1),
    df=dataset,
)

In [4]:
# Targets keep `sector` next to the price column; features are every column except
# the target (so `sector` is still present in X as well).
y_train = train_set.loc[:, ['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(columns=['avg_next_three_months_price'])

# Same separation for the held-out rows.
y_test = test_set.loc[:, ['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(columns=['avg_next_three_months_price'])

In [5]:
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)

# One encoder instance is shared by both calls below.
# Feature scaling with MinMaxScaler is currently disabled: no scaler is passed to
# `utils.transform_input`.
one_hot_encoder = OneHotEncoder()

# `fit=True` for the training rows, `fit=False` for the test rows.
# NOTE(review): presumably the encoder is fitted on the first call and only applied on the
# second — confirm in `utils.transform_input`.
X_train_transformed = utils.transform_input(X=X_train, one_hot_encoder=one_hot_encoder, fit=True)
X_test_transformed = utils.transform_input(X=X_test, one_hot_encoder=one_hot_encoder, fit=False)

In [16]:
# target_scaler = MinMaxScaler()

# y_train_scaled = utils.tranform_target(
#     y=y_train['avg_next_three_months_price'],
#     scaler=target_scaler,
#     fit=True
# )

# y_test_scaled = utils.tranform_target(
#     y=y_test['avg_next_three_months_price'],
#     scaler=target_scaler,
#     fit=False
# )

In [6]:
# Display the transformed training features (pandas truncates the repr automatically).
X_train_transformed

Unnamed: 0,change_in_cash_and_cash_equivalents,change_in_exchange_rate,accumulated_depreciation_amortization_ppe_value_change,capital_expenditures_value_change,capital_lease_obligations_value_change,cash_and_cash_equivalents_at_carrying_value_value_change,cash_and_short_term_investments_value_change,cashflow_from_financing_value_change,cashflow_from_investment_value_change,change_in_inventory_value_change,...,avg_global_commodities_index_value,inflation,avg_three_months_price,sector_ENERGY & TRANSPORTATION,sector_FINANCE,sector_LIFE SCIENCES,sector_MANUFACTURING,sector_REAL ESTATE & CONSTRUCTION,sector_TECHNOLOGY,sector_TRADE & SERVICES
0,1093579.0,257799.0,1830683.0,23879.0,0.0,1093579.0,1093579.0,0.0,-23879.0,-1063153.0,...,181.335364,3.156842,1.659462,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-1395216.0,41252.0,-1830683.0,-51561.0,0.0,-1395216.0,-1395216.0,159887.0,51561.0,-470311.0,...,171.818546,2.069337,2.210792,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1191328.0,-247599.0,0.0,2394191.0,0.0,-1191328.0,-1191328.0,472935.0,-2394191.0,2016519.0,...,170.295168,2.069337,2.165769,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2057539.0,211511.0,2192891.0,-2399949.0,0.0,2057539.0,2057539.0,-632822.0,2087818.0,-1637846.0,...,170.093088,2.069337,1.754654,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,339849.0,34602.0,137223.0,0.0,0.0,339849.0,339849.0,0.0,303462.0,1793347.0,...,173.578270,2.069337,1.877508,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49439,0.0,0.0,0.0,-74600000.0,-16700000.0,59000000.0,59000000.0,-140700000.0,60900000.0,-39500000.0,...,157.781959,4.642850,224.471538,0.0,0.0,0.0,1.0,0.0,0.0,0.0
49440,15839000.0,0.0,0.0,-172000.0,-237000.0,15839000.0,7021000.0,20362000.0,2570000.0,1998000.0,...,157.781959,4.642850,42.158462,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49441,40881000.0,0.0,0.0,8549000.0,5098000.0,41301000.0,41430000.0,-12281000.0,-7552000.0,10524000.0,...,157.781959,4.642850,405.950000,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49442,-43727000.0,0.0,0.0,14469000.0,827000.0,-38259000.0,20048000.0,7788000.0,-27151000.0,54438000.0,...,157.781959,4.642850,71.776154,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Linear Regression

In [12]:
from sklearn import linear_model
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error

# Ordinary least-squares baseline: fit on the transformed training features against the
# average price over the next three months.
regr = linear_model.LinearRegression()
regr.fit(X_train_transformed, y_train['avg_next_three_months_price'])

actual_prices = y_test['avg_next_three_months_price']

# Predict on the held-out rows. The predictions are given the same index as the actual
# prices so that any later pandas operation between the two (differences, joins, per-sector
# grouping) lines up row by row instead of silently aligning on a fresh 0..n-1 index.
y_pred = pd.Series(regr.predict(X_test_transformed), index=actual_prices.index)

# Mean absolute error, in the same units as the target price.
print("Mean absolute error: %.2f" % mean_absolute_error(actual_prices, y_pred))
# Coefficient of determination: 1 is perfect prediction.
print("Coefficient of determination: %.2f" % r2_score(actual_prices, y_pred))
# scikit-learn reports MAPE as a fraction, not a percentage (e.g. 0.16 means 16%).
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(actual_prices, y_pred))

Mean absolute error: 6.51
Coefficient of determination: 0.99
Mean absolute pct error: 0.16


# Neural Net

In [20]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [21]:
import datetime as dt

# Rebuild the dataset and the train/test frames from scratch for the neural-net section,
# using the same cutoff (1 June 2023) and the same target column as the section above.
dataset = utils.get_dataset(only_value_change_columns=True)

train_set, test_set = utils.split_data_to_train_and_test(
    cutoff_date=dt.datetime(year=2023, month=6, day=1),
    df=dataset,
)

# Targets keep `sector` next to the price; features are every column except the target.
y_train = train_set.loc[:, ['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(columns=['avg_next_three_months_price'])

y_test = test_set.loc[:, ['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(columns=['avg_next_three_months_price'])

In [22]:
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)

# Fresh encoder for this section, shared by the train and test transforms.
# No MinMaxScaler is passed here: numeric scaling is left to the Keras Normalization layer.
one_hot_encoder = OneHotEncoder()

# `fit=True` for the training rows, `fit=False` for the test rows.
# NOTE(review): presumably the encoder is fitted on the first call and only applied on the
# second — confirm in `utils.transform_input`.
X_train_transformed = utils.transform_input(X=X_train, one_hot_encoder=one_hot_encoder, fit=True)
X_test_transformed = utils.transform_input(X=X_test, one_hot_encoder=one_hot_encoder, fit=False)

In [23]:
import numpy as np

# Per-feature normalization layer (statistics kept along the last axis); its mean and
# variance are computed from the training features only.
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(np.asarray(X_train_transformed))

In [24]:
# Sanity check: show the first training row before and after the normalization layer.
first = np.array(X_train_transformed.iloc[:1])

with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print()
    print('Normalized:', normalizer(first).numpy())

First example: [[  1093579.      257799.     1830683.       23879.           0.
    1093579.     1093579.           0.      -23879.    -1063153.
  -12132616.    -9657838.   -10579782.         137.       50000.
   -1584299.      326225.      326225.       -5825.           0.
          0.    -2872687.           0.           0.           0.
       3334.           0.           0.           0.    -1879443.
          0.           0.     -730518.    -1880742.     -281861.
          0.           0.        1299.        1299.           0.
    -904453.           0.           0.           0.           0.
          0.    -1598881.    -1593580.       -1299.       16791.
    1894634.     1931084.    -1928159.           0.           0.
          0.           0.           0.     1591619.           0.
          0.           0.           0.           0.           0.
          0.           0.           0.           0.    -1593580.
     -17733.           0.     -219626.     1197064.           0.
          

In [17]:
# Display the transformed training features.
# NOTE(review): the output saved with this cell shows feature values inside [0, 1], while the
# transform cells in this notebook pass no scaler — the output was presumably captured from an
# earlier run with scaling enabled. Re-run the notebook top to bottom to refresh it.
X_train_transformed

Unnamed: 0,change_in_cash_and_cash_equivalents,change_in_exchange_rate,accumulated_depreciation_amortization_ppe_value_change,capital_expenditures_value_change,capital_lease_obligations_value_change,cash_and_cash_equivalents_at_carrying_value_value_change,cash_and_short_term_investments_value_change,cashflow_from_financing_value_change,cashflow_from_investment_value_change,change_in_inventory_value_change,...,avg_global_commodities_index_value,inflation,avg_three_months_price,sector_ENERGY & TRANSPORTATION,sector_FINANCE,sector_LIFE SCIENCES,sector_MANUFACTURING,sector_REAL ESTATE & CONSTRUCTION,sector_TECHNOLOGY,sector_TRADE & SERVICES
0,0.327301,0.447972,0.497745,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.631043,0.385356,0.000291,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.327292,0.447772,0.497741,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.566434,0.247421,0.000394,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.327293,0.447505,0.497743,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.556092,0.247421,0.000386,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.327305,0.447929,0.497746,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.554720,0.247421,0.000308,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.327299,0.447766,0.497743,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.578381,0.247421,0.000332,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49439,0.327297,0.447734,0.497743,0.5,0.175713,0.999884,0.999884,0.5,0.5,0.999996,...,0.471141,0.573836,0.042166,0.0,0.0,0.0,1.0,0.0,0.0,0.0
49440,0.327353,0.447734,0.497743,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.471141,0.573836,0.007902,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49441,0.327442,0.447734,0.497743,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.471141,0.573836,0.076273,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49442,0.327143,0.447734,0.497743,0.5,0.175715,0.999884,0.999884,0.5,0.5,0.999996,...,0.471141,0.573836,0.013468,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# Seed Python's `random`, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling repeat on "Restart & Run All". Without this every run
# trains a different model and the reported metrics cannot be reproduced.
# (Bit-exact results on GPU may additionally require deterministic ops.)
keras.utils.set_random_seed(42)

# Small fully connected regressor: the adapted normalization layer, two hidden ReLU layers
# of 64 units, and a single linear output for the predicted price.
model = keras.Sequential([
    normalizer,
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Optimise mean absolute error directly, matching the headline metric reported afterwards.
model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
)

# Train the model.
# `validation_split=0.2` holds out the LAST 20% of the training rows (Keras slices them off
# before shuffling), so the validation set depends on the row order of the training frame.
history = model.fit(
    X_train_transformed,
    y_train['avg_next_three_months_price'],
    validation_split=0.2,
    verbose=0,
    epochs=100
)




In [26]:
# Predict on the held-out features. The model ends in `Dense(1)`, so this should be a
# 2-D array with one column per row rather than a pandas Series.
y_pred = model.predict(X_test_transformed)



In [27]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error

actual_prices = y_test['avg_next_three_months_price']

# Mean absolute error, in the same units as the target price.
print("Mean absolute error: %.2f" % mean_absolute_error(actual_prices, y_pred))
# Coefficient of determination: 1 is perfect prediction; negative means worse than
# always predicting the mean.
print("Coefficient of determination: %.2f" % r2_score(actual_prices, y_pred))
# scikit-learn reports MAPE as a fraction, not a percentage (e.g. 1.15 means 115%).
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(actual_prices, y_pred))

Mean absolute error: 14.88
Coefficient of determination: -0.35
Mean absolute pct error: 1.15
