In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error

from analytics.machine_learning.price_prediction_with_fundamentals import utils

In [4]:
dataset = utils.get_dataset(
    only_value_columns=True
)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53963 entries, 0 to 53962
Data columns (total 97 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   symbol                                                               53963 non-null  object 
 1   fiscal_date_ending                                                   53963 non-null  object 
 2   reported_currency                                                    53963 non-null  object 
 3   gross_profit                                                         53963 non-null  float64
 4   total_revenue                                                        53963 non-null  float64
 5   cost_of_revenue                                                      53963 non-null  float64
 6   cost_of_goods_and_services_sold                                      53963 non-null  float64
 7   oper

In [12]:
import datetime as dt

train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

y_train = train_set[['price', 'sector']]
X_train = train_set.drop(['price'], axis=1)

y_test = test_set[['price', 'sector']]
X_test = test_set.drop(['price'], axis=1)

In [13]:
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)

one_hot_encoder = OneHotEncoder()
input_scaler = MinMaxScaler()

X_train_transformed = utils.transform_input(
    X=X_train,
    one_hot_encoder=one_hot_encoder,
    #min_max_scaler=input_scaler,
    fit=True
)

X_test_transformed = utils.transform_input(
    X=X_test,
    one_hot_encoder=one_hot_encoder,
    #min_max_scaler=input_scaler,
    fit=False
)

In [8]:
target_scaler = MinMaxScaler()

y_train_transformed = utils.tranform_target(
    y=y_train['price'],
    min_max_scaler=target_scaler,
    fit=True
)

y_test_transformed = utils.tranform_target(
    y=y_test['price'],
    min_max_scaler=target_scaler,
    fit=False
)

In [14]:
import xgboost as xgb

xgb_regressor = xgb.XGBRegressor(
        objective = 'reg:squarederror',
        n_estimators = 100,
        learning_rate = 0.1,
)

xgb_regressor.fit(X_train_transformed, y_train['price'])

# Make predictions using the testing set
y_pred_xgb = pd.Series(xgb_regressor.predict(X_test_transformed))


In [12]:
y_pred_xgb

0       386.547821
1        50.519630
2        78.581024
3        11.338340
4       106.788521
           ...    
3153     90.825851
3154    235.768997
3155     68.810089
3156     83.901665
3157     12.302379
Length: 3158, dtype: float32

In [13]:
y_test['price']

0       531.500000
1        17.816667
2        72.690000
3         1.300000
4        46.326667
           ...    
3153     69.460000
3154     98.775000
3155     61.515000
3156     72.025000
3157     10.165000
Name: price, Length: 3158, dtype: float64

In [15]:
# The mean squared error
print("Mean absolute error: %.2f" % mean_absolute_error(y_test['price'], y_pred_xgb))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test['price'], y_pred_xgb))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test['price'], y_pred_xgb))

Mean absolute error: 25.09
Coefficient of determination: 0.87
Mean absolute pct error: 0.86


In [16]:
utils.calculate_avg_pct_loss_per_sector(
    y_pred=y_pred_xgb,
    y_actual=y_test['price'],
    sector_series=y_test['sector']
)

{'TECHNOLOGY': 49.92131763072077,
 'MANUFACTURING': 37.31342579678325,
 'LIFE SCIENCES': 52.52818797466729,
 'TRADE & SERVICES': 42.536302840446254,
 'FINANCE': 34.47573652536061,
 'ENERGY & TRANSPORTATION': 54.92193279698565,
 'REAL ESTATE & CONSTRUCTION': -104.04878916386997}

In [17]:
negative_count = 0
for i, pred_value in enumerate(y_pred_xgb):
    if pred_value < 0:
        negative_count += 1

negative_count

62

## Try KNN Regression

In [21]:
dataset = utils.get_dataset()

train_set_knn, test_set_knn = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023,6,1)
)

y_train_knn = train_set[['price', 'sector']]
X_train_knn = train_set.drop(['price'], axis=1)

y_test_knn = test_set[['price', 'sector']]
X_test_knn = test_set.drop(['price'], axis=1)

one_hot_encoder_knn = OneHotEncoder()

X_train_transformed_knn = utils.transform_input(
    X=X_train_knn,
    one_hot_encoder=one_hot_encoder_knn,
    fit=True
)

X_test_transformed_knn = utils.transform_input(
    X=X_test_knn,
    one_hot_encoder=one_hot_encoder_knn,
    fit=False
)

In [22]:
from sklearn.neighbors import KNeighborsRegressor

knn_regr = KNeighborsRegressor(n_neighbors=10)

knn_regr.fit(X_train_transformed_knn, y_train_knn['price'])

y_pred_knn = knn_regr.predict(X_test_transformed_knn)

In [23]:
# The mean squared error
print("Mean absolute error: %.2f" % mean_absolute_error(y_test_knn['price'], y_pred_knn))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test_knn['price'], y_pred_knn))
print("Mean absolute pct error: %.2f" % mean_absolute_percentage_error(y_test_knn['price'], y_pred_knn))

Mean absolute error: 26.89
Coefficient of determination: 0.77
Mean absolute pct error: 1.16
