In [1]:
import shap
import joblib

import datetime as dt
import sqlite3

from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.preprocessing import (
    OneHotEncoder,
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from analytics.machine_learning.price_prediction_with_fundamentals import utils
import pandas as pd

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [3]:
# Load the price-prediction dataset from the local SQLite database.
db_conn = sqlite3.connect('/Users/orestis/MyProjects/InvestorAPI/app/database/ibd.db')

# Keep only rows whose Date is at least three months before today and return
# them in chronological order.
# NOTE(review): the three-month lag presumably ensures the forward-looking
# three-month price change is already known for every row — confirm.
# Because the filter uses date('now', ...), the result depends on the run date.
query = '''
    SELECT * 
    FROM price_prediction_dataset_v2
    WHERE DATE(Date) <= date('now', '-3 months')
    ORDER BY DATE(Date)
'''

# pandas does not close a DB-API connection handed to read_sql, so release it
# explicitly once the frame is in memory (also on failure).
try:
    # Drop every row that has a missing value in any column.
    dataset = pd.read_sql(query, db_conn).dropna()
finally:
    db_conn.close()

# Build the binary classification target from the three-month forward return.
# With right=False the bins are left-closed: [-inf, 0) -> 0 and [0, inf) -> 1,
# so a change of exactly zero is coded as 1.
labels = ['down', 'up']
label_mapping = dict(enumerate(labels))  # {0: 'down', 1: 'up'}
bins = [-np.inf, 0, np.inf]

dataset['next_three_months_pct_change_range'] = pd.cut(
    dataset['price_pct_change_next_three_months'],
    bins=bins,
    right=False,
    labels=list(label_mapping),  # integer codes 0 and 1
)

# Class balance over the whole dataset.
print(
    "Target value counts",
    dataset['next_three_months_pct_change_range'].value_counts(),
    sep="\n",
)

# Split on the 'Date' column around 2023-08-01 using the project helper.
# NOTE(review): presumably rows before the cutoff form the training set and
# the remainder the test set — confirm against utils.
train_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date_column_name='Date',
    cutoff_date=dt.datetime(year=2023, month=8, day=1),
)

# Class balance within the test set only.
print(
    "Test set target value counts",
    test_set['next_three_months_pct_change_range'].value_counts(),
    sep="\n",
)

# Name of the binary label column.
target_col = 'next_three_months_pct_change_range'

# Columns removed before building the feature matrix: the symbol and date,
# the forward-looking price changes, and the label itself.
cols_to_drop = [
    'symbol',
    'Date',
    'price_pct_change_next_six_months',
    'price_pct_change_next_three_months',
    'price_pct_change_next_month',
    'next_three_months_pct_change_range',
]

# Test-set feature matrix and labels.
X_test = test_set.drop(columns=cols_to_drop)
y_test = test_set[target_col]

# Load the persisted three-month model from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
rf_three_months_model = joblib.load(
    '/Users/orestis/MyProjects/InvestorAPI/analytics/machine_learning/price_prediction_with_fundamentals/ml_models/rf_three_months_prediction_model.joblib'
)

# `steps` holds (name, estimator) pairs; take the estimator from each.
# NOTE(review): assumes step 0 is the column transformer and step 1 the
# random-forest classifier — confirm against the training code.
column_transformer = rf_three_months_model.steps[0][1]
rf_classifier = rf_three_months_model.steps[1][1]


Target value counts
1    80412
0    77928
Name: next_three_months_pct_change_range, dtype: int64
Test set target value counts
0    4329
1    2343
Name: next_three_months_pct_change_range, dtype: int64


In [4]:
# Explain the classifier's predictions on the test set with SHAP.
explainer = shap.TreeExplainer(rf_classifier)

# Feature names as emitted by the column transformer, used to label the plot.
features = column_transformer.get_feature_names_out()

# Transform the test features once and reuse the result: the classifier was
# extracted from the pipeline, so it must be given preprocessed inputs, and
# both the SHAP computation and the plot need the same matrix.
X_test_transformed = column_transformer.transform(X_test)

shap_values = explainer.shap_values(X_test_transformed)
shap.summary_plot(shap_values, X_test_transformed, feature_names=features)