In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import warnings
warnings.filterwarnings('ignore')

In [2]:
pip install imblearn


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the data
file_path = Path('Data/Supervised/pct_cr_etf_5_years_complete_trail_df.csv')
df = pd.read_csv(file_path)

# Preview the data
df.set_index('ETF_Names')
df.head()

Unnamed: 0,ETF_Names,30-Jun-19,31-Jul-19,31-Aug-19,30-Sep-19,31-Oct-19,30-Nov-19,31-Dec-19,31-Jan-20,29-Feb-20,...,31-Mar-22,30-Apr-22,31-May-22,30-Jun-22,STD_2019_2020,STD_2020_2021,STD_2021_2022,Class,PCT_Cumulative_Returns,Target_met
0,iShares Asia 50 ETF,6.81,-0.84,-2.85,3.24,2.12,2.52,4.41,-0.92,3.41,...,-8.37,-0.8,0.93,0.42,3.518415,3.015198,4.323178,0,-59300000000000.0,False
1,iShares China Large-Cap ETF,5.44,-2.18,-2.63,1.63,1.22,1.34,4.14,-4.36,6.23,...,-11.48,2.27,2.01,11.45,3.488395,3.665791,7.037962,0,-3240000000000000.0,False
2,iShares Core Cash ETF,0.13,0.13,0.08,0.09,0.08,0.08,0.08,0.09,0.07,...,0.0,-0.01,0.03,0.05,0.036829,0.005189,0.016132,2,0.0,False
3,iShares Core Composite Bond ETF,1.01,0.96,1.49,-0.49,-0.51,0.8,-1.65,2.31,0.84,...,-3.78,-1.52,-0.9,-1.49,1.014293,1.168062,1.755788,2,3.74e-05,False
4,iShares Core Global Corporate Bond (AUD Hedged...,2.31,0.88,2.01,-0.51,0.2,0.16,0.24,1.98,0.04,...,-1.5,-4.65,-0.18,-2.98,2.547621,1.253187,1.72499,5,6.29e-05,False


In [4]:
# Create our features
X = pd.get_dummies(df.drop(["ETF_Names", 
                            "STD_2019_2020", 
                            "STD_2020_2021", 
                            "STD_2021_2022", 
                            "Class", 
                            "PCT_Cumulative_Returns"],
                             axis=1))

# Create our target
y = df[["PCT_Cumulative_Returns"]]

In [5]:
X.describe()

Unnamed: 0,30-Jun-19,31-Jul-19,31-Aug-19,30-Sep-19,31-Oct-19,30-Nov-19,31-Dec-19,31-Jan-20,29-Feb-20,31-Mar-20,...,30-Sep-21,31-Oct-21,30-Nov-21,31-Dec-21,31-Jan-22,28-Feb-22,31-Mar-22,30-Apr-22,31-May-22,30-Jun-22
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,3.879687,1.266563,-0.54125,1.704063,0.315,2.554375,0.01125,2.041875,-3.82625,-9.714062,...,-2.64875,-0.039375,1.485,1.75875,-2.494687,-2.814062,-0.405938,-1.902188,-0.97,-4.211563
std,2.072361,1.974436,2.130842,1.628405,1.262356,1.791962,2.200167,2.542379,4.046414,7.32131,...,1.902233,2.964479,2.136514,2.25297,3.406237,3.339546,4.613504,3.168,1.972797,4.554291
min,-0.21,-5.79,-3.83,-0.96,-2.77,-0.68,-3.11,-4.36,-9.8,-22.47,...,-5.86,-6.36,-2.35,-5.64,-9.01,-10.68,-11.48,-9.45,-7.06,-13.23
25%,2.5875,0.39,-2.3675,0.22,-0.525,1.27,-1.67,0.0975,-6.4675,-14.9525,...,-3.86,-1.2175,-0.3075,0.06,-4.3875,-4.9,-3.555,-3.1475,-1.8225,-7.2375
50%,4.14,1.445,-0.4,1.785,0.115,2.545,-0.685,2.3,-4.84,-9.74,...,-2.71,-0.195,1.39,2.035,-2.625,-2.765,-0.27,-1.485,-0.2,-4.465
75%,5.4725,2.3275,0.7225,2.9725,1.225,3.4175,1.64,3.8125,-0.0825,-4.4275,...,-1.8575,0.465,2.885,3.2075,-0.9125,-0.9075,3.31,-0.0175,0.13,-1.45
max,6.81,4.41,4.37,5.67,2.75,6.41,4.41,5.87,6.23,1.83,...,3.91,6.91,5.31,5.8,7.02,2.65,8.01,6.33,2.01,11.45


In [6]:
# Check the balance of our target values
y['PCT_Cumulative_Returns'].value_counts()

 0.000000e+00    2
-5.930000e+13    1
 1.729352e-01    1
-1.670000e+13    1
 9.020000e+14    1
-8.940000e+13    1
 7.120000e+17    1
 9.270000e+14    1
 4.080000e+13    1
-8.230000e+19    1
 2.490000e+18    1
-7.120000e+14    1
-4.443492e+09    1
 3.597110e+09    1
-4.519803e+08    1
-4.966301e+03    1
-4.510000e+11    1
-3.240000e+15    1
 1.720000e+13    1
 1.405238e+10    1
 5.290000e+17    1
-3.464153e+10    1
-5.670000e+12    1
-4.410000e+14    1
 3.390000e+15    1
-3.790000e+15    1
 2.590000e+13    1
 2.230000e+20    1
 6.290000e-05    1
 3.740000e-05    1
 2.316679e-03    1
Name: PCT_Cumulative_Returns, dtype: int64

In [7]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [10]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [11]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [14]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

ValueError: Unknown label type: 'continuous'

In [None]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

In [None]:
# Train the Classifier
eec = EasyEnsembleClassifier(n_estimators=100,random_state=1)
eec.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))