# Train and Test different machine learning algorithms

In [1]:
# For parameters
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error
from collections import Counter

###

## Prepare data for machine learning

In [2]:
# Load the returns dataset and use the "Date" column as a parsed datetime index.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where `parse_dates` already infers a single consistent format.
returns = pd.read_csv(
    "Returns_ForML_Classification.csv",
    index_col="Date",
    parse_dates=True,
)

# Preview the first rows
returns.head()

Unnamed: 0_level_0,level_0,1_Day_returns,5_Day_returns,10_Day_returns,1_Day_binary,5_Day_binary,10_Day_binary
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-15,AMZN,-0.038482,-0.060736,-0.156401,0,0,0
2016-01-19,AMZN,0.007541,-0.070029,-0.098133,1,0,0
2016-01-20,AMZN,-0.004717,-0.074641,-0.097856,0,0,0
2016-01-21,AMZN,0.005684,-0.01167,-0.091093,1,0,0
2016-01-22,AMZN,0.037147,0.0057,-0.019015,1,1,0


In [3]:
# Target variable: the binary "1_Day_binary" column
y = returns["1_Day_binary"]

# Feature frame: drop the level_0 label column, the raw return columns and the target itself.
# NOTE(review): judging by the head() output of the loading cell, this leaves only
# "5_Day_binary" and "10_Day_binary" as features — confirm that this is the intended feature set.
non_feature_columns = [
    "level_0",
    "1_Day_returns",
    "5_Day_returns",
    "10_Day_returns",
    "1_Day_binary",
]
X = returns.drop(columns=non_feature_columns)

In [4]:
# Ordered hold-out split: with shuffle=False the rows keep their original order,
# so the first 70% become the training set and the remaining 30% the test set.
split_parts = train_test_split(X, y, test_size=0.3, shuffle=False)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Count how many 0s and 1s the training target contains
train_class_counts = Counter(y_train)
train_class_counts

Counter({0: 436, 1: 554})

In [6]:
# Resample the training set with RandomOverSampler (seeded with random_state=1)
# so that both classes end up equally represented.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [7]:
# Re-count the classes after resampling to verify the imbalance is gone
resampled_class_counts = Counter(y_resampled)
resampled_class_counts

Counter({0: 554, 1: 554})

###

## Run machine learning algorithms - Classification

### 1. Logistic Regression

In [8]:
# Create a LogisticRegression model and train it on the resampled training data.
# (LogisticRegression is already imported in the notebook's import cell, so it is
# not re-imported here.)
LR = LogisticRegression()
LR_model = LR.fit(X_resampled, y_resampled)

# Use the trained model to predict the held-out test set
LR_pred = LR_model.predict(X_test)

# Frame holding the actual labels next to the predicted ones
actual_v_pred = y_test.to_frame()
actual_v_pred["1_Day_binary_pred"] = LR_pred

# Out-of-sample mean squared error.
# For 0/1 labels this is the share of misclassified rows, i.e. 1 - accuracy.
out_of_sample_mse_LR = mean_squared_error(actual_v_pred["1_Day_binary"],
                                          actual_v_pred["1_Day_binary_pred"])
print(f"Out of sample MSE: {out_of_sample_mse_LR}")

# Out-of-sample root mean squared error
out_of_sample_rmse_LR = np.sqrt(out_of_sample_mse_LR)
print(f"Out of sample RMSE: {out_of_sample_rmse_LR}")

# Print out a classification report to evaluate performance
print(classification_report(y_test,
                            LR_pred,
                            digits=4))

Out of sample MSE: 0.35294117647058826
Out of sample RMSE: 0.5940885257860046
              precision    recall  f1-score   support

           0     0.6162    0.5907    0.6032       193
           1     0.6708    0.6940    0.6822       232

    accuracy                         0.6471       425
   macro avg     0.6435    0.6423    0.6427       425
weighted avg     0.6460    0.6471    0.6463       425



In [9]:
# Classification report for the logistic regression predictions, as a dict
Logistic = classification_report(y_test, LR_pred, digits=4, output_dict=True)

# Tabulate it with one row per class / aggregate, ordered by descending f1-score
Logistic_classification_report = (
    pd.DataFrame(Logistic)
    .transpose()
    .sort_values(by=['f1-score'], ascending=False)
)

Logistic_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
1,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0


### 2. Random Forest Classifier

In [10]:
# Create a RandomForestClassifier model (seeded with random_state=0).
# (RandomForestClassifier is already imported in the notebook's import cell, so it
# is not re-imported here.)
random_forest = RandomForestClassifier(random_state=0)

# Train it on the resampled training data
forest_model = random_forest.fit(X_resampled,
                                 y_resampled)

# Use the trained model to predict the held-out test set
forest_pred = forest_model.predict(X_test)

# Frame holding the actual labels next to the predicted ones
actual_v_pred = y_test.to_frame()
actual_v_pred["1_Day_binary_pred"] = forest_pred

# Out-of-sample mean squared error.
# For 0/1 labels this is the share of misclassified rows, i.e. 1 - accuracy.
out_of_sample_mse_rf = mean_squared_error(actual_v_pred["1_Day_binary"],
                                          actual_v_pred["1_Day_binary_pred"])
print(f"Out of sample MSE: {out_of_sample_mse_rf}")

# Out-of-sample root mean squared error
out_of_sample_rmse_rf = np.sqrt(out_of_sample_mse_rf)
print(f"Out of sample RMSE: {out_of_sample_rmse_rf}")

# Print out a classification report to evaluate performance
print(classification_report(y_test,
                            forest_pred,
                            digits=4))

Out of sample MSE: 0.35294117647058826
Out of sample RMSE: 0.5940885257860046
              precision    recall  f1-score   support

           0     0.6162    0.5907    0.6032       193
           1     0.6708    0.6940    0.6822       232

    accuracy                         0.6471       425
   macro avg     0.6435    0.6423    0.6427       425
weighted avg     0.6460    0.6471    0.6463       425



In [11]:
# Classification report for the random forest predictions, as a dict
Random = classification_report(y_test, forest_pred, digits=4, output_dict=True)

# Tabulate it with one row per class / aggregate, ordered by descending f1-score
Random_classification_report = (
    pd.DataFrame(Random)
    .transpose()
    .sort_values(by=['f1-score'], ascending=False)
)

Random_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
1,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0


### 3. Gradient Boosting Classifier

In [12]:
# Gradient boosting classifier (seeded with random_state=0), trained on the resampled data
gradient_boost = GradientBoostingClassifier(random_state=0)
gradient_model = gradient_boost.fit(X_resampled, y_resampled)

# Predictions for the held-out test set
gradient_pred = gradient_model.predict(X_test)

# Actual labels with the predictions attached as an extra column
actual_v_pred = y_test.to_frame().assign(**{"1_Day_binary_pred": gradient_pred})

# Out-of-sample MSE and its square root (RMSE)
out_of_sample_mse_gb = mean_squared_error(
    actual_v_pred["1_Day_binary"],
    actual_v_pred["1_Day_binary_pred"],
)
out_of_sample_rmse_gb = np.sqrt(out_of_sample_mse_gb)

print(f"Out of sample MSE: {out_of_sample_mse_gb}")
print(f"Out of sample RMSE: {out_of_sample_rmse_gb}")

# Text classification report for the test-set predictions
gradient_report_text = classification_report(y_test, gradient_pred, digits=4)
print(gradient_report_text)

Out of sample MSE: 0.35294117647058826
Out of sample RMSE: 0.5940885257860046
              precision    recall  f1-score   support

           0     0.6162    0.5907    0.6032       193
           1     0.6708    0.6940    0.6822       232

    accuracy                         0.6471       425
   macro avg     0.6435    0.6423    0.6427       425
weighted avg     0.6460    0.6471    0.6463       425



In [13]:
# Classification report for the gradient boosting predictions, as a dict
Gradient = classification_report(y_test, gradient_pred, digits=4, output_dict=True)

# Tabulate it with one row per class / aggregate, ordered by descending f1-score
Gradient_classification_report = (
    pd.DataFrame(Gradient)
    .transpose()
    .sort_values(by=['f1-score'], ascending=False)
)

Gradient_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
1,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0


### 4. AdaBoost Classifier

In [14]:
# AdaBoost classifier (seeded with random_state=0), trained on the resampled data
ada = AdaBoostClassifier(random_state=0)
ada_model = ada.fit(X_resampled, y_resampled)

# Predictions for the held-out test set
ada_pred = ada_model.predict(X_test)

# Actual labels with the predictions attached as an extra column
actual_v_pred = y_test.to_frame().assign(**{"1_Day_binary_pred": ada_pred})

# Out-of-sample MSE and its square root (RMSE)
out_of_sample_mse_ada = mean_squared_error(
    actual_v_pred["1_Day_binary"],
    actual_v_pred["1_Day_binary_pred"],
)
out_of_sample_rmse_ada = np.sqrt(out_of_sample_mse_ada)

print(f"Out of sample MSE: {out_of_sample_mse_ada}")
print(f"Out of sample RMSE: {out_of_sample_rmse_ada}")

# Text classification report for the test-set predictions
ada_report_text = classification_report(y_test, ada_pred, digits=4)
print(ada_report_text)

Out of sample MSE: 0.35294117647058826
Out of sample RMSE: 0.5940885257860046
              precision    recall  f1-score   support

           0     0.6162    0.5907    0.6032       193
           1     0.6708    0.6940    0.6822       232

    accuracy                         0.6471       425
   macro avg     0.6435    0.6423    0.6427       425
weighted avg     0.6460    0.6471    0.6463       425



In [15]:
# Classification report for the AdaBoost predictions, as a dict
Ada = classification_report(y_test, ada_pred, digits=4, output_dict=True)

# Tabulate it with one row per class / aggregate, ordered by descending f1-score
Ada_classification_report = (
    pd.DataFrame(Ada)
    .transpose()
    .sort_values(by=['f1-score'], ascending=False)
)

Ada_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
1,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0


### 5. XGBoost Classifier

In [16]:
# Create an XGBClassifier model and train it on the resampled training data.
# random_state=0 pins the seed explicitly, consistent with the other ensemble
# models in this notebook (random forest, gradient boosting, AdaBoost).
xgb = XGBClassifier(random_state=0)

xgb_model = xgb.fit(X_resampled, y_resampled)

# Use the trained model to predict the held-out test set
xgb_pred = xgb_model.predict(X_test)

# Frame holding the actual labels next to the predicted ones
actual_v_pred = y_test.to_frame()
actual_v_pred["1_Day_binary_pred"] = xgb_pred

# Out-of-sample mean squared error.
# For 0/1 labels this is the share of misclassified rows, i.e. 1 - accuracy.
out_of_sample_mse_xgb = mean_squared_error(actual_v_pred["1_Day_binary"],
                                           actual_v_pred["1_Day_binary_pred"])

print(f"Out of sample MSE: {out_of_sample_mse_xgb}")

# Out-of-sample root mean squared error
out_of_sample_rmse_xgb = np.sqrt(out_of_sample_mse_xgb)
print(f"Out of sample RMSE: {out_of_sample_rmse_xgb}")

# Print out a classification report to evaluate performance
print(classification_report(y_test,
                            xgb_pred,
                            digits=4))

Out of sample MSE: 0.35294117647058826
Out of sample RMSE: 0.5940885257860046
              precision    recall  f1-score   support

           0     0.6162    0.5907    0.6032       193
           1     0.6708    0.6940    0.6822       232

    accuracy                         0.6471       425
   macro avg     0.6435    0.6423    0.6427       425
weighted avg     0.6460    0.6471    0.6463       425





In [17]:
# Classification report for the XGBoost predictions, as a dict.
# Stored under `Xgb` (matching Logistic / Random / Gradient / Ada) instead of `xgb`,
# so the `xgb` estimator object created in the training cell is not overwritten by a dict.
Xgb = classification_report(y_test,
                            xgb_pred,
                            digits=4,
                            output_dict=True)

# One row per class / aggregate
xgb_classification_report = pd.DataFrame(Xgb).transpose()

# Order rows by descending f1-score
xgb_classification_report = xgb_classification_report.sort_values(by=['f1-score'], ascending=False)

xgb_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
1,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0


###

## Evaluate performance of each ML model

In [18]:
# Place the five per-model report tables side by side (one group of columns per model,
# in the order listed). The column labels repeat from group to group.
model_reports = [
    Logistic_classification_report,
    Random_classification_report,
    Gradient_classification_report,
    Ada_classification_report,
    xgb_classification_report,
]
merge_df = pd.concat(model_reports, axis=1)

# Display top 5 rows
merge_df.head()

Unnamed: 0,precision,recall,f1-score,support,precision.1,recall.1,f1-score.1,support.1,precision.2,recall.2,f1-score.2,support.2,precision.3,recall.3,f1-score.3,support.3,precision.4,recall.4,f1-score.4,support.4
1,0.670833,0.693966,0.682203,232.0,0.670833,0.693966,0.682203,232.0,0.670833,0.693966,0.682203,232.0,0.670833,0.693966,0.682203,232.0,0.670833,0.693966,0.682203,232.0
accuracy,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059,0.647059
weighted avg,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0
macro avg,0.643525,0.64232,0.642689,425.0,0.643525,0.64232,0.642689,425.0,0.643525,0.64232,0.642689,425.0,0.643525,0.64232,0.642689,425.0,0.643525,0.64232,0.642689,425.0
0,0.616216,0.590674,0.603175,193.0,0.616216,0.590674,0.603175,193.0,0.616216,0.590674,0.603175,193.0,0.616216,0.590674,0.603175,193.0,0.616216,0.590674,0.603175,193.0


In [19]:
# Keep only the "weighted avg" row; the list selector keeps the result a one-row DataFrame
weighted_avg_rows = ['weighted avg']
new_df = merge_df.loc[weighted_avg_rows]

new_df.head()

Unnamed: 0,precision,recall,f1-score,support,precision.1,recall.1,f1-score.1,support.1,precision.2,recall.2,f1-score.2,support.2,precision.3,recall.3,f1-score.3,support.3,precision.4,recall.4,f1-score.4,support.4
weighted avg,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0,0.646031,0.647059,0.646315,425.0


In [20]:
# Skeleton summary table, filled in later: one column per model, one row per report metric
models = [
    'LogisticRegression',
    'RandomForestClassifier',
    'GradientBoostingClassifier',
    'AdaBoostClassifier',
    'XGBClassifier',
]
row_names = ['precision', 'recall', 'f1-score', 'support']

df = pd.DataFrame(columns=models, index=row_names)

# Show the (still empty) table
df

Unnamed: 0,LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,XGBClassifier
precision,,,,,
recall,,,,,
f1-score,,,,,
support,,,,,


In [21]:
# Fill each model's column with its weighted-average scores.
# Each model owns one positional group of columns in the one-row `new_df`.
column_groups = {
    'LogisticRegression': slice(0, 4),
    'RandomForestClassifier': slice(4, 8),
    'GradientBoostingClassifier': slice(8, 12),
    'AdaBoostClassifier': slice(12, 16),
    'XGBClassifier': slice(16, None),
}
for model_name, group in column_groups.items():
    # Transposing turns the metric names into the index so they align with df's rows
    df[model_name] = new_df.iloc[:, group].T

df

Unnamed: 0,LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,XGBClassifier
precision,0.646031,0.646031,0.646031,0.646031,0.646031
recall,0.647059,0.647059,0.647059,0.647059,0.647059
f1-score,0.646315,0.646315,0.646315,0.646315,0.646315
support,425.0,425.0,425.0,425.0,425.0


In [22]:
# Skeleton table for the error scores: one column per model, rows for MSE and RMSE
models = [
    'LogisticRegression',
    'RandomForestClassifier',
    'GradientBoostingClassifier',
    'AdaBoostClassifier',
    'XGBClassifier',
]
row_names = ['mse', 'rmse']

df_rmse = pd.DataFrame(columns=models, index=row_names)

df_rmse

Unnamed: 0,LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,XGBClassifier
mse,,,,,
rmse,,,,,


In [23]:
# Fill each model's column with its [MSE, RMSE] pair computed in the model cells
error_scores = {
    'LogisticRegression': [out_of_sample_mse_LR, out_of_sample_rmse_LR],
    'RandomForestClassifier': [out_of_sample_mse_rf, out_of_sample_rmse_rf],
    'GradientBoostingClassifier': [out_of_sample_mse_gb, out_of_sample_rmse_gb],
    'AdaBoostClassifier': [out_of_sample_mse_ada, out_of_sample_rmse_ada],
    'XGBClassifier': [out_of_sample_mse_xgb, out_of_sample_rmse_xgb],
}
for model_name, scores in error_scores.items():
    df_rmse[model_name] = scores

df_rmse

Unnamed: 0,LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,XGBClassifier
mse,0.352941,0.352941,0.352941,0.352941,0.352941
rmse,0.594089,0.594089,0.594089,0.594089,0.594089


In [24]:
# Stack the classification-score rows on top of the MSE / RMSE rows to get
# one final table for comparing the models
score_tables = [df, df_rmse]
final_df = pd.concat(score_tables, axis="index")

final_df

Unnamed: 0,LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,XGBClassifier
precision,0.646031,0.646031,0.646031,0.646031,0.646031
recall,0.647059,0.647059,0.647059,0.647059,0.647059
f1-score,0.646315,0.646315,0.646315,0.646315,0.646315
support,425.0,425.0,425.0,425.0,425.0
mse,0.352941,0.352941,0.352941,0.352941,0.352941
rmse,0.594089,0.594089,0.594089,0.594089,0.594089
