# Train and Test different machine learning algorithms

In [1]:
# For parameters
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Models 
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

# Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Import machine learning model class
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

###

## Prepare data for machine learning

In [2]:
# Load the classification returns dataset, using the "Date" column as the
# index and parsing it into datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (it only triggers a warning there) and `parse_dates=True` is
# sufficient to get a parsed date index.
returns = pd.read_csv(
    "../Data/Returns_ForML_Classification.csv",
    index_col="Date",
    parse_dates=True,
)

# Preview the first rows to confirm the columns and the date index
returns.head()

Unnamed: 0_level_0,level_0,1_Day_returns,5_Day_returns,10_Day_returns,1_Day_binary,5_Day_binary,10_Day_binary
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-19,AMZN,-1.195972,-0.571746,-1.131033,0,0,0
2016-01-20,AMZN,-1.625508,-20.423698,-0.060992,0,0,0
2016-01-21,AMZN,-2.204963,-1.097343,-4.160241,0,0,0
2016-01-22,AMZN,5.535157,0.931386,-1.951061,1,1,0
2016-01-25,AMZN,-0.993228,-1.006537,-1.171831,0,0,0


In [3]:
# Target: the 1-day binary label
y = returns["1_Day_binary"]

# Features: everything except the `level_0` label column (it holds ticker
# symbols such as AMZN in the preview), the raw return columns, and the target.
# NOTE(review): in the preview this leaves only 5_Day_binary and 10_Day_binary
# as features -- confirm that these are meant to be model inputs.
X = returns.drop(
    [
        "level_0",
        "1_Day_returns",
        "5_Day_returns",
        "10_Day_returns",
        "1_Day_binary",
    ],
    axis=1,
)

In [4]:
# Hold out 30% of the rows for testing. shuffle=False keeps the original row
# order, so the split is sequential rather than random.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

In [5]:
# Count how many 0s and 1s are in y_train to check the class balance
# before any resampling is applied
Counter(y_train)

Counter({0: 713, 1: 274})

In [6]:
# Oversample the training data with RandomOverSampler so both classes end up
# with the same number of rows. random_state=1 makes the resampling repeatable.
# Only the training split is resampled; X_test / y_test are left untouched.
ros = RandomOverSampler(random_state=1)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [7]:
# Count the classes again on the resampled target to verify that the
# imbalance has been removed
Counter(y_train_resampled)

Counter({0: 713, 1: 713})

###

## Run machine learning algorithms - Regressors

### 1. Linear Regression
##### Linear Regression predicts the value of a dependent variable Y from an independent variable X, provided that a linear relationship exists between them. This relationship can be represented by a straight line. With more than one independent variable, the algorithm is called multiple linear regression.

In [8]:
# Fit a LinearRegression model on the oversampled training data
lr = LinearRegression()
lr_model = lr.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched test split with the fitted model
y_pred_lr = lr_model.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Compute the three metrics up front; RMSE is the square root of the MSE
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Report R2, MSE and RMSE
print(f"The coefficient of determination R2 is {r2_lr}")
print(f"The Mean Squared Error is {mse_lr}")
print(f"The Root Mean Squared Error is {rmse_lr}")

The coefficient of determination R2 is -0.34980182803531146
The Mean Squared Error is 0.24533896155103088
The Root Mean Squared Error is 0.4953170313557075


### 2. Random Forest 

##### A decision tree regressor builds a tree incrementally by splitting the dataset into subsets, which results in a tree with decision nodes and leaf nodes. A decision node has two or more branches, each representing values for the attribute tested. A leaf node represents the decision on the numerical target. The topmost node is called the root node and corresponds to the best predictor. A random forest regressor trains many such trees, each on a bootstrap sample of the training data, and averages their predictions.

In [9]:
# Instantiate the random forest regressor.
# random_state is fixed because the forest is built from random bootstrap
# samples and random feature choices; without a seed the metrics below change
# on every run and cannot be reproduced. The value 1 matches the seed used for
# RandomOverSampler earlier in the notebook.
rf = RandomForestRegressor(random_state=1)

# Fit the model on the oversampled training data
rf_model = rf.fit(X_train_resampled, y_train_resampled)

# Make predictions using the testing data
y_pred_rf = rf_model.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Get the coefficient of determination R2
r2_rf = r2_score(y_test, y_pred_rf)
print(f"The coefficient of determination R2 is {r2_rf}")

# Get the Mean Squared Error
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"The Mean Squared Error is {mse_rf}")

# Get the Root Mean Squared Error (square root of the MSE)
rmse_rf = np.sqrt(mse_rf)
print(f"The Root Mean Squared Error is {rmse_rf}")

The coefficient of determination R2 is -0.35842932870105915
The Mean Squared Error is 0.24690708956075233
The Root Mean Squared Error is 0.4968974638300666


### 3. Extra Trees

##### The Extra Trees regressor (short for extremely randomized trees) is built differently from classic decision trees because of its strategy for splitting nodes. At each node it selects up to max_features features at random, draws a random split for each of them, and chooses the best split among those. When max_features is set to 1, it builds a totally random decision tree every time.

In [10]:
# Instantiate the extra-trees regressor.
# random_state is fixed because split candidates are drawn at random; without
# a seed the metrics below change on every run and cannot be reproduced.
# The value 1 matches the seed used for RandomOverSampler earlier in the notebook.
etr = ExtraTreesRegressor(random_state=1)

# Fit the model on the oversampled training data
etr.fit(X_train_resampled, y_train_resampled)

# Make predictions using the testing data
y_pred_etr = etr.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Get the coefficient of determination R2
r2_etr = r2_score(y_test, y_pred_etr)
print(f"The coefficient of determination R2 is {r2_etr}")

# Get the Mean Squared Error
mse_etr = mean_squared_error(y_test, y_pred_etr)
print(f"The Mean Squared Error is {mse_etr}")

# Get the Root Mean Squared Error (square root of the MSE)
rmse_etr = np.sqrt(mse_etr)
print(f"The Root Mean Squared Error is {rmse_etr}")

The coefficient of determination R2 is -0.3614842803484184
The Mean Squared Error is 0.24746235526656532
The Root Mean Squared Error is 0.4974558827339017


### 4. Lasso Regression

##### Least Absolute Shrinkage and Selection Operator (LASSO) is a modification of the least squares method that performs very well when the number of features is small compared to the number of observations. It produces solutions by estimating sparse coefficients. It uses an L1 penalty, which is equal to the sum of the absolute values of the coefficients. It performs feature selection and shrinkage by reducing some coefficients to exactly zero.

In [11]:
# Fit a Lasso model (default settings) on the oversampled training data
lasso = Lasso()
lasso_model = lasso.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched test split with the fitted model
y_pred_lasso = lasso_model.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Compute the three metrics up front; RMSE is the square root of the MSE
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

# Report R2, MSE and RMSE
print(f"The coefficient of determination R2 is {r2_lasso}")
print(f"The Mean Squared Error is {mse_lasso}")
print(f"The Root Mean Squared Error is {rmse_lasso}")

The coefficient of determination R2 is -0.3754458520386199
The Mean Squared Error is 0.25
The Root Mean Squared Error is 0.5


### 5. Ridge Regression

##### Ridge Regression is a form of regularized linear regression that performs very well when the number of features is small compared to the number of observations. It belongs to the class of regression tools that use L2 regularization, which adds an L2 penalty equal to the sum of the squared coefficients. It can’t zero out coefficients; thus, it either includes all coefficients or none of them.

In [12]:
# Fit a Ridge model (default settings) on the oversampled training data
ridge = Ridge()
ridge_model = ridge.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched test split with the fitted model
y_pred_ridge = ridge_model.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Compute the three metrics up front; RMSE is the square root of the MSE
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)

# Report R2, MSE and RMSE
print(f"The coefficient of determination R2 is {r2_ridge}")
print(f"The Mean Squared Error is {mse_ridge}")
print(f"The Root Mean Squared Error is {rmse_ridge}")

The coefficient of determination R2 is -0.34974207860155704
The Mean Squared Error is 0.24532810153904533
The Root Mean Squared Error is 0.4953060685465557


### 6. Stochastic Gradient Descent

In [13]:
# Instantiate the stochastic gradient descent regressor.
# random_state is fixed because SGD shuffles the training data between epochs;
# without a seed the metrics below change on every run and cannot be
# reproduced. The value 1 matches the seed used for RandomOverSampler earlier
# in the notebook.
sgd = SGDRegressor(random_state=1)

# Fit the model on the oversampled training data
sgd_model = sgd.fit(X_train_resampled, y_train_resampled)

# Make predictions using the testing data
y_pred_sgd = sgd_model.predict(X_test)

# ------------------------------ Model Evaluation ------------------------------

# Get the coefficient of determination R2
r2_sgd = r2_score(y_test, y_pred_sgd)
print(f"The coefficient of determination R2 is {r2_sgd}")

# Get the Mean Squared Error
mse_sgd = mean_squared_error(y_test, y_pred_sgd)
print(f"The Mean Squared Error is {mse_sgd}")

# Get the Root Mean Squared Error (square root of the MSE)
rmse_sgd = np.sqrt(mse_sgd)
print(f"The Root Mean Squared Error is {rmse_sgd}")

The coefficient of determination R2 is -0.34372419583174363
The Mean Squared Error is 0.244234295708577
The Root Mean Squared Error is 0.49420066340361885


###

## Summary of Model Performance

In [15]:
# Collect every model's metrics into one table: one column per model, one row
# per metric. Each list below is ordered [MSE, RMSE, R2].
data = {
    'Linear Regression': [mse_lr, rmse_lr, r2_lr],
    'Random Forest': [mse_rf, rmse_rf, r2_rf],
    'Extra Trees': [mse_etr, rmse_etr, r2_etr],
    'Lasso Regression': [mse_lasso, rmse_lasso, r2_lasso],
    'Ridge Regression': [mse_ridge, rmse_ridge, r2_ridge],
    # Label fixed: the algorithm is Stochastic Gradient *Descent*, and the
    # stray leading space in the column name has been removed.
    'Stochastic Gradient Descent': [mse_sgd, rmse_sgd, r2_sgd],
}

# Row labels must be an ordered sequence that matches the [MSE, RMSE, R2]
# order of the value lists above. A set has no guaranteed iteration order, so
# using one here could attach the labels to the wrong rows.
index = ["Mean Squared Error", "Root Mean Squared Error", "R2"]

model_summary = pd.DataFrame(data, index=index)
model_summary

Unnamed: 0,Linear Regression,Random Forest,Extra Trees,Lasso Regression,Ridge Regression,Stochastic Gradient Design
Mean Squared Error,0.245339,0.246907,0.247462,0.25,0.245328,0.244234
Root Mean Squared Error,0.495317,0.496897,0.497456,0.5,0.495306,0.494201
R2,-0.349802,-0.358429,-0.361484,-0.375446,-0.349742,-0.343724


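##### All R² values are negative for the models selected above, indicating that the regression models do not follow the trend of the data and fit the test set worse than a horizontal line at the mean of the test target. On training data, a negative R² usually only occurs when there are constraints on either the intercept or the slope of the regression line; on held-out data, as here, it occurs whenever the predictions are worse than simply predicting the test mean. A likely contributor is that the models were trained on a rebalanced (50/50) target, while the test split was not resampled, so predictions are centered near 0.5 rather than near the test-set mean. In addition, the target is a binary label, so classification models and metrics would be a more natural fit than regressors scored with R².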