# Cleaning data to get the final table: merged_df_4

In [4]:
import pandas as pd
import numpy as np

In [5]:
%store -r merged_df_4

In [6]:
merged_df_4.tail()

Unnamed: 0,Date,MA,Close,RSI,kc_middle,kc_upper,kc_lower,BB_upperband,BB_lowerband,SMA,left_shoulder,right_shoulder,left_shoulder_diff,right_shoulder_diff,head_shoulders,Price_Movement
10304,2022-02-17,172.6178,168.880005,46.780097,170.007667,178.210924,161.804409,181.03647,157.98353,169.51,176.649994,170.539993,0.0,0.034588,1,0
10305,2022-02-18,172.4622,167.300003,44.462683,169.87873,178.13166,161.6258,180.846092,158.662908,169.7545,176.649994,168.910004,0.0,0.043815,1,0
10306,2022-02-22,172.2574,164.320007,40.397843,169.614029,178.071668,161.156391,180.624198,159.154803,169.8895,176.649994,169.419998,0.0,0.040928,1,0
10307,2022-02-23,171.8698,160.070007,35.423888,169.159552,178.051425,160.267678,180.581833,159.226169,169.904001,176.649994,172.639999,0.0,0.0227,1,0
10308,2022-02-24,171.6098,162.740005,40.389504,168.853859,179.026547,158.681172,180.193545,159.919457,170.056501,176.649994,174.139999,-0.006667,0.007636,1,1


# Now that the data has been cleaned, we can proceed to start with testing out various ML Models. 
## The ML model will take in the following input variables -
Moving Average, Relative Strength Index, Keltner Channel (Upper, Middle and Lower bounds), Bollinger Bands (Upper and Lower bounds), Simple Moving Average and the presence of head and shoulders pattern

# [Model 1]: Random Forest with hyperparameters

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the features and target variable
features = ['MA', 'RSI', 'kc_middle', 'kc_upper', 'kc_lower', 'BB_upperband', 'BB_lowerband', 'SMA', 'head_shoulders']
target = 'Price_Movement'

# Split the data into training and testing sets
train_size = int(len(merged_df_4) * 0.8)
train_df = merged_df_4.iloc[:train_size]
test_df = merged_df_4.iloc[train_size:]

X_train = train_df[features]
y_train = train_df["Price_Movement"]

X_test = test_df[features]
y_test = test_df["Price_Movement"]
# Define the hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

# Create a Random Forest classifier
rfc = RandomForestClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Make predictions on the testing data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)


Best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best score: 0.5753607372377835
Accuracy: 0.6115421920465567
Precision: 0.6147082334132694
Recall: 0.7068014705882353
F1-score: 0.6575459598118853


# [Model 2]: Support Vector Machine with hyperparameters

In [45]:
merged_df_4.tail()

Unnamed: 0,Date,MA,RSI,kc_middle,kc_upper,kc_lower,BB_upperband,BB_lowerband,SMA,left_shoulder,right_shoulder,left_shoulder_diff,right_shoulder_diff,head_shoulders,Price_Movement
10304,2022-02-17,172.6178,46.780097,170.007667,178.210924,161.804409,181.03647,157.98353,169.51,176.649994,170.539993,0.0,0.034588,1,0
10305,2022-02-18,172.4622,44.462683,169.87873,178.13166,161.6258,180.846092,158.662908,169.7545,176.649994,168.910004,0.0,0.043815,1,0
10306,2022-02-22,172.2574,40.397843,169.614029,178.071668,161.156391,180.624198,159.154803,169.8895,176.649994,169.419998,0.0,0.040928,1,0
10307,2022-02-23,171.8698,35.423888,169.159552,178.051425,160.267678,180.581833,159.226169,169.904001,176.649994,172.639999,0.0,0.0227,1,0
10308,2022-02-24,171.6098,40.389504,168.853859,179.026547,158.681172,180.193545,159.919457,170.056501,176.649994,174.139999,-0.006667,0.007636,1,1


In [46]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Create the feature matrix X and target vector y from your merged_df_4 dataframe
y = merged_df_4['Price_Movement']

# Split the data into training and testing sets
train_size = int(len(merged_df_4) * 0.8)
train_df = merged_df_4.iloc[:train_size]
test_df = merged_df_4.iloc[train_size:]

X_train = train_df[features]
y_train = train_df["Price_Movement"]

X_test = test_df[features]
y_test = test_df["Price_Movement"]
# Scale the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the SVM classifier and fit it to the training data
svc = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
svc.fit(X_train, y_train)

# Predict the price movement for the testing data
y_pred = svc.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5683802133850631


# [Model 3]: XGBoost with hyperparameters

In [47]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the features and target variable
features = ['MA', 'RSI', 'kc_middle', 'kc_upper', 'kc_lower', 'BB_upperband', 'BB_lowerband', 'left_shoulder', 'right_shoulder', 'head_shoulders']
target = 'Price_Movement'

# Split the data into training and testing sets
train_size = int(len(merged_df_4) * 0.8)
train_df = merged_df_4.iloc[:train_size]
test_df = merged_df_4.iloc[train_size:]

X_train = train_df[features]
y_train = train_df["Price_Movement"]

X_test = test_df[features]
y_test = test_df["Price_Movement"]

# Set up the parameter grid for GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 150, 200],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1],
}

# Create the XGBoost classifier and perform GridSearchCV
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_grid = GridSearchCV(xgb_model, param_grid=param_grid, cv=5, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = xgb_grid.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('Best hyperparameters:', xgb_grid.best_params_)
print('Best score:', xgb_grid.best_score_)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.6115421920465567
Best hyperparameters: {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
Best score: 0.5908785122296341
Precision: 0.6184971098265896
Recall: 0.6884191176470589
F1 score: 0.6515876468029578


## [Model 4]: Decision Tree with hyperparameters

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Create the feature matrix X and target vector y from your merged_df_4 dataframe


X = merged_df_4.drop(columns=['Price_Movement', 'Date'])
y = merged_df_4['Price_Movement']

X_size = int(len(X) * 0.8)
X_train = X.iloc[:X_size]
X_test = X.iloc[X_size:]

y_size = int(len(y) * 0.8)
y_train = y.iloc[:y_size]
y_test = y.iloc[y_size:]

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the hyperparameter space to search
param_dist = {'criterion': ['gini', 'entropy'],
              'max_depth': randint(1, 10),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 10)}

# Create the decision tree classifier and perform a randomized search to find the best hyperparameters
dtc = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(dtc, param_distributions=param_dist, n_iter=100, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters and their corresponding score
print("Best hyperparameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Use the best hyperparameters to create a new decision tree classifier and fit it to the training data
best_dtc = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=3, min_samples_split=12, random_state=42)
best_dtc.fit(X_train, y_train)

# Predict the price movement for the testing data
y_pred = best_dtc.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Best hyperparameters: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 8}
Best score: 0.6049456603634894
Accuracy: 0.6081474296799224


# [Model 5]: Logistic Regression 

In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

features = ['MA', 'RSI', 'kc_middle', 'kc_upper', 'kc_lower', 'BB_upperband', 'SMA', 'BB_lowerband', 'left_shoulder', 'right_shoulder', 'head_shoulders']

train_size = int(len(merged_df_4) * 0.8)
train_df = merged_df_4.iloc[:train_size]
test_df = merged_df_4.iloc[train_size:]

X_train = train_df[features]
y_train = train_df["Price_Movement"]

X_test = test_df[features]
y_test = test_df["Price_Movement"]

# Fit logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict labels for testing set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5906886517943744


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
