# Final Project - Ensemble ML for SPY Price Trend Prediction

## Tuning using optuna 

### Random Forest, KNN, and SVC 

#_________________________________________________________________________________________________________________________________________

#### Import Library

In [1]:
## Data Manipulation
import numpy as np
import pandas as pd

## Visualization
import matplotlib.pyplot as plt
import plotly.io as pio
pio.kaleido.scope.mathjax = None 

## Model Building and Evaluating
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier# Base Model 1
from sklearn.neighbors import KNeighborsClassifier # Base Model 2
from sklearn.svm import SVC # Base Model 3
from xgboost import XGBClassifier # Meta Model  
from sklearn.metrics import f1_score

from sklearn.base import BaseEstimator, ClassifierMixin



## Hyperparameter Optimization
import optuna # Allows you to define an objective function, which you want to optimize
from optuna.visualization import plot_slice, plot_optimization_history
from functools import partial # Provides higher-order functions and operations on callable objects



## Import Functions and Models
from functions import *
from build_models import *

2024-01-19 18:02:00.297804: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-01-19 18:02:00.297869: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### Load Data

In [2]:
## Run configuration
rs = 55          # random state passed to set_seeds and to the models
TimeSplit = 3    # number of TimeSeriesSplit folds used by the objectives
n_trials = 100   # number of Optuna trials per study

## Seed via the project helper before anything else runs
set_seeds(seed=rs)

## Load dataset (the first CSV column is used as the index)
df = pd.read_csv('../../Data/df_final.csv', index_col=0)

## Features are every column except the last; the last column is the target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values



### Objective Functions:  
Evaluation metric: mean weighted F1-score across the time-series CV folds (maximized by Optuna)

In [4]:
###  1) Random Forest
def optimize_RF(trail, x, y):
    """Optuna objective for the Random Forest base model.

    Samples ``n_estimators``, ``max_depth`` and ``max_features`` from the
    trial, scores the resulting classifier on every fold of a gapped
    ``TimeSeriesSplit`` and returns the mean weighted F1-score.

    Parameters
    ----------
    trail : optuna.trial.Trial
        Trial supplied by ``study.optimize`` (original spelling kept so
        existing callers are unaffected).
    x : numpy.ndarray
        Feature matrix; rows are indexed with the integer arrays produced
        by the splitter.
    y : numpy.ndarray
        Target labels aligned with ``x``.

    Returns
    -------
    float
        Mean weighted F1-score over the ``TimeSplit`` folds.
    """

    ## Specify Hyperparameters range (sampling order is unchanged)
    tuned = {
        "n_estimators": trail.suggest_int("n_estimators", 100, 1500),  # number of trees
        "max_depth": trail.suggest_int("max_depth", 3, 15),  # complexity of each tree
        "max_features": trail.suggest_float("max_features", 0.01, 1.0),
    }

    ## Set TS-CV
    tscv = TimeSeriesSplit(n_splits=TimeSplit, gap=1)

    fscore = []

    ## Run for the different training and testing data from TS-CV splits
    for train_idx, test_idx in tscv.split(x):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]

        ## Build the model per fold so the class weights come from the
        ## training labels only; weighting with the full ``y`` would let the
        ## label distribution of the held-out (future) rows leak into the fit.
        ## NOTE(review): ``cwts`` is a star-imported project helper, presumably
        ## returning a class-weight mapping - confirm it accepts a label subset.
        model = RandomForestClassifier(n_jobs=-1, random_state=rs,
                                       class_weight=cwts(ytrain),  # Balance Class
                                       min_samples_leaf=1,  # Minimum number of samples in a leaf node
                                       min_samples_split=2,  # Min samples required to split a node
                                       criterion='gini',  # Use Gini impurity as the split criterion
                                       **tuned)

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)

        fscore.append(f1_score(ytest, preds, average='weighted'))

    return np.mean(fscore)


#______________________________________________________________________________

###  2) KNN

def optimize_KNN(trail, x, y):
    """Optuna objective for the K-Nearest-Neighbours base model.

    Samples ``n_neighbors`` from the trial, scores the classifier on every
    fold of a gapped ``TimeSeriesSplit`` and returns the mean weighted
    F1-score.

    Parameters
    ----------
    trail : optuna.trial.Trial
        Trial supplied by ``study.optimize``.
    x : numpy.ndarray
        Feature matrix, indexed with the integer arrays from the splitter.
    y : numpy.ndarray
        Target labels aligned with ``x``.

    Returns
    -------
    float
        Mean weighted F1-score over the ``TimeSplit`` folds.
    """

    ## Hyperparameter under search
    k = trail.suggest_int("n_neighbors", 5, 20)

    ## One estimator, refitted on each fold
    classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)

    ## Time-ordered splitter with a one-sample gap between train and test
    splitter = TimeSeriesSplit(n_splits=TimeSplit, gap=1)

    fold_scores = []
    for fit_rows, eval_rows in splitter.split(x):
        classifier.fit(x[fit_rows], y[fit_rows])
        predicted = classifier.predict(x[eval_rows])
        fold_scores.append(f1_score(y[eval_rows], predicted, average='weighted'))

    return np.mean(fold_scores)



#______________________________________________________________________________


###  3) SVC

def optimize_SVC(trail, x, y):
    """Optuna objective for the Support Vector Classifier base model.

    Samples ``kernel`` and ``C`` from the trial, scores the resulting
    classifier on every fold of a gapped ``TimeSeriesSplit`` and returns the
    mean weighted F1-score.

    Parameters
    ----------
    trail : optuna.trial.Trial
        Trial supplied by ``study.optimize`` (original spelling kept so
        existing callers are unaffected).
    x : numpy.ndarray
        Feature matrix; rows are indexed with the integer arrays produced
        by the splitter.
    y : numpy.ndarray
        Target labels aligned with ``x``.

    Returns
    -------
    float
        Mean weighted F1-score over the ``TimeSplit`` folds.
    """

    ## Specify Hyperparameters range (sampling order is unchanged)
    tuned = {
        "kernel": trail.suggest_categorical("kernel", ['linear', 'rbf']),
        "C": trail.suggest_float("C", 0.1, 10),  # larger C implies less regularization
    }

    ## Set TS-CV
    tscv = TimeSeriesSplit(n_splits=TimeSplit, gap=1)

    fscore = []

    ## Run for the different training and testing data from TS-CV splits
    for train_idx, test_idx in tscv.split(x):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]

        ## Build the model per fold so the class weights come from the
        ## training labels only; weighting with the full ``y`` would let the
        ## label distribution of the held-out (future) rows leak into the fit.
        ## NOTE(review): ``cwts`` is a star-imported project helper, presumably
        ## returning a class-weight mapping - confirm it accepts a label subset.
        ## NOTE(review): probability=True is kept from the original setup; it
        ## makes each fit slower and is not required by predict() below.
        model = SVC(probability=True,  # Allow probability estimate
                    class_weight=cwts(ytrain),  # Balance Class
                    **tuned)

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)

        fscore.append(f1_score(ytest, preds, average='weighted'))

    return np.mean(fscore)



## 1 - Tuning Random Forest

In [None]:
if __name__ == "__main__":

    ## Bind the data to the objective so Optuna only has to supply the trial
    optimization_function = partial(optimize_RF, x=X, y=y)

    ## Create a study that maximizes the objective. The sampler is seeded
    ## explicitly with the notebook's random state: Optuna only produces a
    ## reproducible search when the sampler itself is given a seed.
    rf_study = optuna.create_study(study_name='rf_optuna',
                                   direction="maximize",
                                   sampler=optuna.samplers.TPESampler(seed=rs))

    ## Perform n optimization trials using the Optimization function
    rf_study.optimize(optimization_function, n_trials=n_trials)

    ## Save Best Parameters as a one-row table, one column per hyperparameter
    rf_opt_params = pd.DataFrame({name: [rf_study.best_params[name]]
                                  for name in ('n_estimators', 'max_depth', 'max_features')})

    ## Save to CSV
    rf_opt_params.to_csv('../Output/rf_hyperparam_tun.csv', index=False)

In [None]:
## Plot Hyperparameters
## Build the figure once and reuse it for both the PDF export and the cell output
rf_slice_fig = plot_slice(rf_study)
rf_slice_fig.write_image("../../Plot/Tuning/hyp_set_rf.pdf")
rf_slice_fig

In [None]:
 ## Plot - Optimization History
## Build the figure once and reuse it for both the PDF export and the cell output
rf_history_fig = plot_optimization_history(rf_study)
rf_history_fig.write_image("../../Plot/Tuning/opt_hist_rf.pdf")
rf_history_fig

## 2 - Tuning KNN

In [None]:
if __name__ == "__main__":

    ## Bind the data to the objective so Optuna only has to supply the trial
    optimization_function = partial(optimize_KNN, x=X, y=y)

    ## Create a study that maximizes the objective. The sampler is seeded
    ## explicitly with the notebook's random state: Optuna only produces a
    ## reproducible search when the sampler itself is given a seed.
    knn_study = optuna.create_study(study_name='knn_optuna',
                                    direction="maximize",
                                    sampler=optuna.samplers.TPESampler(seed=rs))

    ## Perform n optimization trials using the Optimization function
    knn_study.optimize(optimization_function, n_trials=n_trials)

    ## Save Best Parameters as a one-row table
    knn_opt_params = pd.DataFrame({'n_neighbors': [knn_study.best_params['n_neighbors']]})

    ## Save to CSV
    knn_opt_params.to_csv('../Output/knn_hyperparam_tun.csv', index=False)

In [None]:
## Plot Hyperparameters
## Build the figure once and reuse it for both the PDF export and the cell output
knn_slice_fig = plot_slice(knn_study)
knn_slice_fig.write_image("../../Plot/Tuning/hyp_set_knn.pdf")
knn_slice_fig

In [None]:
 ## Plot - Optimization History
## Build the figure once and reuse it for both the PDF export and the cell output
knn_history_fig = plot_optimization_history(knn_study)
knn_history_fig.write_image("../../Plot/Tuning/opt_hist_knn.pdf")
knn_history_fig

## 3 - Tuning SVC

In [None]:
if __name__ == "__main__":

    ## Bind the data to the objective so Optuna only has to supply the trial
    optimization_function = partial(optimize_SVC, x=X, y=y)

    ## Create a study that maximizes the objective. The sampler is seeded
    ## explicitly with the notebook's random state: Optuna only produces a
    ## reproducible search when the sampler itself is given a seed.
    svc_study = optuna.create_study(study_name='svc_optuna',
                                    direction="maximize",
                                    sampler=optuna.samplers.TPESampler(seed=rs))

    ## Perform n optimization trials using the Optimization function
    svc_study.optimize(optimization_function, n_trials=n_trials)

    ## Save Best Parameters as a one-row table, one column per hyperparameter
    svc_opt_params = pd.DataFrame({name: [svc_study.best_params[name]]
                                   for name in ('kernel', 'C')})

    ## Save to CSV
    svc_opt_params.to_csv('../Output/svc_hyperparam_tun.csv', index=False)

In [None]:
## Plot Hyperparameters
## Build the figure once and reuse it for both the PDF export and the cell output
svc_slice_fig = plot_slice(svc_study)
svc_slice_fig.write_image("../../Plot/Tuning/hyp_set_svc.pdf")
svc_slice_fig

In [None]:
 ## Plot - Optimization History
## Build the figure once and reuse it for both the PDF export and the cell output
svc_history_fig = plot_optimization_history(svc_study)
svc_history_fig.write_image("../../Plot/Tuning/opt_hist_svc.pdf")
svc_history_fig