# FINTECH BOOTCAMP - PROJECT 2
## Group 2 Notebook
---
By applying machine learning models, we examine (1) whether selected technical indicators can predict stock direction at a statistically significant level, (2) which model performs best, (3) whether the model can be optimized, and (4) which time frame yields the best results.

In [None]:
# Initial import all libraries and dependencies
import yfinance as yf
import matplotlib.dates as mdates
import panel as pn
import datetime
import numpy as np
import pandas as pd
import hvplot.pandas
from finta import TA
# from pandas_datareader import data
import matplotlib.pyplot as plt

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


# Machine learning libraries
from sklearn.preprocessing import StandardScaler

### I. DATA FETCHING AND CLEANING

In [None]:
# Candidate instruments (stocks/ETFs) whose price history can be downloaded.
# NOTE(review): the trailing "..." entry is a placeholder, not a real symbol;
# the final list is still to be decided. Only tickers[0] is downloaded by the
# yf.download(...) call in the next cell.
tickers = ["AAPL", "TSLA", "MSFT", "SPY", "..."] # to be decided

# Download window (1/1/2017 through 12/31/2021) and the bar interval that are
# passed to yf.download(...). '1d' is presumably one bar per trading day --
# confirm against the yfinance documentation.
start_date = datetime.date(2017,1,1)
end_date = datetime.date(2021,12,31)
interval = '1d'

In [None]:
# Download the price history of the first ticker with yfinance.
# (The previous bare `yf.Tickers(tickers[0])` call discarded its result and
# has been removed.)
#
# auto_adjust=False is passed explicitly so the frame keeps both "Close" and
# "Adj Close": ohlcv() below drops "Close" and renames "Adj Close" to "close".
# Relying on the library default is fragile because that default differs
# between yfinance releases.
panel_data = yf.download(
    tickers[0],
    start=start_date,
    end=end_date,
    interval=interval,
    auto_adjust=False,
)

# Some yfinance releases return two-level (field, ticker) columns even for a
# single symbol; keep only the field level so "Open", "High", ... can be
# addressed by plain name in the following cells.
if isinstance(panel_data.columns, pd.MultiIndex):
    panel_data.columns = panel_data.columns.get_level_values(0)

# Checkout the data type
type(panel_data)

In [None]:
# Review data
panel_data.head(5)

In [None]:
# Data description and check if null
def data_description(df):
    """Print a short report about *df* followed by a separator line.

    The report is whatever ``DataFrame.info()`` emits (index, columns,
    non-null counts, dtypes), which is what the notebook uses to decide
    whether the downloaded data needs cleaning.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        None. Output goes to standard output only.
    """
    print("Data Information")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    df.info()
    print("-" * 50)

In [None]:
data_description(panel_data) # if there are 0 nulls, OHLC columns are float and Volume is int, the data is clean enough to proceed to Part II

In [None]:
# If the data is not clean, drop nulls or convert data types
# def data_cleaning(df):
#     df.dropna()

In [None]:
# Convert to ohlcv dataframe to be ready for finta
def ohlcv(df):
    """Normalise a price frame to lower-case OHLCV column names, in place.

    When both "Close" and "Adj Close" are present, "Close" is dropped and
    "Adj Close" becomes ``close``. When only one of them is present, that one
    becomes ``close``. "Open", "High", "Low" and "Volume" are lower-cased.
    Columns that are already lower-case are left untouched, so calling the
    function twice on the same frame is harmless.

    Args:
        df: DataFrame with "Open"/"High"/"Low"/"Volume" and "Close" and/or
            "Adj Close" columns. It is modified in place.

    Returns:
        The same DataFrame object that was passed in. (The earlier version
        assigned the result of ``rename(..., inplace=True)``, which is None,
        and therefore always returned None.)
    """
    # Only drop the raw close when the adjusted close can replace it;
    # otherwise the frame would be left without any close column.
    if "Adj Close" in df.columns and "Close" in df.columns:
        del df["Close"]

    df.rename(
        columns={
            "Open": "open",
            "High": "high",
            "Low": "low",
            "Close": "close",
            "Adj Close": "close",
            "Volume": "volume",
        },
        inplace=True,
    )
    return df

In [None]:
ohlcv(panel_data)
panel_data

### II. DATA PROCESSING AND PREPARATION

In [None]:
# Timeframe for prediction
time_frame = [3,5,7]

# Identify stock direction
def stock_direction(df, days):
    """Label each row 1 if the close `days` rows later is higher, else 0.

    Args:
        df: DataFrame with a 'close' column.
        days: Look-ahead horizon in rows (the prediction time frame).

    Returns:
        An integer Series (the y values) without the last `days` rows, since
        those rows have no future close to compare against.
    """
    close = df['close']
    future_close = close.shift(-days)
    is_higher = future_close.gt(close)
    return is_higher.iloc[:-days].astype(int)

In [None]:
stock_direction(panel_data,time_frame[0]) # y values

In [None]:
# Using Finta calculate technical indicators
# Define key window to calculate for technical analysis 
window = [5,14,21,50]


def technical_indicators(df, periods=None):
    """Build the technical-indicator feature table with finta.

    For every look-back period the following indicator outputs are placed
    side by side, in this order: BBANDS, RSI, PIVOT_FIB, OBV, SMA, EMA, ROC,
    WILLIAMS. The groups for successive periods are then concatenated
    column-wise.

    PIVOT_FIB and OBV are called without a period, so their columns are the
    same in every group. They are still repeated once per period on purpose:
    the LSTM section selects features and the target by column *position*,
    so the column layout must not change.

    Indicator implementations:
    https://github.com/peerchemist/finta/blob/master/finta/finta.py

    Args:
        df: OHLCV DataFrame in the lower-case layout finta expects.
        periods: Iterable of look-back periods. Defaults to the module-level
            ``window`` list.

    Returns:
        A DataFrame of all indicator columns, or an empty DataFrame when no
        periods are given. NOTE(review): column names presumably repeat
        across periods for some indicators -- select by position, not name.
    """
    periods = list(window if periods is None else periods)
    if not periods:
        return pd.DataFrame()

    # These two do not depend on the look-back period: compute them once and
    # reuse the result in every group instead of recomputing per period.
    pivot_fib = TA.PIVOT_FIB(df)
    obv = TA.OBV(df)

    frames = []
    for period in periods:
        frames.extend([
            TA.BBANDS(df, period),
            TA.RSI(df, period),
            pivot_fib,
            obv,
            TA.SMA(df, period),
            TA.EMA(df, period),
            TA.ROC(df, period),
            TA.WILLIAMS(df, period),
        ])

    # One concat over all pieces instead of growing a frame inside the loop.
    return pd.concat(frames, axis=1)

In [None]:
technical_indicators(panel_data)

In [None]:
def consol_data(df, days):
    """Combine indicator features and the direction label into one table.

    Args:
        df: OHLCV DataFrame passed to technical_indicators() and
            stock_direction().
        days: Look-ahead horizon passed to stock_direction().

    Returns:
        The indicator columns plus a trailing "direction" column, with every
        row that contains a missing value removed.
    """
    features = technical_indicators(df)
    # The label is aligned on the index; rows without a label become NaN and
    # are removed together with the indicator warm-up rows.
    labelled = features.assign(direction=stock_direction(df, days))
    return labelled.dropna()

In [None]:
data = consol_data(panel_data,time_frame[0])
data.info()

In [None]:
data.columns

### III. CHOOSING MODELS AND TRAINING MODEL (INDIVIDUAL WORK)

#### 1. Model 1......

## LSTM

In [None]:
# Set the random seed for reproducibility
# Note: This is used for model prototyping, but it is good practice to comment this out and run multiple experiments to evaluate your model.
from numpy.random import seed

# Seed NumPy's global random generator.
seed(1)
# NOTE(review): from here on the name `random` refers to tensorflow.random,
# not Python's standard-library `random` module.
from tensorflow import random

# Seed TensorFlow's global random generator.
random.set_seed(2)

In [None]:
def window_data(df, window, feature_col_number, target_col_number):
    """
    Slice a DataFrame into rolling windows of features with a following target.

    For every start position s, the features are rows s .. s + window - 1 of
    the feature column(s) and the target is row s + window of the target
    column(s). Columns are selected by position.

    Returns a tuple (X, y) of numpy arrays; y is reshaped to a single column.
    """
    starts = range(len(df) - window)
    feature_windows = [
        df.iloc[s : s + window, feature_col_number] for s in starts
    ]
    targets = [df.iloc[s + window, target_col_number] for s in starts]
    return np.array(feature_windows), np.array(targets).reshape(-1, 1)

In [None]:
# Create the features (X) and target (y) arrays using the window_data() function.
# Each sample holds `window_size` consecutive rows of the selected columns.
window_size = 30
# Positional indices of the columns judged most important
# (S1, S4, 21 period SMA, 14 period RSI, BB LOWER, ROC).
# NOTE(review): these numbers depend on the exact column order produced by
# technical_indicators(); re-check them whenever that layout changes.
feature_column = [8,26,44,62,50,21,2,20,38,56,16,34,52,5,23,41,59]
# Position 72 is expected to be the "direction" column that consol_data()
# appends after the indicator columns -- TODO confirm with data.columns.
target_column =[72]
X, y = window_data(data, window_size, feature_column, target_column)
print (f"X sample values:\n{X[:5]} \n")
print (f"y sample values:\n{y[:5]}")

In [None]:
X.shape

In [None]:
# Chronological split: the first 70% of the samples are used for training and
# the remainder for testing (no shuffling).
split = int(0.7 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


In [None]:
X_train.shape

### Reshape the X arrays into 2D arrays, as MinMaxScaler only accepts 2D arrays (y is already 2D)

In [None]:
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))

In [None]:
nsamples, nx, ny = X_test.shape
X_test=X_test.reshape((nsamples,nx*ny))
# X_test

##### Why should we scale target variables in regression problems?

###### A target variable with a large spread of values may produce large error gradients, causing the weights to change dramatically and making the learning process unstable. (Here the target is a binary 0/1 direction label, so min-max scaling leaves it effectively unchanged.)

-----

In [None]:
# Use MinMaxScaler to scale data between 0 and 1.
from sklearn.preprocessing import MinMaxScaler

# Features and target each get their own scaler. Refitting a single scaler on
# y_train would overwrite the parameters learned from X_train, so new feature
# data could no longer be transformed consistently.
# Both scalers are fitted on the training portion only, then applied to test.
feature_scaler = MinMaxScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# `scaler` keeps its previous end state: it is the scaler fitted on y_train.
scaler = MinMaxScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

### Reducing Dimensions Using PCA

In [None]:
# Use PCA to reduce dimensions to 3 principal components
# The input rows are the flattened (window x feature) samples produced by the
# 2D reshape cells above.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
# Fit the projection on the training data only, then apply the same
# projection to the test data.
X_train= pca.fit_transform(X_train)
X_test=pca.transform(X_test)
print(X_train)

In [None]:
pca.explained_variance_ratio_

### Reshape Features Data for the LSTM Model

The Keras LSTM layer expects 3D input of shape `(samples, time steps, features)`, so we add a trailing axis of size 1 by reshaping the `X` data with `reshape((X_train.shape[0], X_train.shape[1], 1))`.

Both sets, training, and testing are reshaped.

In [None]:
# Reshape the features for the model
# A trailing axis of size 1 is added, turning each 2D array (rows, columns)
# into a 3D array (rows, columns, 1) to match the input_shape given to the
# first LSTM layer.
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
print (f"X_train sample values:\n{X_train[:5]} \n")
print (f"X_test sample values:\n{X_test[:5]}")

### Build and Train the LSTM RNN

In this section, we will design a custom LSTM RNN in Keras and fit (train) it using the training data we defined.

We need to:

1. Define the model architecture in Keras.

2. Compile the model.

3. Fit the model with the training data.

In [None]:
# Importing required Keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [None]:
# X_train.shape[1]

* `Dropout`: Dropout is a regularization technique for reducing overfitting in neural networks. This type of layer applies the dropout technique to the input.

In [None]:
# Define the LSTM RNN model.

# Hyper-parameters shared by all recurrent layers
number_units = 30
dropout_fraction = 0.5

# Three stacked LSTM layers, each followed by dropout, and a single sigmoid
# output unit. The first two LSTM layers return full sequences so that the
# next LSTM layer receives a sequence as input.
model = Sequential([
    # Layer 1
    LSTM(
        units=number_units,
        return_sequences=True,
        input_shape=(X_train.shape[1], 1),
    ),
    Dropout(dropout_fraction),
    # Layer 2
    LSTM(units=number_units, return_sequences=True),
    Dropout(dropout_fraction),
    # Layer 3
    LSTM(units=number_units),
    Dropout(dropout_fraction),
    # Output layer
    Dense(1, activation = "sigmoid"),
])

#### Compile the LSTM RNN Model



In [None]:
# Compile the model
# `metrics` is passed as a list: Keras documents it as a list of metrics, and
# a bare string is not accepted by every Keras version.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Show the model summary
model.summary()

### Training the Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Callback handed to model.fit(): stop when the monitored value has not
# improved (mode='max', i.e. higher is better) for 10 epochs, and restore the
# weights of the best epoch.
# NOTE(review): 'accuracy' is presumably the training-set accuracy, and the
# fit() call passes no validation data, so this stops on training performance
# and cannot detect overfitting. Consider adding a validation split and
# monitoring a validation metric -- both cells must be changed together.
early_stopping = EarlyStopping(monitor='accuracy',
                               patience=10,
                               mode='max',
                               restore_best_weights=True)

In [None]:
# Train the model
model.fit(X_train ,y_train, epochs=100, shuffle=False, batch_size=16, verbose=1,callbacks=[early_stopping])

----

## Model Performance


In [None]:
# Evaluate the model
model.evaluate(X_test, y_test, verbose=0)

### Making Predictions


In [None]:
# Make predictions using the testing data X_test
predicted = model.predict(X_test)


---

### Create DataFrame for Predicted vs. Actual Direction

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score
from IPython.display import display

# Score the test set once. The model output does not depend on the decision
# threshold, so predicting inside the loop only repeated the same work.
test_probabilities = model.predict(X_test).ravel()

# Sweep a range of decision thresholds and show the metrics for each one.
# After the loop, `preds` holds the labels for the LAST threshold in the list.
THRESHOLD = [0.4,0.5,0.52 ,0.55, 0.57, 0.58 ,0.6,0.7,0.8]
for i in THRESHOLD:
    # Label 1 when the predicted value exceeds the current threshold
    preds = np.where(test_probabilities > i, 1, 0)
    print(i)
    df_thresh = pd.DataFrame(
        data=[
            accuracy_score(y_test, preds),
            recall_score(y_test, preds),
            precision_score(y_test, preds),
            roc_auc_score(y_test, preds),
        ],
        index=["accuracy", "recall", "precision", "roc_auc_score"],
        columns=["Scores"],
    )
    display(df_thresh)

In [None]:
THRESHOLD = 0.5
predicted = np.where(predicted > THRESHOLD ,1,0)

In [None]:
# Create a DataFrame of Real and Predicted values
# "Predicted" uses `predicted`, the labels obtained with the single 0.5
# threshold chosen in the previous cell. `preds` must not be used here: it is
# the loop variable of the threshold sweep and still holds the labels for the
# last threshold tried (0.8).
# The index is the last len(y_test) rows of `data`, which are the rows the
# test targets were taken from.
stocks = pd.DataFrame({
    "Actual": y_test.ravel(),
    "Predicted": predicted.ravel()
}, index = data.index[-len(y_test): ])

# Show the DataFrame's head
stocks.head()

### classification_report

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
print(classification_report(y_test,predicted))


### Plot  roc_curve and auc metrics

In [None]:
# Import the roc_curve and auc metrics from sklearn
from sklearn.metrics import roc_curve, auc

In [None]:
# Making predictions to feed the roc_curve module
train_predictions = model.predict(X_train, batch_size=1000)
test_predictions = model.predict(X_test, batch_size=1000)

In [None]:
# ROC curve for the training set, with its AUC rounded to 4 decimals
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, train_predictions)
auc_train = round(auc(fpr_train, tpr_train), 4)

# ROC curve for the testing set, with its AUC rounded to 4 decimals
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, test_predictions)
auc_test = round(auc(fpr_test, tpr_test), 4)

In [None]:
# Create a DataFrame with the fpr and tpr results
roc_df_train = pd.DataFrame({"FPR Train": fpr_train, "TPR Train": tpr_train,})

roc_df_test = pd.DataFrame({"FPR Test": fpr_test, "TPR Test": tpr_test,})

In [None]:
roc_df_train

In [None]:
# Plotting the ROC Curves
roc_df_train.hvplot(
    x="FPR Train",
    y="TPR Train",
    xlim=([-0.05, 1.05]),
    title=f"Train ROC Curve (AUC={auc_train})",
)

In [None]:
roc_df_test.hvplot(
    x="FPR Test",
    y="TPR Test",
    color="red",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve (AUC={auc_test})",
)

#### 3. Model 3......

#### 4. Model 4......

#### 5. Model 5 ......

### IV. ANALYSIS AND EVALUATION (TEAM WORK)

### V. DEPLOYING MODEL (TEAM WORK)

### VI. CONCLUSION (TEAM WORK)