## All

In [1]:
# Step 1: Import necessary libraries
'''
    Pandas: for loading and manipulating the CSV datasets as dataframes
    Numpy: for numerical operations on arrays/matrices
    Sklearn metrics: for evaluating model performance (accuracy, classification report etc)
    Sklearn preprocessing: for data standardization
    Sklearn model selection: for splitting data into train/test sets
    Keras: for building and training neural network models
    Keras callbacks: for early stopping to prevent overfitting
'''
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Step 2: Load and Prepare Data
# Each training CSV is read into its own DataFrame; the three frames are bound
# in one unpacking assignment, in the same order as the paths below.
data_1, data_2, data_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_1.csv',
        r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_2.csv',
        r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_3.csv',
    )
)

# Step 3: Generate Features for Financial Time Series Data
def generate_features(data, lag=5, roc_period=5):
    """Return a copy of ``data`` with trend, lag, volatility and label columns added.

    The input frame is left untouched, so the function can be re-run on the
    same raw data without accumulating side effects.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'value' column holding the time series.
    lag : int, default 5
        Number of lagged copies of 'value' to add (columns 'Lag_1' .. 'Lag_<lag>').
    roc_period : int, default 5
        Number of rows over which the change used for the 'ROC' label is taken.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns appended, in this order: 'SMA_5',
        'SMA_20', 'Lag_1' .. 'Lag_<lag>', 'Rolling_STD_5', 'Rolling_STD_20', 'ROC'.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    features = data.copy()
    series = features['value']

    # Rolling means over 5 and 20 rows (NaN until the window is full).
    features['SMA_5'] = series.rolling(window=5).mean()
    features['SMA_20'] = series.rolling(window=20).mean()

    # Lag_i is the value i rows earlier.
    for i in range(1, lag + 1):
        features[f'Lag_{i}'] = series.shift(i)

    # Rolling standard deviations over the same two windows.
    features['Rolling_STD_5'] = series.rolling(window=5).std()
    features['Rolling_STD_20'] = series.rolling(window=20).std()

    # Label: 1 when the NEXT row's value is above the value `roc_period` rows
    # before that next row, else 0 (shift(-1) moves the change one row back).
    # NOTE(review): where the change is undefined (start of the series and the
    # final row) the NaN comparison is False, so those rows get label 0 rather
    # than being marked missing - confirm this is acceptable for the last row.
    features['ROC'] = (series.diff(roc_period).shift(-1) > 0).astype(int)

    return features

# Add the engineered feature and label columns to each of the three datasets.
data_1, data_2, data_3 = map(generate_features, (data_1, data_2, data_3))

# Step 4: Prepare Features and Labels for all Datasets
def prepare_data(data, lag=5):
    """Select the model inputs and the label from a feature-enriched frame.

    Rows are dropped only when one of the columns actually used by the model
    (the features below or 'ROC') is missing; NaNs in unrelated columns no
    longer discard otherwise usable rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'SMA_5', 'SMA_20', 'Rolling_STD_5', 'Rolling_STD_20',
        'Lag_1' .. 'Lag_<lag>' and 'ROC'.
    lag : int, default 5
        Number of 'Lag_i' columns to include as features.

    Returns
    -------
    (X, y) : tuple
        X is the DataFrame of feature columns, y is the 'ROC' Series, both
        restricted to the rows that have no missing value in those columns.
    """
    feature_cols = ['SMA_5', 'SMA_20', 'Rolling_STD_5', 'Rolling_STD_20'] + [f'Lag_{i}' for i in range(1, lag + 1)]

    # Only the columns that feed the model decide whether a row is usable.
    complete = data.dropna(subset=feature_cols + ['ROC'])

    X = complete[feature_cols]
    y = complete['ROC']

    return X, y

# Build the (features, labels) pair for each dataset.
(X_1, y_1), (X_2, y_2), (X_3, y_3) = map(prepare_data, (data_1, data_2, data_3))

# Step 5: Split Data into Training and Test Sets for all Datasets
def split_data(X, y):
    """Split features and labels into train and test parts without shuffling.

    20% of the rows are held out as the test set; because shuffle=False the
    original row order is kept on both sides of the split.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    return tuple(train_test_split(X, y, shuffle=False, test_size=0.2))

# Step 6: Train and Evaluate Models for all Datasets
def train_and_evaluate_model(X_train, y_train, X_test, y_test, model, callbacks=None):
    """Fit ``model`` on the training data and print its test-set metrics.

    Parameters
    ----------
    X_train : feature matrix of the training dataset
    y_train : binary labels for the training dataset
    X_test : feature matrix of the testing dataset
    y_test : binary labels for the testing dataset
    model : compiled Keras model to be trained (trained in place)
    callbacks : list of Keras callbacks, optional
        Callbacks passed to ``model.fit``. When omitted, a fresh
        ``EarlyStopping(patience=10, restore_best_weights=True)`` is created
        for this call, so the function no longer depends on a module-level
        ``early_stopping`` variable having been defined beforehand.

    Returns
    -------
    None
        The test accuracy and the classification report are printed.
    """
    if callbacks is None:
        # One callback instance per training run; nothing is shared between models.
        callbacks = [EarlyStopping(patience=10, restore_best_weights=True)]

    # 20% of the training rows are held out by Keras as validation data.
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=callbacks, verbose=0)

    # Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) array to
    # shape (n,) so it lines up with the 1-D label vector.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Each dataset gets its own split, its own freshly built network
# (one 32-unit ReLU hidden layer, one sigmoid output) and its own
# early-stopping callback before training and evaluation.

print("Evaluation for Dataset 1:")
X_train_1, X_test_1, y_train_1, y_test_1 = split_data(X_1, y_1)
model_1 = Sequential([
    Dense(32, input_dim=X_train_1.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_1, y_train_1, X_test_1, y_test_1, model_1)

print("Evaluation for Dataset 2:")
X_train_2, X_test_2, y_train_2, y_test_2 = split_data(X_2, y_2)
model_2 = Sequential([
    Dense(32, input_dim=X_train_2.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_2, y_train_2, X_test_2, y_test_2, model_2)

print("Evaluation for Dataset 3:")
X_train_3, X_test_3, y_train_3, y_test_3 = split_data(X_3, y_3)
model_3 = Sequential([
    Dense(32, input_dim=X_train_3.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_3, y_train_3, X_test_3, y_test_3, model_3)


ModuleNotFoundError: No module named 'sklearn'

## Background/Key Points 📃

### Simple moving average (SMA):
- Takes the average of the time series over a specified window
- Helps smooth out noise and reveal underlying trends
- E.g. 5-period SMA shows short term trend, 20-period shows longer term

### Lagging features:
- Shifting the time series values backwards in time
- Provides historical context for each point
- E.g. Lag_1 is the value from 1 time period ago

### Rolling standard deviation:
- Measures volatility by taking standard deviation over a window
- Higher values indicate more volatility/fluctuation

### Rate of change (ROC):
- Binary indicator showing if value increased or decreased
- Compares a value to the value 'n' periods earlier (in the code the result is shifted by one step, so each row's label describes the next period)
- Used as label for binary classification models

This is a binary classification model because the target variable, ROC, only takes on two values (0 or 1). The goal is to predict if the time series is going up or down.

Binary classification models predict one of two outcomes, like yes/no, true/false, spam/not-spam etc.

### Some common binary classification algorithms are: 
- Logistic regression 
- SVM (Support Vector Machine)
- Neural networks
- Naive Bayes
- Decision trees

The code uses a neural network for binary classification because they can capture complex relationships between the input features like SMA, volatility etc. and the uptrend/downtrend prediction. The sigmoid output activation squashes predictions between 0 and 1 for probability-like outputs.

In summary, the engineered features help uncover patterns that are fed into a neural network binary classifier to predict if the financial time series is going up or down.

### Neural Networks:
A neural network is a type of machine learning model loosely inspired by biological neurons in the brain. The goal is to approximate complex mathematical functions that map input features to output values.
Some key aspects of how they work:
- Made up of layers of "neurons" or nodes
- Input layer receives the feature data
- Hidden layers apply transformations and enable learning of complex patterns
- Output layer makes predictions
- Nodes are densely connected across layers
- Connections have weights that are learned during training
- Data flows through the network in a forward pass
- Predictions are made based on current weights
- Loss is calculated against true labels
- Weights are adjusted through backpropagation to reduce loss
- Learning involves finding the right weight values that minimize loss

In this code, a simple neural net with 1 hidden layer is defined:
- Input layer size determined by the number of time series features
- 1 hidden layer with 32 nodes to learn complex patterns
- Output layer is a single sigmoid node for binary classification

During training:
- Data flows through, predictions made
- Loss calculated against true uptrend/downtrend labels
- Backprop adjusts weights to reduce loss
- Process repeats for multiple epochs

After training, the learned model can predict on new data. The neural network essentially learns complex mathematical functions that map the input time series features to the likelihood of an increasing or decreasing trend.

In summary, neural networks learn to unpack complex relationships between the inputs and outputs based on backpropagating loss, making them very versatile and powerful models.

#### Neural Network Implementation:
- Keras is used to define and train the neural network model in the code.
- A Sequential model defines a linear stack of layers.
- The first layer is a Dense hidden layer that receives the input features. It has 32 nodes and a ReLU activation function.
- The input dimension is set to the number of features in X_train, so it matches the feature data.
- The second layer is a single output node with a sigmoid activation. This squashes outputs between 0 and 1.
- The loss function used is binary cross-entropy, appropriate for binary classification.
- The adam optimizer helps adjust weights during backpropagation.
- Metrics like accuracy can be monitored during training.

#### Neural Network Training:
   
- The network is trained for up to 50 epochs with a batch size of 32.
- A validation split of 0.2 is used to monitor overfitting.
- Early stopping callbacks prevent overfitting by stopping if validation loss doesn't improve after 10 epochs.
- Verbose=0 suppresses logging output for cleaner results.
- The fit() method trains the model by propagating data through, calculating loss with respect to labels, and updating weights via backpropagation.

#### Making Predictions:
- After training, the model can make predictions on new data.
- The predict() method runs a forward pass, returning the probability outputs.
- A threshold of 0.5 is applied to make binary predictions.

So in summary, Keras provides a high-level API to define, train and use neural network models seamlessly in Python. The code leverages this to effectively build and apply a neural network for time series classification.

### Long short-term memory (LSTM):
- LSTM is a type of neural network model that is well-suited for processing sequential data like
time series or text data.
- It is a special kind of recurrent neural network (RNN). RNNs have looping connections that
allow information to persist across time steps.
- The problem with basic RNNs is that they can struggle with long-term dependencies in
sequences. This is where LSTMs help.
- LSTMs have a more complex structure than basic RNNs, with special units called memory
cells.
- These memory cells can store information for long periods of time. They have gates that
control when to store, use, or forget information.
- This gives LSTMs the ability to learn long-term dependencies in sequence data that basic
RNNs struggle with.
- So in summary, LSTM is a type of RNN that uses special memory cells to store and access
information over long sequences, allowing it to model temporal data effectively.

### RNN
- An RNN is a type of neural network designed to process sequential data, like time series or
text.
- It has a recurrent layer that maintains a state over time.
- This state captures information about previous inputs seen by the network.
- So the RNN "remembers" prior inputs as it processes the next input in a sequence.
- This memory of previous context is what gives RNNs their unique capabilities for sequence
modeling.

For example, in a time series forecasting task:
- The RNN takes in a sequence of data points as input one at a time.
- As it sees each input, the state of the recurrent layer gets updated based on current input and
previous context.
- So the network retains memory of previous time steps' context.
- This contextual information gets passed forward and helps the RNN predict the next value in
the sequence.
- Without the recurrent connection, regular neural networks have no memory and treat each input
independently.
 
In summary,
- RNNs have feedback loops that give them memory over a sequence.
- This lets them learn temporal dynamic behavior, unlike regular neural networks.
- The recurrent layer state captures relevant context from prior inputs to inform predictions.

So RNN models are ideal for processing sequence data like time series, text, audio, etc. Their
memory over sequences makes them uniquely suited for such tasks.

## Data Preparation ⚙️

In [None]:
# Step 1: Import necessary libraries
'''
    Pandas: for loading and manipulating the CSV datasets as dataframes
    Numpy: for numerical operations on arrays/matrices
    Sklearn metrics: for evaluating model performance (accuracy, classification report etc)
    Sklearn preprocessing: for data standardization
    Sklearn model selection: for splitting data into train/test sets
    Keras: for building and training neural network models
    Keras callbacks: for early stopping to prevent overfitting
'''
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Step 2: Load and Prepare Data
#
# Read each CSV file from the given folder into a separate pandas DataFrame,
# making the raw time series data accessible for preprocessing.

# Backslashes are escaped explicitly: a plain "C:\Users..." literal is a
# SyntaxError ("\U" starts a unicode escape), and a raw string cannot end
# with a backslash.
FILE_PATH = "C:\\Users\\bravo\\OneDrive\\OneDrive Files\\Desktop\\"

# The folder is concatenated with each file name; previously the whole
# expression sat inside one string literal, so pandas was asked to open a
# file literally named "FILE_PATH + train_set_1.csv".
data_1 = pd.read_csv(FILE_PATH + 'train_set_1.csv')
data_2 = pd.read_csv(FILE_PATH + 'train_set_2.csv')
data_3 = pd.read_csv(FILE_PATH + 'train_set_3.csv')

## Methods (function) 🔑

In [None]:
def generate_features(data, lag=5, roc_period=5):
    """Return a copy of ``data`` with trend, lag, volatility and label columns added.

    - Simple moving averages (SMA) over 5 and 20 rows capture short and long term trends
    - Lagged values of the series (shifted backwards in time) capture historical context
    - Rolling standard deviations measure volatility
    - 'ROC' is a binary indicator showing if the value increased over a period

    The input frame is left untouched, so the cell can be re-run on the same
    raw data without accumulating side effects.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'value' column holding the time series.
    lag : int, default 5
        Number of lagged copies of 'value' to add (columns 'Lag_1' .. 'Lag_<lag>').
    roc_period : int, default 5
        Number of rows over which the change used for the 'ROC' label is taken.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns appended, in this order: 'SMA_5',
        'SMA_20', 'Lag_1' .. 'Lag_<lag>', 'Rolling_STD_5', 'Rolling_STD_20', 'ROC'.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    features = data.copy()
    series = features['value']

    # Rolling means over 5 and 20 rows (NaN until the window is full).
    features['SMA_5'] = series.rolling(window=5).mean()
    features['SMA_20'] = series.rolling(window=20).mean()

    # Lag_i is the value i rows earlier.
    for i in range(1, lag + 1):
        features[f'Lag_{i}'] = series.shift(i)

    # Rolling standard deviations over the same two windows.
    features['Rolling_STD_5'] = series.rolling(window=5).std()
    features['Rolling_STD_20'] = series.rolling(window=20).std()

    # Label: 1 when the NEXT row's value is above the value `roc_period` rows
    # before that next row, else 0 (shift(-1) moves the change one row back).
    # NOTE(review): where the change is undefined (start of the series and the
    # final row) the NaN comparison is False, so those rows get label 0 rather
    # than being marked missing - confirm this is acceptable for the last row.
    features['ROC'] = (series.diff(roc_period).shift(-1) > 0).astype(int)

    return features

In [2]:

def prepare_data(data, lag=5):
    """Split a feature-enriched frame into model inputs X and labels y.

    - X contains all the engineered features
    - y contains the binary 'ROC' (rate of change) labels

    Rows are dropped only when one of the columns actually used by the model
    (the features below or 'ROC') is missing; NaNs in unrelated columns no
    longer discard otherwise usable rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'SMA_5', 'SMA_20', 'Rolling_STD_5', 'Rolling_STD_20',
        'Lag_1' .. 'Lag_<lag>' and 'ROC'.
    lag : int, default 5
        Number of 'Lag_i' columns to include as features.

    Returns
    -------
    (X, y) : tuple
        X is the DataFrame of feature columns, y is the 'ROC' Series, both
        restricted to the rows that have no missing value in those columns.
    """
    feature_cols = ['SMA_5', 'SMA_20', 'Rolling_STD_5', 'Rolling_STD_20'] + [f'Lag_{i}' for i in range(1, lag + 1)]

    # Only the columns that feed the model decide whether a row is usable.
    complete = data.dropna(subset=feature_cols + ['ROC'])

    X = complete[feature_cols]
    y = complete['ROC']

    return X, y


In [None]:
# Step 5: Split Data into Training and Test Sets for all Datasets
def split_data(X, y):
    """Split one dataset into an 80% train part and a 20% test part.

    The train part is used to fit models and the test part for evaluation.
    Because shuffle=False the original row order is kept on both sides of
    the split.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    return tuple(train_test_split(X, y, shuffle=False, test_size=0.2))

In [None]:
# Step 6: Train and Evaluate Models for all Datasets
def train_and_evaluate_model(X_train, y_train, X_test, y_test, model, callbacks=None):
    """Fit the model using the training set, evaluate on the test set, and
    print the accuracy and classification report.

    Parameters
    ----------
    X_train : feature matrix of the training dataset
    y_train : target or label values for the training dataset
    X_test : feature matrix of the testing dataset
    y_test : target values for the testing dataset
    model : compiled Keras model to be trained (trained in place)
    callbacks : list of Keras callbacks, optional
        Callbacks passed to ``model.fit``. When omitted, a fresh
        ``EarlyStopping(patience=10, restore_best_weights=True)`` is created
        for this call, so the function no longer depends on a module-level
        ``early_stopping`` variable having been defined beforehand.

    Returns
    -------
    None

    stdout
    -------
    Ex:
    Test Accuracy: 1.0
    Classification Report:
                  precision    recall  f1-score   support

               0       1.00      1.00      1.00      2017
               1       1.00      1.00      1.00      1980

        accuracy                           1.00      3997
       macro avg       1.00      1.00      1.00      3997
    weighted avg       1.00      1.00      1.00      3997
    """
    if callbacks is None:
        # One callback instance per training run; nothing is shared between models.
        callbacks = [EarlyStopping(patience=10, restore_best_weights=True)]

    # 20% of the training rows are held out by Keras as validation data.
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=callbacks, verbose=0)

    # Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) array to
    # shape (n,) so it lines up with the 1-D label vector.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

## Implementation 💻

In [None]:
# Step 3: Generate Features for Financial Time Series Data
# The same feature pipeline is applied to each of the three datasets.
data_1, data_2, data_3 = map(generate_features, (data_1, data_2, data_3))

In [None]:
# Step 4: Prepare Features and Labels for all Datasets
# Each dataset yields one (features, labels) pair.
(X_1, y_1), (X_2, y_2), (X_3, y_3) = map(prepare_data, (data_1, data_2, data_3))

In [None]:
# Each dataset gets its own split, its own freshly built network
# (one 32-unit ReLU hidden layer, one sigmoid output) and its own
# early-stopping callback before training and evaluation.

print("Evaluation for Dataset 1:")
X_train_1, X_test_1, y_train_1, y_test_1 = split_data(X_1, y_1)
model_1 = Sequential([
    Dense(32, input_dim=X_train_1.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_1, y_train_1, X_test_1, y_test_1, model_1)

print("Evaluation for Dataset 2:")
X_train_2, X_test_2, y_train_2, y_test_2 = split_data(X_2, y_2)
model_2 = Sequential([
    Dense(32, input_dim=X_train_2.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_2, y_train_2, X_test_2, y_test_2, model_2)

print("Evaluation for Dataset 3:")
X_train_3, X_test_3, y_train_3, y_test_3 = split_data(X_3, y_3)
model_3 = Sequential([
    Dense(32, input_dim=X_train_3.shape[1], activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(restore_best_weights=True, patience=10)
train_and_evaluate_model(X_train_3, y_train_3, X_test_3, y_test_3, model_3)
