In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM 
from tensorflow.keras.callbacks import EarlyStopping

# Step 2: Load and Prepare Data
# Read the three training CSVs in one pass; each path is a hard-coded
# absolute location on the author's machine.
data_1, data_2, data_3 = [
  pd.read_csv(csv_path)
  for csv_path in (
    r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_1.csv',
    r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_2.csv',
    r'C:\Users\bravo\OneDrive\OneDrive Files\Desktop\train_set_3.csv',
  )
]

# Step 3: Generate Features for Financial Time Series Data
def generate_features(data, lag=5):
  """Return a copy of ``data`` with rolling, lagged and target columns added.

  Args:
    data: DataFrame with a numeric ``'value'`` column, one row per time step.
    lag: number of lagged copies of ``'value'`` to add (``Lag_1`` .. ``Lag_<lag>``).

  Returns:
    A new DataFrame holding the original columns plus ``SMA_5``, ``SMA_20``,
    ``Lag_1`` .. ``Lag_<lag>``, ``Rolling_STD_5``, ``Rolling_STD_20`` and
    ``ROC``. The input frame is not modified. Rolling and lag columns are NaN
    on the leading rows where the window or lag is not yet filled.
  """
  # Work on a copy so the caller's frame is never mutated as a side effect.
  out = data.copy()
  value = out['value']

  # Simple moving averages over 5 and 20 rows.
  out['SMA_5'] = value.rolling(window=5).mean()
  out['SMA_20'] = value.rolling(window=20).mean()

  # Past values: Lag_i at row t is the value at row t - i.
  for i in range(1, lag + 1):
    out[f'Lag_{i}'] = value.shift(i)

  # Rolling standard deviations over the same two windows.
  out['Rolling_STD_5'] = value.rolling(window=5).std()
  out['Rolling_STD_20'] = value.rolling(window=20).std()

  # Target: 1 when the NEXT row's value is higher than this row's, else 0.
  # diff() gives value[t] - value[t-1]; shift(-1) moves it back one row so
  # row t holds value[t+1] - value[t].
  # NOTE(review): the last row has no next value; its NaN compares False and
  # is therefore labelled 0 rather than being excluded.
  roc_period = 1
  out['ROC'] = (value.diff(roc_period).shift(-1) > 0).astype(int)

  return out

# Add the engineered columns to each of the three datasets, in order.
data_1, data_2, data_3 = map(generate_features, (data_1, data_2, data_3))

# Step 4: Prepare Features and Labels for all Datasets  
def prepare_data(data, lag=5):
  """Extract the feature matrix and target from an engineered DataFrame.

  Args:
    data: DataFrame containing ``SMA_5``, ``SMA_20``, ``Rolling_STD_5``,
      ``Rolling_STD_20``, ``Lag_1`` .. ``Lag_<lag>`` and ``ROC`` columns.
    lag: number of ``Lag_i`` columns to use as features.

  Returns:
    ``(X, y)`` where ``X`` is the DataFrame of feature columns and ``y`` is
    the ``ROC`` Series, both restricted to rows with no missing value in any
    of those columns.

  Raises:
    KeyError: if one of the required columns is absent from ``data``.
  """
  feature_cols = ['SMA_5', 'SMA_20', 'Rolling_STD_5', 'Rolling_STD_20']
  feature_cols += [f'Lag_{i}' for i in range(1, lag + 1)]

  # Only drop rows that are incomplete in columns the model actually uses;
  # NaNs in unrelated columns must not shrink the training data.
  clean = data.dropna(subset=feature_cols + ['ROC'])

  X = clean[feature_cols]
  y = clean['ROC']

  return X, y

# Build the (features, target) pair for each dataset, in order.
(X_1, y_1), (X_2, y_2), (X_3, y_3) = map(prepare_data, (data_1, data_2, data_3))

# Step 5: Split Data into Training and Test Sets for all Datasets
def split_data(X, y):
  """Split features and labels 80/20 without shuffling.

  Row order is preserved, so the leading 80% of rows form the training set
  and the trailing 20% form the test set.

  Returns:
    The tuple ``(X_train, X_test, y_train, y_test)``.
  """
  pieces = train_test_split(X, y, shuffle=False, test_size=0.2)
  return tuple(pieces)

# Step 6: Train and Evaluate LSTM Models for all Datasets  
def train_and_evaluate_model(X_train, y_train, X_test, y_test, model, callbacks=None):
  """Fit ``model`` on the training split and print test-set metrics.

  Args:
    X_train, y_train: training features and binary labels given to ``model.fit``.
    X_test, y_test: held-out features and labels used for the printed report.
    model: compiled Keras model; its ``predict`` output is thresholded at 0.5.
    callbacks: optional list of Keras callbacks for ``fit``. When omitted, a
      fresh ``EarlyStopping(patience=10, restore_best_weights=True)`` is
      created, so the function does not rely on a module-level
      ``early_stopping`` variable existing at call time.

  Returns:
    None. Accuracy and the classification report are printed.
  """
  if callbacks is None:
    callbacks = [EarlyStopping(patience=10, restore_best_weights=True)]

  # Keras carves the validation set from the tail of the training data
  # (validation_split=0.2), which is what early stopping monitors.
  model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2,
            callbacks=callbacks, verbose=0)

  # Convert predicted probabilities to hard 0/1 labels; flatten the (n, 1)
  # prediction array so it lines up with the 1-D label vector.
  y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

  accuracy = accuracy_score(y_test, y_pred)

  print("Test Accuracy:", accuracy)

  print("Classification Report:")
  # zero_division=1 suppresses the warning raised when a class is never predicted.
  print(classification_report(y_test, y_pred, zero_division=1))
  
def _build_lstm_classifier(n_features):
  """Return a compiled LSTM binary classifier for ``n_features`` inputs.

  The network declares an input of shape ``(n_features, 1)``, feeds it to a
  32-unit LSTM layer, and ends in a single sigmoid unit. It is compiled with
  binary cross-entropy loss, the Adam optimizer and an accuracy metric.
  """
  net = Sequential([
    LSTM(32, input_shape=(n_features, 1)),
    Dense(1, activation='sigmoid'),
  ])
  net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return net


# ---- Dataset 1 ----
print("Evaluation for Dataset 1:")
X_train_1, X_test_1, y_train_1, y_test_1 = split_data(X_1, y_1)
model_1 = _build_lstm_classifier(X_train_1.shape[1])
# Fresh early-stopping callback for this run (patience 10, best weights restored).
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
train_and_evaluate_model(X_train_1, y_train_1, X_test_1, y_test_1, model_1)

# ---- Dataset 2 ----
print("Evaluation for Dataset 2:")
X_train_2, X_test_2, y_train_2, y_test_2 = split_data(X_2, y_2)
model_2 = _build_lstm_classifier(X_train_2.shape[1])
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
train_and_evaluate_model(X_train_2, y_train_2, X_test_2, y_test_2, model_2)

# ---- Dataset 3 ----
print("Evaluation for Dataset 3:")
X_train_3, X_test_3, y_train_3, y_test_3 = split_data(X_3, y_3)
model_3 = _build_lstm_classifier(X_train_3.shape[1])
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
train_and_evaluate_model(X_train_3, y_train_3, X_test_3, y_test_3, model_3)




Evaluation for Dataset 1:
Test Accuracy: 0.49537152864648487
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.45      0.47      2017
           1       0.49      0.54      0.52      1980

    accuracy                           0.50      3997
   macro avg       0.50      0.50      0.49      3997
weighted avg       0.50      0.50      0.49      3997

Evaluation for Dataset 2:
Test Accuracy: 0.5023767825869402
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.06      0.11      2023
           1       0.50      0.96      0.65      1974

    accuracy                           0.50      3997
   macro avg       0.54      0.51      0.38      3997
weighted avg       0.54      0.50      0.38      3997

Evaluation for Dataset 3:
Test Accuracy: 0.5061295971978984
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.03      0.