In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from keras.callbacks import EarlyStopping

# Assuming the DataFrame `df` is already loaded with your data
# Example:
# df = pd.read_csv('your_data.csv')  # Uncomment and replace with your actual data loading method

# Handle missing data
print("Before Dropna:", df.shape)
df.fillna(method="bfill", inplace=True)  # Backward fill missing values
df.fillna(method="ffill", inplace=True)  # Forward fill missing values
print("After Fillna:", df.shape)

# Define features and target
X = df[['Open', 'High', 'Low', 'Volume']]  # Use only relevant features
y = df['Close']  # Target variable (closing price)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ------------------- Data Scaling for LSTM -------------------
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape for LSTM
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# ------------------- Train Models -------------------

# 1. Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
r2_rf = r2_score(y_test, rf_pred)
mae_rf = mean_absolute_error(y_test, rf_pred)
mse_rf = mean_squared_error(y_test, rf_pred)
rmse_rf = np.sqrt(mse_rf)

# 2. XGBoost
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
r2_xgb = r2_score(y_test, xgb_pred)
mae_xgb = mean_absolute_error(y_test, xgb_pred)
mse_xgb = mean_squared_error(y_test, xgb_pred)
rmse_xgb = np.sqrt(mse_xgb)

# 3. LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
r2_lgb = r2_score(y_test, lgb_pred)
mae_lgb = mean_absolute_error(y_test, lgb_pred)
mse_lgb = mean_squared_error(y_test, lgb_pred)
rmse_lgb = np.sqrt(mse_lgb)

# ------------------- Train LSTM -------------------

# Define the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
lstm_model.add(LSTM(50, return_sequences=False))
lstm_model.add(Dense(1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Fit LSTM model
lstm_model.fit(X_train_lstm, y_train, epochs=100, batch_size=32, verbose=1, callbacks=[early_stopping])

# Predict with LSTM
lstm_pred = lstm_model.predict(X_test_lstm)
r2_lstm = r2_score(y_test, lstm_pred)
mae_lstm = mean_absolute_error(y_test, lstm_pred)
mse_lstm = mean_squared_error(y_test, lstm_pred)
rmse_lstm = np.sqrt(mse_lstm)

# ------------------- Performance Comparison -------------------

# Create a DataFrame to compare model performances
results_df = pd.DataFrame([{
    "Model": "Random Forest", 
    "R² Score": r2_rf, 
    "MAE": mae_rf, 
    "MSE": mse_rf, 
    "RMSE": rmse_rf
}, {
    "Model": "XGBoost", 
    "R² Score": r2_xgb, 
    "MAE": mae_xgb, 
    "MSE": mse_xgb, 
    "RMSE": rmse_xgb
}, {
    "Model": "LightGBM", 
    "R² Score": r2_lgb, 
    "MAE": mae_lgb, 
    "MSE": mse_lgb, 
    "RMSE": rmse_lgb
}])

# Add LSTM results using pd.concat
results_df = pd.concat([results_df, pd.DataFrame([{
    "Model": "LSTM", 
    "R² Score": r2_lstm, 
    "MAE": mae_lstm, 
    "MSE": mse_lstm, 
    "RMSE": rmse_lstm
}])], ignore_index=True)

print("\n🔹 Final Model Performance Comparison:\n", results_df)

# ------------------- Compare Actual vs Predicted -------------------

# Create a DataFrame to show actual vs predicted for all models
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'RF_Prediction': rf_pred,
    'XGB_Prediction': xgb_pred,
    'LGB_Prediction': lgb_pred,
    'LSTM_Prediction': lstm_pred.flatten()  # Flatten LSTM predictions to match the shape
})

print("\n🔹 Actual vs Predicted Comparison:\n", comparison_df.head())  # Display the first few rows of the comparison


Before Dropna: (3154, 12)
After Fillna: (3154, 12)


  df.fillna(method="bfill", inplace=True)  # Backward fill missing values
  df.fillna(method="ffill", inplace=True)  # Forward fill missing values


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 2523, number of used features: 4
[LightGBM] [Info] Start training from score 147.703985
Epoch 1/100


  super().__init__(**kwargs)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 29277.9453
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 25832.1484
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 23848.5059
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 22760.6543
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 21673.9375
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 21084.6172
Epoch 7/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 19973.2266
Epoch 8/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 18239.6094
Epoch 9/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 18726.1016
Epoch 10/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m