# Model Comparison
In this notebook, we retrain the random forest classifier and LSTM model for comparison.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


### Prepare data and split into train and test

In [2]:
# Load the data
data = pd.read_csv('combined_feature_vectors.csv')

# Prepare features and labels
X = data.drop('Label', axis=1)
y = data['Label']

# Scale the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# For LSTM, reshape data to [samples, time steps, features]
X_scaled_lstm = X_scaled.reshape(X_scaled.shape[0], 1, X_scaled.shape[1])

# Split the data into training and testing sets
X_train_rf, X_test_rf, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train_lstm, X_test_lstm = X_train_rf.reshape(X_train_rf.shape[0], 1, X_train_rf.shape[1]), X_test_rf.reshape(X_test_rf.shape[0], 1, X_test_rf.shape[1])

### Train Random Forest and LSTM Models

In [3]:
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Random Forest Model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_rf, y_train)

# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',   # Monitor the validation loss
    min_delta=0.001,      # Minimum change to qualify as an improvement
    patience=10,          # Stop after 10 epochs without improvement
    restore_best_weights=True  # Restore the best weights found during training
)

# LSTM Model setup
model = Sequential([
    LSTM(50, activation='relu', input_shape=(1, X_train_lstm.shape[2])),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train, epochs=30, batch_size=32, validation_split=0.1, callbacks=[early_stopping])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30

### Predict Probabilities and Compute AUC/ROC

In [None]:
from sklearn.metrics import roc_auc_score

# Predict probabilities
rf_probs = rf_classifier.predict_proba(X_test_rf)[:, 1]
lstm_probs = model.predict(X_test_lstm).flatten()  # Flatten is used as model.predict returns a 2D array

# Calculate ROC curve and AUC
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
rf_auc = auc(rf_fpr, rf_tpr)
lstm_fpr, lstm_tpr, _ = roc_curve(y_test, lstm_probs)
lstm_auc = auc(lstm_fpr, lstm_tpr)

# Display AUC
print("Random Forest AUC: {:.3f}".format(rf_auc))
print("LSTM AUC: {:.3f}".format(lstm_auc))


Random Forest AUC: 1.000
LSTM AUC: 0.989


### Plot ROC Curves for Comparison

In [None]:
plt.figure(figsize=(10, 8))
plt.plot(rf_fpr, rf_tpr, label='Random Forest (AUC = {:.3f})'.format(rf_auc))
plt.plot(lstm_fpr, lstm_tpr, label='LSTM (AUC = {:.3f})'.format(lstm_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc="lower right")
plt.show()
