In [39]:
#---------------------------- Libraries -------------------
import numpy as np
import os
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [40]:
# ------------------ Paths ------------------
DATA_PATH = "/content"
MODEL_DIR = os.path.join(DATA_PATH, "saved_models")
PLOTS_DIR = os.path.join(MODEL_DIR, "plots")

# Create the output folders; exist_ok makes re-running this cell harmless.
for folder in (MODEL_DIR, PLOTS_DIR):
    os.makedirs(folder, exist_ok=True)

# ------------------ Features ------------------
# The feature matrix and label vector are read from X.npy / y.npy under DATA_PATH.
X, y = (np.load(os.path.join(DATA_PATH, fname)) for fname in ("X.npy", "y.npy"))
print("Features Loaded:", X.shape, y.shape)

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using Device: {device.upper()}")

# ------------------ Function to Evaluate Model ------------------
def evaluate_model(model, model_name, is_torch=False):
    """Score ``model`` on the held-out split and save its diagnostics to disk.

    Reads the notebook-level ``X_test`` / ``y_test`` (and ``device`` for
    PyTorch models). Prints the accuracy, writes the classification report to
    ``MODEL_DIR/<model_name>_report.txt`` and a confusion-matrix heatmap to
    ``PLOTS_DIR/<model_name>_confusion_matrix.png``.

    Args:
        model: When ``is_torch`` is true, a module that is called directly on a
            float32 tensor and returns one score per class; otherwise any
            object exposing ``predict(X)``.
        model_name: Label used in the printed line, the plot title and the
            output file names.
        is_torch: Selects the PyTorch prediction path.

    Returns:
        The accuracy computed by ``accuracy_score`` on the test split.
    """
    if not is_torch:
        y_pred = model.predict(X_test)
    else:
        # The module is switched to eval mode and is not switched back.
        model.eval()
        test_inputs = torch.tensor(X_test, dtype=torch.float32).to(device)
        with torch.no_grad():
            logits = model(test_inputs)
            # Predicted class = index of the largest score in each row.
            y_pred = torch.argmax(logits, dim=1).cpu().numpy()

    acc = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {acc:.4f}")

    # Per-class precision / recall / F1 as plain text
    report_path = os.path.join(MODEL_DIR, f"{model_name}_report.txt")
    with open(report_path, "w") as report_file:
        report_file.write(classification_report(y_test, y_pred))

    # Confusion-matrix heatmap; the figure is closed after saving so repeated
    # calls do not accumulate open figures.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    fig.savefig(os.path.join(PLOTS_DIR, f"{model_name}_confusion_matrix.png"))
    plt.close(fig)

    return acc

Features Loaded: (4071, 24341) (4071,)
Using Device: CUDA


In [42]:
# ------------------ PyTorch MLP Model ------------------
class MLP(nn.Module):
    """Fully connected classifier: input_dim -> 384 -> 192 -> 96 -> 48 -> 2.

    ReLU follows each of the first four linear layers; dropout (p=0.2) is
    applied after the first two activations only. The output is the raw
    two-class logits, with no softmax.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in forward order; the attribute names are the
        # keys used in the saved state_dict.
        self.fc1 = nn.Linear(input_dim, 384)
        self.fc2 = nn.Linear(384, 192)
        self.fc3 = nn.Linear(192, 96)
        self.fc4 = nn.Linear(96, 48)
        self.fc5 = nn.Linear(48, 2)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # Two wide stages with dropout ...
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        # ... then two narrower stages without it.
        hidden = self.relu(self.fc3(hidden))
        hidden = self.relu(self.fc4(hidden))
        # Raw logits are returned because CrossEntropyLoss expects unnormalised scores.
        return self.fc5(hidden)


# Convert data to PyTorch tensors: float32 features, int64 class indices
# (the target dtype CrossEntropyLoss expects).
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# Move data to device
X_train_torch, y_train_torch = X_train_torch.to(device), y_train_torch.to(device)
# NOTE(review): the test tensors are not read in this cell (evaluate_model builds
# its own tensor from X_test); they are kept in case other cells rely on them.
X_test_torch, y_test_torch = X_test_torch.to(device), y_test_torch.to(device)

# Create DataLoader for mini-batch training
batch_size = 256
train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Seed PyTorch's RNGs before the model is built so weight initialisation,
# batch shuffling and dropout draw from a fixed seed on every run
# (same seed value as the train/test split).
torch.manual_seed(42)

# Initialize MLP
mlp_model = MLP(X_train.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

# Train MLP with mini-batch gradient descent, or reuse a previously saved model
mlp_path = os.path.join(MODEL_DIR, "MLP.pth")

if os.path.exists(mlp_path):
    print("MLP Model found, loading...")
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved from a CUDA model still loads on a CPU-only runtime.
    mlp_model.load_state_dict(torch.load(mlp_path, map_location=device))
else:
    print("Training PyTorch MLP...")
    mlp_model.train()  # Set to training mode (enables dropout)

    epochs = 400
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = mlp_model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Report the mean per-batch loss on epochs 1, 51, 101, ...
        if epoch % 50 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

    # Save only the weights (state_dict), not the whole module object
    torch.save(mlp_model.state_dict(), mlp_path)
    print("MLP Model saved.")

# Evaluate MLP (also writes the report and confusion-matrix plot)
evaluate_model(mlp_model, "MLP", is_torch=True)

Training PyTorch MLP...
Epoch 1/400, Loss: 0.9520
Epoch 51/400, Loss: 0.1076
Epoch 101/400, Loss: 0.0459
Epoch 151/400, Loss: 0.0637
Epoch 201/400, Loss: 0.0511
Epoch 251/400, Loss: 0.0333
Epoch 301/400, Loss: 0.0362
Epoch 351/400, Loss: 0.0323
MLP Model saved.
MLP Accuracy: 0.9325


0.9325153374233128

In [43]:
# ------------------ XGBoost: train once, reuse the saved model afterwards ------------------
xgb_path = os.path.join(MODEL_DIR, "XGBoost.pkl")

if not os.path.exists(xgb_path):
    print("Training XGBoost Model...")

    xgb_model = xgb.XGBClassifier(
        n_estimators=50,        # number of boosting rounds
        max_depth=6,            # maximum depth of each tree
        learning_rate=0.05,     # shrinkage applied to each round's contribution
        tree_method="hist",     # histogram-based split finding
        # Train on the GPU only when the notebook-level device is exactly "cuda".
        device="cpu" if device != "cuda" else "cuda",
        eval_metric="logloss",
    )
    xgb_model.fit(X_train, y_train)
    # Persist the fitted estimator so later runs of this cell skip training.
    joblib.dump(xgb_model, xgb_path, compress=3)
    print("XGBoost Model saved.")
else:
    print("XGBoost Model found, loading...")
    xgb_model = joblib.load(xgb_path)

# Score on the held-out split (also writes the report and confusion-matrix plot)
evaluate_model(xgb_model, "XGBoost")

Training XGBoost Model...
XGBoost Model saved.
XGBoost Accuracy: 0.9264


0.9263803680981595

In [44]:
# ------------------ Train & Save Random Forest ------------------
rf_path = os.path.join(MODEL_DIR, "RandomForest.pkl")

if os.path.exists(rf_path):
    print("Random Forest Model found, loading...")
    rf_model = joblib.load(rf_path)
else:
    print("Training Random Forest Model...")
    # warm_start is deliberately left at its default (False): it does not speed
    # up a single fit, and when enabled a later fit() on this estimator reuses
    # the existing trees instead of retraining them.
    rf_model = RandomForestClassifier(
        n_estimators=50,       # number of trees in the forest
        max_depth=6,           # cap on tree depth (keeps trees small and training quick)
        min_samples_split=5,   # a node needs at least 5 samples to be split further
        bootstrap=False,       # every tree is fit on the full training set (no resampling)
        n_jobs=-1,             # use all available CPU cores
        verbose=1,             # print joblib progress lines during fit and predict
        random_state=42        # seed the per-split feature subsampling for reproducible forests
    )
    rf_model.fit(X_train, y_train)
    # Persist the fitted estimator so later runs of this cell skip training.
    joblib.dump(rf_model, rf_path, compress=3)
    print("Random Forest Model saved.")

# Evaluate Random Forest (also writes the report and confusion-matrix plot)
evaluate_model(rf_model, "RandomForest")

Training Random Forest Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   16.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished


Random Forest Model saved.
RandomForest Accuracy: 0.9006


0.9006134969325154

In [45]:
# ------------------ Linear SVM: train once, reuse the saved model afterwards ------------------
svm_path = os.path.join(MODEL_DIR, "SVM.pkl")

if not os.path.exists(svm_path):
    print("Training SVM Model...")
    # LinearSVC is used rather than a kernel SVM because the feature space is
    # high-dimensional (24,341 features).
    # NOTE(review): scikit-learn recommends dual=False when n_samples > n_features,
    # which is not the case for this data set - confirm the primal solver is intended.
    svm_model = LinearSVC(
        dual=False,      # solve the primal optimisation problem
        max_iter=5000,   # upper bound on solver iterations
        tol=1e-3,        # stopping tolerance
        C=0.5,           # regularisation parameter (smaller C = stronger regularisation)
    )

    svm_model.fit(X_train, y_train)
    # Persist the fitted estimator so later runs of this cell skip training.
    joblib.dump(svm_model, svm_path, compress=3)
    print("SVM Model saved.")
else:
    print("SVM Model found, loading...")
    svm_model = joblib.load(svm_path)

# Score on the held-out split (also writes the report and confusion-matrix plot)
evaluate_model(svm_model, "SVM")

Training SVM Model...
SVM Model saved.
SVM Accuracy: 0.9387


0.9386503067484663

In [46]:
# ------------------ Side-by-side accuracy comparison ------------------
models = {"SVM": svm_model, "MLP": mlp_model, "XGBoost": xgb_model, "RandomForest": rf_model}

# Re-score every model; only the MLP takes the PyTorch prediction path.
accuracies = {
    name: evaluate_model(model, name, is_torch=(name == "MLP"))
    for name, model in models.items()
}

# Persist one "<name>: <accuracy>" line per model
accuracy_log_path = os.path.join(MODEL_DIR, "accuracy_log.txt")
with open(accuracy_log_path, "w") as log_file:
    log_file.writelines(f"{name}: {acc:.4f}\n" for name, acc in accuracies.items())

# The winner is the key with the highest accuracy (first one wins on ties)
best_model_name = max(accuracies, key=lambda name: accuracies[name])

print("\nModel Comparison:")
for name, acc in accuracies.items():
    print(f"{name}: {acc:.4f}")

print("\nBest Model:", best_model_name)

SVM Accuracy: 0.9387
MLP Accuracy: 0.9325
XGBoost Accuracy: 0.9264


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:    0.0s finished


RandomForest Accuracy: 0.9006

Model Comparison:
SVM: 0.9387
MLP: 0.9325
XGBoost: 0.9264
RandomForest: 0.9006

Best Model: SVM


In [47]:
from google.colab import files
import os

# Collect the artifacts to fetch: saved models and text reports from MODEL_DIR,
# PNG plots from PLOTS_DIR.
model_files = [
    os.path.join(MODEL_DIR, entry)
    for entry in os.listdir(MODEL_DIR)
    if entry.endswith((".pkl", ".pth", ".txt"))
]
plot_files = [
    os.path.join(PLOTS_DIR, entry)
    for entry in os.listdir(PLOTS_DIR)
    if entry.endswith(".png")
]

# Trigger one browser download per file: models and reports first, then plots.
for file in model_files + plot_files:
    print(f"Downloading {file} ...")
    files.download(file)

Downloading /content/saved_models/XGBoost_report.txt ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/accuracy_log.txt ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/SVM.pkl ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/MLP.pth ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/MLP_report.txt ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/XGBoost.pkl ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/RandomForest.pkl ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/SVM_report.txt ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading /content/saved_models/RandomForest_report.txt ...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
# import shutil
# folder_to_delete = "/content/download_models"
# shutil.rmtree(folder_to_delete, ignore_errors=True)  # Deletes the folder and its contents
# print(f"{folder_to_delete} Folder deleted successfully.")