In [None]:
import time

import joblib
import numpy as np
import pandas as pd
import shap
from cuml.ensemble import RandomForestClassifier as cumlRF
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, f1_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score

# 1. LOAD DATA

In [None]:
# Read the whole dataset into memory from the project-relative parquet file.
data_path = "data/cic-collection.parquet"
df = pd.read_parquet(data_path)

# 2. FEATURE AND TARGET SETUP

In [None]:
# Features: every column except the two label columns.
X = df.drop(columns=['Label', 'ClassLabel'])

# Encode the class labels once. The same Categorical supplies both the printed
# code -> name mapping and the integer targets, so the two cannot drift apart
# and the (potentially large) label column is only factorized a single time.
categories = pd.Categorical(df['ClassLabel'])

# Map categories for interpretability
for code, category in enumerate(categories.categories):
    print(f"{code}: {category}")

# Integer targets: each code indexes into categories.categories
# (pandas encodes a missing label as -1).
y = categories.codes

# 3. TRAIN-TEST SPLIT

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 4. DEFINE AND TRAIN GPU-ACCELERATED RANDOM FOREST

In [None]:
# Build the forest: 200 trees, each capped at depth 10, with a fixed seed.
# NOTE(review): cuML may still give run-to-run variation when several build
# streams are used, even with random_state set; consider n_streams=1 if exact
# reproducibility matters -- confirm against the installed cuML version.
rf_model = cumlRF(n_estimators=200, max_depth=10, random_state=42)

# Time the fit with perf_counter: it is a monotonic, high-resolution clock
# intended for measuring intervals, unlike time.time(), which follows the
# (adjustable) system wall clock.
start_train = time.perf_counter()
rf_model.fit(X_train, y_train)
end_train = time.perf_counter()
print(f"Training time: {end_train - start_train:.4f} seconds")

# 5. CROSS-VALIDATION

In [None]:
# 5-fold cross-validated accuracy, computed on the training partition only so
# the held-out test rows stay unseen.
cv_scores = cross_val_score(
    rf_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
mean_cv_accuracy = np.mean(cv_scores)
print(f"Mean Cross-Validation Accuracy: {mean_cv_accuracy:.4f}")

# 6. TEST SET PREDICTIONS

In [None]:
# Batch prediction over the whole test partition, timed with perf_counter
# (monotonic and high-resolution; time.time() tracks the adjustable wall
# clock and can be too coarse for short intervals).
start_test = time.perf_counter()
y_pred_test = rf_model.predict(X_test)
end_test = time.perf_counter()
print(f"Test prediction time: {end_test - start_test:.4f} seconds")

# Single inference time: latency of predicting one row (the first test row).
# This interval is short, so the high-resolution clock matters most here.
start_single = time.perf_counter()
single_inference = rf_model.predict(X_test.iloc[:1])
end_single = time.perf_counter()
print(f"Single inference time: {end_single - start_single:.4f} seconds")

# 7. METRICS

In [None]:
# Every metric below compares the same (truth, prediction) pair.
truth_and_pred = (y_test, y_pred_test)

# 'weighted' averages the per-class scores using each class's support.
averaging = 'weighted'

accuracy = accuracy_score(*truth_and_pred)
precision = precision_score(*truth_and_pred, average=averaging)
recall = recall_score(*truth_and_pred, average=averaging)
f1 = f1_score(*truth_and_pred, average=averaging)

# Per-class breakdown first, then the aggregate figures.
print("\nClassification Report:")
print(classification_report(*truth_and_pred))

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")

# Confusion matrix: true labels are passed first, predictions second.
conf_matrix = confusion_matrix(*truth_and_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# 8. SHAP INTERPRETABILITY

In [None]:
# Build a tree explainer around the fitted forest and compute SHAP values for
# every row of the test partition.
# NOTE(review): rf_model is created via cumlRF; it is not evident from this
# notebook that shap.TreeExplainer accepts that model type -- confirm, or
# check whether cuML's own tree explainer is needed here.
# NOTE(review): explaining the full X_test may be slow or memory-heavy on a
# large dataset; consider explaining a sample -- TODO confirm dataset size.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary for the test features.
shap.summary_plot(shap_values, X_test)

# 9. SAVE THE MODEL

In [None]:
# Serialize the fitted model to the working directory with joblib.
joblib.dump(
    value=rf_model,
    filename="rf_model_gpu.pkl",
)
print("Model saved as rf_model_gpu.pkl")