In [None]:
import pandas as pd

# Paths of the two CSV inputs (relative to the notebook's working directory).
labels_path = '../data/CICD/Label.csv'
data_path = '../data/CICD/Data.csv'

# Load each file into its own DataFrame: labels first, then the data table.
labels = pd.read_csv(labels_path)
data = pd.read_csv(data_path)


In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

# Features are the whole data frame; the target is the 'Label' column of labels.
X, y = data, labels['Label']

# Hold out 20% of the rows for testing, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of every piece of the split.
for description, part in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(description, part.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Baseline random forest: 100 trees, depth capped at 20, 'balanced' class
# weighting, fixed seed, n_jobs=-1 for parallel fitting/prediction.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_classifier = RandomForestClassifier(**rf_params)

# Fit on the training split, then predict the held-out split.
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Overall accuracy followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Pair each input column with the forest's importance score, highest first.
importance_table = pd.DataFrame({
    'feature': data.columns,
    'importance': rf_classifier.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

# Confusion matrix of the test labels against the model's predictions,
# printed under a heading line.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current pyplot figure; the caller is expected to
    create the figure beforehand and call ``plt.show()`` afterwards.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the y axis ("True label") and
        columns on the x axis ("Predicted label").
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum so each cell shows the
        fraction of that row. Rows whose sum is zero are left as all zeros
        instead of producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    # Accept lists / nested sequences as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guarded division: a row with no samples would otherwise yield
        # 0/0 = NaN, which breaks both the annotations and the colour scale.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum are dark, so their text is drawn white.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 ha="center", va="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Plot the confusion matrix twice: raw counts, then row-normalized.
class_names = sorted(labels['Label'].unique())
plt.figure(figsize=(10, 8))
plot_confusion_matrix(conf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')
plt.show()

plt.figure(figsize=(10, 8))
plot_confusion_matrix(conf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')
plt.show()

# One-vs-rest ROC analysis.
# Binarize against the classifier's own class order: column i of
# predict_proba corresponds to rf_classifier.classes_[i], so using the same
# sequence here keeps label columns and probability columns aligned.
roc_classes = rf_classifier.classes_
y_test_bin = label_binarize(y_test, classes=roc_classes)
n_classes = y_test_bin.shape[1]

# Class probabilities are identical for every class iteration, so compute
# them once instead of re-running the forest inside the loop.
y_score = rf_classifier.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves for each class
plt.figure(figsize=(10, 8))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %s' % (roc_auc[i], roc_classes[i]))

# Diagonal reference line (equal true- and false-positive rate).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves for Each Class')
plt.legend(loc="lower right")
plt.show()

In [None]:
print("Based on the confusion matrix, the pairs of classes that are most frequently confused with each other are:")

# Walk every off-diagonal cell of the confusion matrix and report its count
# (zero counts included; the diagonal holds the correct predictions).
for i, row in enumerate(conf_matrix):
    for j in range(len(conf_matrix)):
        if i == j:
            continue
        print(f"* Class {class_names[i]} is confused with Class {class_names[j]} ({row[j]} instances)")


In [None]:
# Collect every off-diagonal cell with at least one instance as
# ((row class, column class), count).
n_labels = len(conf_matrix)
confused_pairs = [
    ((class_names[i], class_names[j]), conf_matrix[i, j])
    for i in range(n_labels)
    for j in range(n_labels)
    if i != j and conf_matrix[i, j] > 0
]

# Largest counts first; ties keep their original row-major order.
confused_pairs_sorted = sorted(confused_pairs, key=lambda entry: entry[1], reverse=True)

print("Pairs of classes that are most frequently confused with each other (in descending order):")
for (first, second), count in confused_pairs_sorted:
    print(f"* Class {first} is confused with Class {second} ({count} instances)")

In [None]:
from sklearn.inspection import permutation_importance

# 1. Permutation importance on the test split
#
# No per-class scorer is passed, so these importances describe the model as a
# whole on X_test / y_test; they are not specific to class 5.
result = permutation_importance(
    rf_classifier,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

# Mean importance over the repeats, one row per feature, highest first.
class_importance = (
    pd.DataFrame({
        'feature': X_test.columns,
        'importance': result.importances_mean,
    })
    .sort_values('importance', ascending=False)
)

print("Top 10 Most Important Features for Classification:")
print(class_importance.head(10))

# 2. Misclassified test samples
# The same boolean mask selects the rows, their true labels and their
# predicted labels.
error_mask = y_test != y_pred
misclassified = X_test[error_mask]
misclassified_true = y_test[error_mask]
misclassified_pred = y_pred[error_mask]

# Keep the errors where class 5 is either the true or the predicted label.
involves_class_5 = (misclassified_true == 5) | (misclassified_pred == 5)
class_5_errors = misclassified[involves_class_5]

print("\nStatistical summary of misclassified samples for class 5:")
print(class_5_errors.describe())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Custom class weights: every label 0-9 starts at weight 1, then classes 4 and
# 5 are raised to 4 and 5 respectively.
class_weights = {label: 1 for label in range(10)}
class_weights.update({4: 4, 5: 5})

# Same forest settings as before, but with the explicit weight mapping.
weighted_rf_params = dict(
    n_estimators=100,
    max_depth=20,
    class_weight=class_weights,
    random_state=42,
    n_jobs=-1,
)
rf_classifier_weighted = RandomForestClassifier(**weighted_rf_params)

# Fit on the training split and predict the held-out split.
rf_classifier_weighted.fit(X_train, y_train)
y_pred_weighted = rf_classifier_weighted.predict(X_test)

# Overall accuracy followed by the per-class report.
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
print("Accuracy with Weighted Classes:", accuracy_weighted)

print("Classification Report with Weighted Classes:")
print(classification_report(y_test, y_pred_weighted))

# Pair each input column with the weighted forest's importance, highest first.
weighted_importance_table = pd.DataFrame({
    'feature': data.columns,
    'importance': rf_classifier_weighted.feature_importances_,
})
feature_importance_weighted = weighted_importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features (Weighted):")
print(feature_importance_weighted.head(10))

In [None]:
# Plot the confusion matrix of the class-weighted model.
#
# The matrix is computed from y_pred_weighted. Re-using `conf_matrix` here
# would redraw the matrix of the first (unweighted-dict) classifier, because
# that variable was built from `y_pred`.
class_names = sorted(labels['Label'].unique())
conf_matrix_weighted = confusion_matrix(y_test, y_pred_weighted)

plt.figure(figsize=(10, 8))
plot_confusion_matrix(conf_matrix_weighted, classes=class_names,
                      title='Confusion matrix (weighted classes), without normalization')
plt.show()

plt.figure(figsize=(10, 8))
plot_confusion_matrix(conf_matrix_weighted, classes=class_names, normalize=True,
                      title='Normalized confusion matrix (weighted classes)')
plt.show()

In [None]:
import mlflow
from mlflow.models import infer_signature
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd

import io  # Import the io module
import os
# Artifact-store endpoint. setdefault keeps this address as the fallback while
# letting an already-exported MLFLOW_S3_ENDPOINT_URL take precedence.
os.environ.setdefault("MLFLOW_S3_ENDPOINT_URL", "http://192.168.1.189:9000")

# SECURITY: the access key and secret must not be stored in the notebook.
# Export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY before starting the kernel
# (or configure them through another credential source). Credentials that were
# previously committed in this cell should be considered exposed and rotated.
_missing_credentials = [
    name for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_credentials:
    print("WARNING: " + ", ".join(_missing_credentials)
          + " not set in the environment; artifact upload may fail unless"
          + " credentials are configured elsewhere.")


# Set remote MLflow tracking URI
mlflow.set_tracking_uri("http://192.168.1.86:5050")

# Ensure experiment exists or create it
mlflow.set_experiment("CICD_IDS_Model_v1")

# Start a new run
with mlflow.start_run(run_name="Stacking_Classifier-rf+lr_and_standard_scaler") as run:

    # Automatically log all parameters, metrics, and models
    mlflow.autolog()

    # Base estimators: the class-weighted random forest and a logistic regression
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=20, class_weight=class_weights, random_state=42, n_jobs=-1)),
        ('lr', LogisticRegression(max_iter=1000,random_state=42))
    ]

    # Create stacking classifier with Logistic Regression as the meta-estimator
    stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=1000),
        cv=5
    )

    # Define and train pipeline (standard scaling, then the stacked model)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', stacking_clf)
    ])
    pipeline.fit(X_train, y_train)

    # Evaluate and log metric
    y_pred_weighted = pipeline.predict(X_test)
    accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
    mlflow.log_metric("accuracy", accuracy_weighted)

    # Log the classification report as a text artifact. log_text takes the
    # string directly, so no intermediate in-memory buffer is needed.
    report_text = classification_report(y_test, y_pred_weighted)
    mlflow.log_text(report_text, "classification_report.txt")

    # Save and log feature importance.
    # Taken from the random forest fitted inside this stacking pipeline, so the
    # artifact describes the model being registered rather than the separately
    # trained `rf_classifier_weighted` from an earlier cell.
    stacked_rf = pipeline.named_steps['classifier'].named_estimators_['rf']
    feature_importance_weighted = pd.DataFrame({
        'feature': data.columns,
        'importance': stacked_rf.feature_importances_
    }).sort_values('importance', ascending=False)
    feature_importance_weighted.to_json("feature_importance.json", orient="records", indent=2)
    mlflow.log_artifact("feature_importance.json")

    # Log pipeline and register it
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="CICD_IDS_Model_v1",
        signature=infer_signature(X_train, y_train)
    )

    print("✅ Model, metrics and artifacts logged to remote MLflow.")

In [None]:
# Registry URI of the model version held in the "Production" stage.
# NOTE(review): this presumes a version of CICD_IDS_Model_v1 has already been
# moved to that stage — confirm before running on a fresh registry.
production_model_uri = "models:/CICD_IDS_Model_v1/Production"
loaded_model = mlflow.sklearn.load_model(production_model_uri)

# Smoke-test inference on the first 10 rows of the test split.
sample_rows = X_test.head(10)
predictions = loaded_model.predict(sample_rows)
print("Predictions for first 10 test samples:", predictions, sep="\n")