# Comparing Original and Enhanced Pipelines in Intrusion Detection

This notebook compares two pipelines for intrusion detection:

- **Original Pipeline**: Data preprocessing and feature selection without Isolation Forest.
- **Enhanced Pipeline**: Data preprocessing and feature selection with Isolation Forest as an anomaly detection filter.

We aim to assess the impact of Isolation Forest on anomaly detection and dataset efficiency before applying tree-based models.


## Import libraries ----- ADDED SOME IMPORTS 4371 -----

In [None]:
!pip install xgboost

# imports below are from 4371 group to make file work
!pip install pandas
!pip install seaborn
# Core data manipulation and scientific libraries
!pip install numpy pandas

# Data visualization libraries
!pip install seaborn matplotlib

# Machine learning libraries
!pip install scikit-learn xgboost

# Imbalanced data handling
!pip install imbalanced-learn

# Hyperparameter optimization libraries
!pip install hyperopt scikit-optimize

# (Optional) Additional dependencies for compatibility
!pip install scipy

# Custom module FCBF (if available locally, or if it's a GitHub repo, use the clone URL)
# Replace "URL_TO_FCBF_MODULE" with the actual URL or location if it's on GitHub or a local file
!pip install git+https://github.com/SantiagoEG/FCBF_module.git


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

# Isolation Forest Import -- 4371
from sklearn.ensemble import IsolationForest

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import mutual_info_classif
from FCBF_module import FCBF, FCBFK

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay
)

# ---Original Codebase Pipeline (Without Isolation Forest)---

## Read the sampled CICIDS2017 dataset
The CICIDS2017 dataset is publicly available at: https://www.unb.ca/cic/datasets/ids-2017.html  
Due to the large size of this dataset, sampled subsets of CICIDS2017 are used. The subsets are in the "data" folder.  
If you want to use this code on other datasets (e.g., CAN-intrusion dataset), just change the dataset name and follow the same steps. The models in this code are generic models that can be used in any intrusion detection/network traffic datasets.

In [None]:
# Original Pipeline (Without Isolation Forest)
print("---Original Pipeline (Without Isolation Forest)---")

# Read the sampled CICIDS2017 dataset
df_orig = pd.read_csv('./data/CICIDS2017_sample.csv')

In [None]:
df_orig.Label.value_counts()

### Preprocessing (normalization and padding values)

In [None]:
# Standardise every non-object column: subtract the column mean and divide by
# the column standard deviation (z-score).
features_orig = df_orig.columns[(df_orig.dtypes != 'object').to_numpy()]
numeric_block_orig = df_orig[features_orig]
df_orig[features_orig] = numeric_block_orig.apply(
    lambda col: (col - col.mean()) / (col.std())
)

# Any value that is missing after normalisation is replaced by 0.
df_orig = df_orig.fillna(0)

### Data sampling
Due to the space limit of GitHub files and the large size of network traffic data, we sample a small-sized subset for model learning using **k-means cluster sampling**

In [None]:
# Encode the class names in 'Label' as integer ids.
# The column is addressed by name (the rest of the notebook also refers to it as
# 'Label') instead of by position `iloc[:, -1]`, so the cell no longer depends on
# 'Label' being the last column. Assigning by name also replaces the column
# outright; NOTE(review): on recent pandas versions the positional in-place
# assignment can keep the previous object dtype - confirm against the pandas
# version in use.
labelencoder_orig = LabelEncoder()
df_orig['Label'] = labelencoder_orig.fit_transform(df_orig['Label'])

In [None]:
df_orig.Label.value_counts()

In [None]:
# Rows whose encoded label is 6, 1 or 4 are kept in full as the minority set;
# all remaining rows form the majority set that is down-sampled further below.
is_minority_orig = df_orig['Label'].isin([6, 1, 4])
df_minor_orig = df_orig[is_minority_orig]
df_major_orig = df_orig.drop(df_minor_orig.index)

In [None]:
# Label vector (flat 1-D array) and feature frame for the majority rows.
y_orig = df_major_orig['Label'].to_numpy().ravel()
X_orig = df_major_orig.drop(columns=['Label'])

In [None]:
# Cluster the majority-class rows with mini-batch k-means (1000 clusters, fixed
# seed). The cluster assignments are used below to draw a fixed fraction of rows
# from every cluster.
from sklearn.cluster import MiniBatchKMeans
kmeans_orig = MiniBatchKMeans(n_clusters=1000, random_state=0).fit(X_orig)

In [None]:
klabel_orig = kmeans_orig.labels_
df_major_orig['klabel'] = klabel_orig

In [None]:
df_major_orig['klabel'].value_counts()

In [None]:
# Move the 'Label' column to position 78 in the column order.
cols_orig = df_major_orig.columns.tolist()
label_column_orig = cols_orig.pop(cols_orig.index('Label'))
cols_orig.insert(78, label_column_orig)
df_major_orig = df_major_orig[cols_orig]

In [None]:
def typicalSampling_orig(group, frac=0.008, random_state=0):
    """Return a reproducible random sample of one k-means cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to a single cluster (one group of a ``groupby``).
    frac : float, default 0.008
        Fraction of the group's rows to keep.
    random_state : int, default 0
        Seed passed to ``DataFrame.sample`` so that re-running the notebook
        selects the same rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``group``.
    """
    # The previous version read `group.name` into an unused local; dropping it
    # lets the function be called on any DataFrame, not only on groupby groups.
    return group.sample(frac=frac, random_state=random_state)

result_orig = df_major_orig.groupby(
    'klabel', group_keys=False
).apply(typicalSampling_orig)

In [None]:
result_orig['Label'].value_counts()

## 4371: The cell below was modified to use `pandas.concat()`, which is the recommended way to combine DataFrames in recent versions of pandas

In [None]:
import pandas as pd

# 'result_orig' (sampled majority rows) and 'df_minor_orig' (all minority rows)
# are defined in the cells above.

# 'klabel' was added only to the majority rows (it holds their k-means cluster
# id) and is needed only for the per-cluster sampling. It must be removed before
# merging: 'df_minor_orig' has no such column, so keeping it would leave NaN in
# 'klabel' for every minority row and would carry the cluster id into the
# feature matrix. `errors='ignore'` keeps the cell safe to re-run.
result_orig = result_orig.drop(columns=['klabel'], errors='ignore')

# Concatenate 'result_orig' and 'df_minor_orig' DataFrames
result_orig = pd.concat([result_orig, df_minor_orig], ignore_index=True)

print("DataFrames concatenated successfully.")
print("Updated DataFrame head:")
print(result_orig.head())

In [None]:
result_orig.to_csv('./data/CICIDS2017_sample_km_orig.csv', index=False)

### split train set and test set

In [None]:
# Original Pipeline (Without Isolation Forest)
print("---Original Pipeline (Without Isolation Forest)---")

# Read the sampled CICIDS2017 dataset
df_orig = pd.read_csv('./data/CICIDS2017_sample_km_orig.csv')
print(df_orig.isnull().sum())

## ----- ADDED LINES BELOW 4371 TO FIX ISSUE WITH ValueError: Input X contains NaN. ------

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: every NaN is replaced by the mean of its column.
# (Other available strategies: median, most_frequent.)
imputer_orig = SimpleImputer(strategy='mean')

# Fit on and transform ALL columns of df_orig (the 'Label' column included) and
# write the imputed values back into the same columns.
df_orig[df_orig.columns] = imputer_orig.fit_transform(df_orig)

# Why this cell exists: mutual_info_classif failed with
# "ValueError: Input X contains NaN". Replacing the NaN values with the
# per-column mean removes the missing values so that it can run.

In [None]:
# NumPy views of the sampled data: flat label vector and feature matrix.
y_orig = df_orig['Label'].to_numpy().ravel()
X_orig = df_orig.drop(columns=['Label']).to_numpy()

In [None]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_orig, y_orig, train_size=0.8, test_size=0.2, random_state=0, stratify=y_orig
)

## Feature engineering

### Feature selection by information gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Information gain of every feature with respect to the class label, estimated
# on the training split only. The estimator involves random number generation,
# so the seed is fixed to make the feature ranking reproducible between runs.
importances_orig = mutual_info_classif(X_train_orig, y_train_orig, random_state=0)

In [None]:
# Pair each importance score (rounded to 4 decimals) with its feature name and
# rank the pairs from most to least important.
rounded_scores_orig = [round(score, 4) for score in importances_orig]
f_list_orig = sorted(zip(rounded_scores_orig, features_orig), reverse=True)

# Total of the rounded scores; the next cell divides by it to normalise.
Sum_orig = sum([score for score, _ in f_list_orig])

# The same total and the ranked feature names, under their original names.
Sum = sum(score for score, _ in f_list_orig)
fs = [name for _, name in f_list_orig]

In [None]:
# Normalise the importances so they sum to (approximately) 1, rank them, and
# walk down the ranking until the accumulated share reaches 90%.
normalised_scores_orig = [round(share, 4) for share in importances_orig / Sum_orig]
f_list2 = sorted(zip(normalised_scores_orig, features_orig), reverse=True)

Sum2 = 0
fs_selected = []

for share, feature in f_list2:
    Sum2 = Sum2 + share
    fs_selected.append(feature)
    if Sum2 >= 0.9:
        # The feature that crosses the 90% threshold is still included.
        break

In [None]:
# Keep only the features chosen by the 90% cumulative-importance cut-off
# (`fs_selected`, built in the previous cell). Taking the names from
# `f_list_orig` instead would keep EVERY ranked feature, so the information-gain
# selection would have no effect here, while the enhanced pipeline does apply
# the cut-off (it selects `fs_enh`).
feature_names_orig = list(fs_selected)

# Feature matrix restricted to the selected columns
X_fs_orig = df_orig[feature_names_orig].values

In [None]:
X_fs_orig.shape

### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [None]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf_orig = FCBFK(k=20)
#fcbf.fit(X_fs, y)

In [None]:
X_fss_orig = fcbf_orig.fit_transform(X_fs_orig, y_orig)

In [None]:
X_fss_orig.shape

### Re-split train & test sets after feature selection

In [None]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_fss_orig, y_orig, train_size=0.8, test_size=0.2, random_state=0, stratify=y_orig
)


In [None]:
X_train_orig.shape

# Data on the original codebase pipeline without Isolation Forest filtering

In [None]:
# Class distribution in training data
print("Original Training Data Class Distribution:")
print(pd.Series(y_train_orig).value_counts())

# Dataset size
print(f"Original Training Data Shape: {X_train_orig.shape}")
print(f"Original Test Data Shape: {X_test_orig.shape}")

In [None]:
# Train a Random Forest classifier
rf_orig = RandomForestClassifier(random_state=42)
rf_orig.fit(X_train_orig, y_train_orig)

In [None]:
# Predict on test data
y_pred_orig = rf_orig.predict(X_test_orig)

# Classification report
print("Classification Report for Original Pipeline:")
print(classification_report(y_test_orig, y_pred_orig))


In [None]:
# Confusion matrix
conf_matrix_orig = confusion_matrix(y_test_orig, y_pred_orig)
print("Confusion Matrix for Original Pipeline:")
print(conf_matrix_orig)


In [None]:
# Overall accuracy and F1 score
accuracy_orig = accuracy_score(y_test_orig, y_pred_orig)
f1_orig = f1_score(y_test_orig, y_pred_orig, average='weighted')

print(f"Accuracy for Original Pipeline: {accuracy_orig:.4f}")
print(f"Weighted F1 Score for Original Pipeline: {f1_orig:.4f}")

# ---Modified Pipeline (With Isolation Forest)---

## Read the sampled CICIDS2017 dataset
The CICIDS2017 dataset is publicly available at: https://www.unb.ca/cic/datasets/ids-2017.html  
Due to the large size of this dataset, sampled subsets of CICIDS2017 are used. The subsets are in the "data" folder.  
If you want to use this code on other datasets (e.g., CAN-intrusion dataset), just change the dataset name and follow the same steps. The models in this code are generic models that can be used in any intrusion detection/network traffic datasets.

In [None]:
# Enhanced Pipeline (With Isolation Forest)
print("---Enhanced Pipeline (With Isolation Forest)---")

df_enh = pd.read_csv('./data/CICIDS2017_sample.csv')

In [None]:
df_enh.Label.value_counts()

### Preprocessing (normalization and padding values)

In [None]:
# Standardise every non-object column: subtract the column mean and divide by
# the column standard deviation (z-score).
features_enh = df_enh.columns[(df_enh.dtypes != 'object').to_numpy()]
numeric_block_enh = df_enh[features_enh]
df_enh[features_enh] = numeric_block_enh.apply(
    lambda col: (col - col.mean()) / (col.std())
)

# Any value that is missing after normalisation is replaced by 0.
df_enh = df_enh.fillna(0)

### Data sampling
Due to the space limit of GitHub files and the large size of network traffic data, we sample a small-sized subset for model learning using **k-means cluster sampling**

In [None]:
# Encode the class names in 'Label' as integer ids.
# The column is addressed by name (the rest of the notebook also refers to it as
# 'Label') instead of by position `iloc[:, -1]`, so the cell no longer depends on
# 'Label' being the last column. Assigning by name also replaces the column
# outright; NOTE(review): on recent pandas versions the positional in-place
# assignment can keep the previous object dtype - confirm against the pandas
# version in use.
labelencoder_enh = LabelEncoder()
df_enh['Label'] = labelencoder_enh.fit_transform(df_enh['Label'])

In [None]:
df_enh.Label.value_counts()

In [None]:
# Rows whose encoded label is 6, 1 or 4 are kept in full as the minority set;
# all remaining rows form the majority set that is down-sampled further below.
is_minority_enh = df_enh['Label'].isin([6, 1, 4])
df_minor_enh = df_enh[is_minority_enh]
df_major_enh = df_enh.drop(df_minor_enh.index)

In [None]:
# Label vector (flat 1-D array) and feature frame for the majority rows.
y_enh = df_major_enh['Label'].to_numpy().ravel()
X_enh = df_major_enh.drop(columns=['Label'])

In [None]:
# Cluster the majority-class rows with mini-batch k-means (1000 clusters, fixed
# seed). The cluster assignments are used below to draw a fixed fraction of rows
# from every cluster.
from sklearn.cluster import MiniBatchKMeans
kmeans_enh = MiniBatchKMeans(n_clusters=1000, random_state=0).fit(X_enh)

In [None]:
klabel_enh = kmeans_enh.labels_
df_major_enh['klabel'] = klabel_enh

In [None]:
df_major_enh['klabel'].value_counts()

In [None]:
# Move the 'Label' column to position 78 in the column order.
cols_enh = df_major_enh.columns.tolist()
label_column_enh = cols_enh.pop(cols_enh.index('Label'))
cols_enh.insert(78, label_column_enh)
df_major_enh = df_major_enh[cols_enh]

In [None]:
def typicalSampling_enh(group, frac=0.008, random_state=0):
    """Return a reproducible random sample of one k-means cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to a single cluster (one group of a ``groupby``).
    frac : float, default 0.008
        Fraction of the group's rows to keep.
    random_state : int, default 0
        Seed passed to ``DataFrame.sample`` so that re-running the notebook
        selects the same rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``group``.
    """
    # The previous version read `group.name` into an unused local; dropping it
    # lets the function be called on any DataFrame, not only on groupby groups.
    return group.sample(frac=frac, random_state=random_state)

result_enh = df_major_enh.groupby(
    'klabel', group_keys=False
).apply(typicalSampling_enh)

In [None]:
result_enh['Label'].value_counts()

## 4371: The cell below was modified to use `pandas.concat()`, which is the recommended way to combine DataFrames in recent versions of pandas

In [None]:
import pandas as pd

# 'result_enh' (sampled majority rows) and 'df_minor_enh' (all minority rows)
# are defined in the cells above.

# 'klabel' was added only to the majority rows (it holds their k-means cluster
# id) and is needed only for the per-cluster sampling. It must be removed before
# merging: 'df_minor_enh' has no such column, so keeping it would leave NaN in
# 'klabel' for every minority row and would carry the cluster id into the
# feature matrix. `errors='ignore'` keeps the cell safe to re-run.
result_enh = result_enh.drop(columns=['klabel'], errors='ignore')

# Concatenate 'result_enh' and 'df_minor_enh' DataFrames
result_enh = pd.concat([result_enh, df_minor_enh], ignore_index=True)

print("DataFrames concatenated successfully.")
print("Updated DataFrame head:")
print(result_enh.head())

In [None]:
result_enh.to_csv('./data/CICIDS2017_sample_km_enh.csv', index=False)

### split train set and test set

In [None]:
# Enhanced Pipeline (With Isolation Forest)
print("---Enhanced Pipeline (With Isolation Forest)---")

# Reload the k-means-sampled subset produced by THIS pipeline. The save cell
# above writes './data/CICIDS2017_sample_km_enh.csv'; the previous path
# ('CICIDS2017_sample_km.csv') is never written by this notebook, so reading it
# either failed or silently loaded an unrelated file.
df_enh = pd.read_csv('./data/CICIDS2017_sample_km_enh.csv')

# Per-column count of missing values
print(df_enh.isnull().sum())

## ----- ADDED LINES BELOW 4371 TO FIX ISSUE WITH ValueError: Input X contains NaN. ------

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: every NaN is replaced by the mean of its column.
# (Other available strategies: median, most_frequent.)
# The imputer is fitted on, and applied to, ALL columns of df_enh (the 'Label'
# column included); the imputed values are written back into the same columns.
imputer_enh = SimpleImputer(strategy='mean')
df_enh[df_enh.columns] = imputer_enh.fit_transform(df_enh)

# Why this cell exists: mutual_info_classif failed with
# "ValueError: Input X contains NaN". Replacing the NaN values with the
# per-column mean removes the missing values so that it can run.

In [None]:
# NumPy views of the sampled data: flat label vector and feature matrix.
y_enh = df_enh['Label'].to_numpy().ravel()
X_enh = df_enh.drop(columns=['Label']).to_numpy()

In [None]:
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enh, y_enh, train_size=0.8, test_size=0.2, random_state=0, stratify=y_enh
)

## Feature engineering

### Feature selection by information gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Information gain of every feature with respect to the class label.
# It is estimated on the training split only (X_train_enh / y_train_enh from the
# split above), exactly as the original pipeline does with X_train_orig, so the
# two pipelines are compared on equal terms and the held-out rows do not
# influence feature selection. The seed is fixed because the estimator involves
# random number generation.
importances_enh = mutual_info_classif(X_train_enh, y_train_enh, random_state=0)

In [None]:
# Feature names, in the same order as the columns of X_enh (which is df_enh
# without 'Label'). A dtype-based filter does not work here: after the reload and
# imputation 'Label' is numeric too, so it would be kept in the name list, any
# column following it would be paired with the wrong importance score, and
# 'Label' itself could end up being selected as a feature.
features_enh = df_enh.drop(columns=['Label']).columns

# Pair each importance score (rounded to 4 decimals) with its feature name and
# rank the pairs from most to least important.
f_list_enh = sorted(zip(map(lambda x: round(x, 4), importances_enh), features_enh), reverse=True)

# Total of the rounded scores (used in the next cell to normalise) and the
# ranked feature names.
Sum_enh = 0
fs_enh = []
for score, feature in f_list_enh:
    Sum_enh = Sum_enh + score
    fs_enh.append(feature)

In [None]:
# Normalise the importances so they sum to (approximately) 1, rank them, and
# walk down the ranking until the accumulated share reaches 90%.
normalised_scores_enh = [round(share, 4) for share in importances_enh / Sum_enh]
f_list2_enh = sorted(zip(normalised_scores_enh, features_enh), reverse=True)

Sum2_enh = 0
fs_enh = []
for share, feature in f_list2_enh:
    Sum2_enh = Sum2_enh + share
    fs_enh.append(feature)
    if Sum2_enh >= 0.9:
        # The feature that crosses the 90% threshold is still included.
        break

In [None]:
X_fs_enh = df_enh[fs_enh].values

In [None]:
X_fs_enh.shape

### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [None]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf_enh = FCBFK(k=20)
#fcbf.fit(X_fs, y)

In [None]:
X_fss_enh = fcbf_enh.fit_transform(X_fs_enh, y_enh)

In [None]:
X_fss_enh.shape

## Isolation Forest Implementation 

After performing feature selection using Information Gain (IG) and Fast Correlation-Based Filter (FCBF), we apply the Isolation Forest to detect and filter out anomalies in our dataset. This step enhances our model's ability to differentiate between actual threats and benign unusual behavior by removing potential outliers before training.

Isolation Forest parameters:

- `n_estimators=100`: number of trees in the forest.
- `contamination='auto'`: let the algorithm decide the proportion of anomalies.
- `random_state=42`: for reproducibility.

Anomaly detection:

- `anomaly_predictions == 1`: inliers (normal instances).
- `anomaly_predictions == -1`: outliers (anomalies).

Filtering data:

- `X_filtered`: contains only the inlier instances.
- `y_filtered`: corresponding labels for the inliers.


In [None]:
# Fit an Isolation Forest on the selected features (labels are not used).
iso_forest_enh = IsolationForest(
    n_estimators=100, contamination='auto', random_state=42
).fit(X_fss_enh)

# Per-sample verdict (+1 = inlier, -1 = outlier) and the underlying score.
anomaly_predictions_enh = iso_forest_enh.predict(X_fss_enh)
anomaly_scores_enh = iso_forest_enh.decision_function(X_fss_enh)

# Keep only the samples flagged as inliers, together with their labels.
inlier_mask_enh = anomaly_predictions_enh == 1
X_filtered_enh, y_filtered_enh = X_fss_enh[inlier_mask_enh], y_enh[inlier_mask_enh]

In [None]:
print(f"y_filtered_enh is defined: {'y_filtered_enh' in locals()}")

In [None]:
# Row positions of the samples the Isolation Forest flagged as outliers.
removed_mask_enh = anomaly_predictions_enh == -1
removed_anomalies_indices = np.flatnonzero(removed_mask_enh)

# X_fss_enh was built from df_enh row by row, so positions in one map to
# positions in the other; the index is reset so that positional lookup is safe.
df_enh_reset = df_enh.reset_index(drop=True)

# Class labels of the removed rows and how often each class occurs among them.
removed_anomalies = df_enh_reset.iloc[removed_anomalies_indices]
removed_labels = removed_anomalies['Label']
removed_label_counts = removed_labels.value_counts()

print("Labels of Removed Anomalies:")
print(removed_label_counts)

# Share of each class among the removed rows, in percent.
total_removed = removed_label_counts.sum()
removed_label_proportions = (removed_label_counts / total_removed) * 100

print("\nProportion of Each Class in Removed Anomalies:")
print(removed_label_proportions)


## Visualizing the Removed Anomalies

To understand what the Isolation Forest filtered out, we visualize the class labels of the removed samples and compare them with the class distribution of the retained inliers. This shows which classes are most affected by the filtering step. (The distribution of the anomaly scores themselves is plotted in the comparison section at the end of the notebook.)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.barplot(x=removed_label_counts.index, y=removed_label_counts.values, palette='viridis')
plt.title('Removed Anomalies by Class')
plt.xlabel('Class Label')
plt.ylabel('Number of Removed Samples')
plt.show()


In [None]:
# Labels of retained inliers
retained_labels = y_filtered_enh

# Plotting the distribution
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.countplot(x=retained_labels, palette='pastel')
plt.title('Retained Inliers Class Distribution')
plt.xlabel('Class Label')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
sns.countplot(x=removed_labels, palette='magma')
plt.title('Removed Anomalies Class Distribution')
plt.xlabel('Class Label')
plt.ylabel('Count')

plt.tight_layout()
plt.show()


In [None]:
# Encoded class ids grouped into benign traffic and attacks.
benign_classes = [0.0]  # Assuming 0.0 corresponds to 'BENIGN'
threat_classes = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]  # Other classes are threats

# Number of removed samples that fall into each group.
is_benign_removed = removed_labels.isin(benign_classes)
is_threat_removed = removed_labels.isin(threat_classes)
benign_removed = is_benign_removed.sum()
threat_removed = is_threat_removed.sum()

print(f"Benign Anomalies Removed: {benign_removed}")
print(f"Threat Anomalies Removed: {threat_removed}")


## Filtering Out Detected Anomalies

Using the predictions from the Isolation Forest, we filter out the anomalies (outliers) from our dataset. We retain only the inlier data points (those predicted as normal) for model training. This step aims to improve the quality of our training data by removing noise and potential outliers.

In [None]:
# Keep only the samples the Isolation Forest predicted as inliers (+1) and the
# labels that belong to them. These are the same three assignments already made
# in the Isolation Forest cell above; they are repeated here so this section can
# be read (and re-run) on its own.
inlier_mask_enh = anomaly_predictions_enh == 1
X_filtered_enh = X_fss_enh[inlier_mask_enh]
y_filtered_enh = y_enh[inlier_mask_enh]

### Re-split train & test sets after feature selection

In [None]:
# Train-test split after filtering
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_filtered_enh, y_filtered_enh, train_size=0.8, test_size=0.2, random_state=0, stratify=y_filtered_enh
)

In [None]:
X_train_enh.shape

In [None]:
pd.Series(y_train_enh).value_counts()

In [None]:
# Display shapes and class distributions
print(f"Enhanced Training Data Shape: {X_train_enh.shape}")
print(f"Enhanced Test Data Shape: {X_test_enh.shape}")
print("Enhanced Training Data Class Distribution:")
print(pd.Series(y_train_enh).value_counts())

In [None]:
# Train a Random Forest classifier
rf_enh = RandomForestClassifier(random_state=42)
rf_enh.fit(X_train_enh, y_train_enh)

In [None]:
# Predict on test data
y_pred_enh = rf_enh.predict(X_test_enh)

# Classification report
print("Classification Report for Enhanced Pipeline:")
print(classification_report(y_test_enh, y_pred_enh))

In [None]:
# Confusion matrix
conf_matrix_enh = confusion_matrix(y_test_enh, y_pred_enh)
print("Confusion Matrix for Enhanced Pipeline:")
print(conf_matrix_enh)

In [None]:
# Overall accuracy and F1 score
accuracy_enh = accuracy_score(y_test_enh, y_pred_enh)
f1_enh = f1_score(y_test_enh, y_pred_enh, average='weighted')

print(f"Accuracy for Enhanced Pipeline: {accuracy_enh:.4f}")
print(f"Weighted F1 Score for Enhanced Pipeline: {f1_enh:.4f}")

# Full Comparison of Original pipeline against Isolation Forest Implementation

In [None]:
# Sample counts before and after the Isolation Forest filter.
total_samples_before = len(X_fss_enh)
total_samples_after = len(X_filtered_enh)

# Absolute and relative number of samples the filter removed.
anomalies_removed = total_samples_before - total_samples_after
anomaly_percentage = (anomalies_removed / total_samples_before) * 100

print(f"Total Samples Before Isolation Forest: {total_samples_before}")
print(f"Total Samples After Isolation Forest: {total_samples_after}")
print(f"Anomalies Detected and Removed: {anomalies_removed}")
print(f"Percentage of Anomalies Detected: {anomaly_percentage:.2f}%")

In [None]:
# Compare overall metrics
print("Comparison of Pipelines:")
print(f"Accuracy - Original Pipeline: {accuracy_orig:.4f}")
print(f"Accuracy - Enhanced Pipeline: {accuracy_enh:.4f}\n")

print(f"Weighted F1 Score - Original Pipeline: {f1_orig:.4f}")
print(f"Weighted F1 Score - Enhanced Pipeline: {f1_enh:.4f}")

In [None]:
# Confusion Matrix for Original Pipeline
disp_orig = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_orig, display_labels=rf_orig.classes_)
disp_orig.plot(cmap='Blues')
plt.title('Confusion Matrix - Original Pipeline')
plt.show()

# Confusion Matrix for Enhanced Pipeline
disp_enh = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_enh, display_labels=rf_enh.classes_)
disp_enh.plot(cmap='Greens')
plt.title('Confusion Matrix - Enhanced Pipeline')
plt.show()

In [None]:
# Classification reports as dictionaries
report_orig = classification_report(y_test_orig, y_pred_orig, output_dict=True)
report_enh = classification_report(y_test_enh, y_pred_enh, output_dict=True)

# Convert to DataFrames
df_report_orig = pd.DataFrame(report_orig).transpose()
df_report_enh = pd.DataFrame(report_enh).transpose()

# Per-class F1-scores
print("Per-class F1-scores for Original Pipeline:")
print(df_report_orig['f1-score'])

print("\nPer-class F1-scores for Enhanced Pipeline:")
print(df_report_enh['f1-score'])

In [None]:
# Classes to compare: every row of the original report except the last three
# summary rows ('accuracy', 'macro avg', 'weighted avg').
classes = df_report_orig.index[:-3]

# Extract F1 scores.
# The enhanced report is aligned with `reindex` rather than `.loc`: if the
# Isolation Forest filter removed every test sample of some class, that class is
# absent from the enhanced report and `.loc` would raise a KeyError. With
# `reindex` such a class simply gets NaN.
f1_scores_orig = df_report_orig.loc[classes, 'f1-score']
f1_scores_enh = df_report_enh['f1-score'].reindex(classes)

# Plotting: one pair of bars per class, original on the left, enhanced on the right
x = np.arange(len(classes))
width = 0.35

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, f1_scores_orig, width, label='Original Pipeline')
# A class missing from the enhanced report is drawn as a zero-height bar.
rects2 = ax.bar(x + width/2, f1_scores_enh.fillna(0.0), width, label='Enhanced Pipeline')

ax.set_ylabel('F1 Score')
ax.set_title('Per-Class F1 Score Comparison')
ax.set_xticks(x)
ax.set_xticklabels(classes, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
print("Class Distribution Before Isolation Forest:")
print(pd.Series(y_enh).value_counts())

print("\nClass Distribution After Isolation Forest:")
print(pd.Series(y_filtered_enh).value_counts())

# Plotting class distributions
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Before Isolation Forest
sns.countplot(x=y_enh, ax=ax[0])
ax[0].set_title('Before Isolation Forest')
ax[0].set_xlabel('Class')
ax[0].set_ylabel('Count')

# After Isolation Forest
sns.countplot(x=y_filtered_enh, ax=ax[1])
ax[1].set_title('After Isolation Forest')
ax[1].set_xlabel('Class')
ax[1].set_ylabel('Count')

plt.tight_layout()
plt.show()


In [None]:
# Original dataset sizes
print("Original Training Data Shape:", X_train_orig.shape)
print("Original Test Data Shape:", X_test_orig.shape)

# Enhanced dataset sizes
print("\nEnhanced Training Data Shape:", X_train_enh.shape)
print("Enhanced Test Data Shape:", X_test_enh.shape)

In [None]:
summary_data = {
    'Metric': [
        'Total Samples',
        'Anomalies Detected and Removed',
        'Percentage of Anomalies Detected',
        'Training Data Shape',
        'Test Data Shape',
        # 'Time Taken for Preprocessing (seconds)'  # If you have timing data
    ],
    'Original Pipeline': [
        X_fss_orig.shape[0],
        'N/A',
        'N/A',
        X_train_orig.shape,
        X_test_orig.shape,
        # f"{time_orig:.2f}"
    ],
    'Enhanced Pipeline': [
        X_filtered_enh.shape[0],
        anomalies_removed,
        f"{anomaly_percentage:.2f}%",
        X_train_enh.shape,
        X_test_enh.shape,
        # f"{time_enh:.2f}"
    ],
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))


In [None]:
# Density of the Isolation Forest anomaly scores over all samples that were
# scored (before filtering).
plt.figure(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
sns.kdeplot(anomaly_scores_enh, fill=True, color='red')
plt.title('Density Plot of Anomaly Scores')
plt.xlabel('Anomaly Score')
plt.ylabel('Density')
plt.show()

In [None]:
# Create a summary DataFrame
summary_data = {
    'Metric': ['Accuracy', 'Weighted F1 Score'],
    'Original Pipeline': [accuracy_orig, f1_orig],
    'Enhanced Pipeline': [accuracy_enh, f1_enh]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df)

In [None]:
# Derive the verdict from the measured metrics instead of asserting an
# improvement unconditionally.
# Caveat: the enhanced pipeline is evaluated on a test set from which the
# Isolation Forest outliers were already removed, so the two pipelines are not
# scored on the same samples.
if accuracy_enh > accuracy_orig and f1_enh > f1_orig:
    print("Based on the evaluation metrics, the Enhanced Pipeline demonstrates improved performance over the Original Pipeline.")
elif accuracy_enh < accuracy_orig and f1_enh < f1_orig:
    print("Based on the evaluation metrics, the Enhanced Pipeline does not improve on the Original Pipeline.")
else:
    print("Based on the evaluation metrics, the comparison is mixed: neither pipeline is better on both accuracy and weighted F1 score.")