In [None]:
import pandas as pd

# Load the labels and report the class balance as percentages
labels = pd.read_csv('../data/CICD/Label.csv')
distribution = labels['Label'].value_counts(normalize=True) * 100
print("Label distribution (%):")
print(distribution)


# Load the feature matrix; it is paired with `labels` row-by-row when the
# train/test split is created, so both files must describe the same rows.
data = pd.read_csv('../data/CICD/Data.csv')

# Fail early with a readable message if the two files disagree in length
assert len(data) == len(labels), (
    f"Data.csv has {len(data)} rows but Label.csv has {len(labels)} rows"
)

# Show information
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

# Show descriptive statistics
print("\nDescriptive statistics:")
print(data.describe())

# Show first rows
print("\nFirst rows:")
print(data.head())

# Show dimensions
print(f"\nDataset dimensions: {data.shape[0]} rows x {data.shape[1]} columns")



In [None]:
# Distribution plots for the leading 35 columns (positions 0-34)
import matplotlib.pyplot as plt

leading_columns = data.iloc[:, :35]
leading_columns.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Distribution plots for every column from position 35 onward
trailing_columns = data.iloc[:, 35:]
trailing_columns.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Hold-out split: 80% of the rows for training, 20% for testing
from sklearn.model_selection import train_test_split

# Feature matrix and target vector
X = data
y = labels['Label']

# stratify=y keeps the class proportions the same in both partitions;
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each partition
for caption, part in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, part.shape)

In [None]:
# Per-column count of missing entries; computed once and reused below
missing_values = data.isnull().sum()

# Columns that contain at least one missing entry
print("Features with missing values:")
print(missing_values[missing_values > 0])

# Same information expressed as a share of the row count
missing_percentage = (missing_values / len(data)) * 100
print("\nPercentage of missing values:")
print(missing_percentage[missing_percentage > 0])

# Grand total across all columns
total_missing = missing_values.sum()
print(f"\nTotal number of missing values: {total_missing}")

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Hand-picked numerical columns to inspect for outliers
features_to_plot = [
    'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
    'Flow Bytes/s', 'Flow Packets/s', 'Packet Length Mean',
    'Average Packet Size', 'Flow IAT Mean'
]

# One box per selected column, all drawn on a single set of axes
fig, ax = plt.subplots(figsize=(15, 10))
data[features_to_plot].boxplot(ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Box Plot of Selected Features')
fig.tight_layout()

# Outlier tally per column using the 1.5 * IQR fences, plus the observed range
for feature in features_to_plot:
    column = data[feature]
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    spread = upper_quartile - lower_quartile
    below_fence = column < lower_quartile - 1.5 * spread
    above_fence = column > upper_quartile + 1.5 * spread
    outlier_count = (below_fence | above_fence).sum()
    print(f"\nOutliers in {feature}: {outlier_count}")
    print(f"Min: {column.min():.2f}")
    print(f"Max: {column.max():.2f}")

In [None]:
# Split the columns into "looks categorical" and "numerical" by cardinality
categorical_like = []
numerical_features = []

for column in data.columns:
    series = data[column]
    n_unique = series.nunique()

    # Ten or more distinct values: treat the column as numerical
    if n_unique >= 10:
        numerical_features.append(column)
        continue

    # Fewer than ten distinct values: record it as a categorical candidate
    categorical_like.append(dict(
        column=column,
        unique_values=sorted(series.unique()),
        count=n_unique,
    ))

# Summarise the categorical candidates
print("Potential categorical features:")
for candidate in categorical_like:
    print(f"\n{candidate['column']}:")
    print(f"Number of unique values: {candidate['count']}")
    print(f"Unique values: {candidate['unique_values']}")

print(f"\nNumber of numerical features: {len(numerical_features)}")

In [None]:
#Feature scaling and transformations

from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np

# Columns eligible for scaling: the numerical columns, minus anything that
# was flagged as categorical-like (flags and other low-cardinality columns).
categorical_columns = {f['column'] for f in categorical_like}
features_to_scale = [col for col in numerical_features if col not in categorical_columns]

# Create scalers
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

# Work on a copy so the raw frame stays untouched for the other cells
data_scaled = data.copy()

# Scaler statistics are learned from the training rows only and then applied
# to every row, so nothing about the held-out test rows leaks into the fit.
train_rows = X_train.index

# Log-transform highly skewed features with large ranges.
# log1p(x) = log(1 + x) is already defined at x = 0, so zeros are passed
# through unchanged instead of being replaced by a small constant.
# NOTE(review): negative inputs would still yield NaN/-inf here - confirm
# these columns are non-negative in this dataset.
skewed_features = ['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 
                  'Flow IAT Mean', 'Flow IAT Max', 'Packet Length Variance']

for feature in skewed_features:
    data_scaled[feature] = np.log1p(data_scaled[feature])

# Robust (median/IQR based) scaling for features with outliers
outlier_features = ['Packet Length Max', 'Fwd Packet Length Max', 
                   'Bwd Packet Length Max', 'Flow IAT Std']

robust_scaler.fit(data_scaled.loc[train_rows, outlier_features])
data_scaled[outlier_features] = robust_scaler.transform(data_scaled[outlier_features])

# Standard scaling for the remaining numerical features
already_handled = set(skewed_features) | set(outlier_features)
remaining_features = [f for f in features_to_scale if f not in already_handled]

standard_scaler.fit(data_scaled.loc[train_rows, remaining_features])
data_scaled[remaining_features] = standard_scaler.transform(data_scaled[remaining_features])

# Count the columns that were actually modified above
transformed_features = already_handled | set(remaining_features)

# NOTE(review): the modelling cells visible in this notebook fit on
# X_train / X_test (the raw frame), not on data_scaled - confirm whether the
# scaled frame is meant to feed them.
print("Features scaled successfully")
print(f"Number of features transformed: {len(transformed_features)}")

In [None]:
# Train a classification model

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Random Forest; class_weight='balanced' reweights classes inversely to their
# frequency to counter the label imbalance, and the depth / leaf-size limits
# keep the individual trees from growing without bound.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = rf_model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Feature importance. The names are taken from the frame the model was
# fitted on, so they stay aligned with feature_importances_ even if the
# training matrix ever stops being identical to `data`.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Print top 10 most important features
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
#Confusion matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

import matplotlib.pyplot as plt

# Pin the row/column order to the classifier's own class list so the heatmap
# ticks and the per-class printout name real labels rather than positions.
# NOTE: y_pred must still hold the Random Forest predictions when this cell
# runs; the model-comparison cell further down reassigns the same name.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Heatmap with rows = true class, columns = predicted class
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)

# Add labels
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.show()

# Overall accuracy: correctly classified samples (diagonal) over all samples
accuracy = (cm.diagonal().sum() / cm.sum()) * 100
print(f"\nOverall Accuracy: {accuracy:.2f}%")

# Per-class accuracy: diagonal over the row total, i.e. the share of each
# true class that was predicted correctly (this is the per-class recall).
# Classes with no test samples are reported as NaN instead of dividing by 0.
row_totals = cm.sum(axis=1)
per_class_accuracy = np.divide(
    cm.diagonal(),
    row_totals,
    out=np.full(len(row_totals), np.nan),
    where=row_totals > 0,
) * 100
for label, acc in zip(class_labels, per_class_accuracy):
    print(f"Class {label} Accuracy: {acc:.2f}%")

In [None]:
# Calculate precision-recall curve for each class
from sklearn.metrics import precision_recall_curve, average_precision_score

import matplotlib.pyplot as plt

# Class probabilities for the test rows; column k belongs to rf_model.classes_[k]
y_scores = rf_model.predict_proba(X_test)

# One-vs-rest precision-recall curve per class
plt.figure(figsize=(10, 8))
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown']

# Iterate over the classes the model actually learned instead of assuming
# exactly ten classes labelled 0-9; this keeps each probability column
# paired with its own label and works for any number of classes.
for column_index, class_label in enumerate(rf_model.classes_):
    is_class = (y_test == class_label)
    class_scores = y_scores[:, column_index]

    precision, recall, _ = precision_recall_curve(is_class, class_scores)
    avg_precision = average_precision_score(is_class, class_scores)

    # Cycle through the palette if there are more classes than colours
    plt.plot(recall, precision, color=colors[column_index % len(colors)],
             label=f'Class {class_label} (AP = {avg_precision:.2f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Each Class')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Train and evaluate multiple models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from time import time

# Dictionary to store model results
model_results = {}

# List of models to try.
# KNN, Logistic Regression and LinearSVC are sensitive to feature magnitudes
# (distance computations / gradient-based solvers), so each is wrapped in a
# pipeline that standardises the features first. The scaler inside a pipeline
# is fitted on the training rows only when the pipeline is fitted. Tree
# splits do not depend on feature scale, so the Decision Tree gets raw input.
models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=20, class_weight='balanced', random_state=42),
    'KNN': make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    ),
    'Logistic Regression': make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42),
    ),
    'LinearSVC': make_pipeline(
        StandardScaler(),
        LinearSVC(max_iter=1000, class_weight='balanced', random_state=42),
    ),
}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Time the fit on its own so "Training time" does not include prediction
    start_time = time()
    model.fit(X_train, y_train)
    training_time = time() - start_time

    # Time the prediction separately
    predict_start = time()
    y_pred = model.predict(X_test)
    prediction_time = time() - predict_start

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    model_results[name] = {
        'accuracy': accuracy,
        'training_time': training_time,
        'prediction_time': prediction_time,
        'report': classification_report(y_test, y_pred)
    }

    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {model_results[name]['training_time']:.2f} seconds")
    print(f"Prediction time: {model_results[name]['prediction_time']:.2f} seconds")
    print("\nClassification Report:")
    print(model_results[name]['report'])


Training Decision Tree...
Decision Tree Results:
Accuracy: 0.9062
Training time: 7.45 seconds

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     71666
           1       0.25      0.87      0.39        77
           2       0.17      0.54      0.25        90
           3       0.38      0.43      0.40       894
           4       0.87      0.64      0.74      6190
           5       0.71      0.64      0.67      5923
           6       0.75      0.75      0.75       927
           7       0.82      0.67      0.73      3347
           8       0.22      0.49      0.30       420
           9       0.01      0.61      0.02        49

    accuracy                           0.91     89583
   macro avg       0.52      0.66      0.52     89583
weighted avg       0.95      0.91      0.93     89583


Training KNN...
KNN Results:
Accuracy: 0.9045
Training time: 27.00 seconds

Classification Report:
              precisio

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Results:
Accuracy: 0.4788
Training time: 121.50 seconds

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.51      0.68     71666
           1       0.02      0.99      0.04        77
           2       0.00      0.12      0.01        90
           3       0.09      0.03      0.04       894
           4       0.22      0.37      0.28      6190
           5       0.12      0.28      0.17      5923
           6       0.11      0.24      0.15       927
           7       0.11      0.52      0.19      3347
           8       0.11      0.32      0.16       420
           9       0.01      0.22      0.01        49

    accuracy                           0.48     89583
   macro avg       0.18      0.36      0.17     89583
weighted avg       0.82      0.48      0.58     89583


Training LinearSVC...
