In [3]:
import pandas as pd

# Read the Label.csv file
labels = pd.read_csv('../data/CICD/Label.csv')
# Read the Data.csv file
data = pd.read_csv('../data/CICD/Data.csv')


In [4]:
#Train/test split
from sklearn.model_selection import train_test_split

# Create X (features) and y (target)
X = data
y = labels['Label']

# Create train/test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the shapes of the resulting splits
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)

Training set shape: (358332, 76)
Testing set shape: (89583, 76)
Training labels shape: (358332,)
Testing labels shape: (89583,)


In [5]:
# Check for potential categorical features
categorical_like = []
numerical_features = []

for column in data.columns:
    # Check number of unique values
    n_unique = data[column].nunique()
    
    # If number of unique values is small (less than 10), it might be categorical
    if n_unique < 10:
        categorical_like.append({
            'column': column,
            'unique_values': sorted(data[column].unique()),
            'count': n_unique
        })
    else:
        numerical_features.append(column)

# Print potential categorical features
print("Potential categorical features:")
for feat in categorical_like:
    print(f"\n{feat['column']}:")
    print(f"Number of unique values: {feat['count']}")
    print(f"Unique values: {feat['unique_values']}")

print(f"\nNumber of numerical features: {len(numerical_features)}")

Potential categorical features:

Fwd PSH Flags:
Number of unique values: 2
Unique values: [np.int64(0), np.int64(1)]

Bwd PSH Flags:
Number of unique values: 1
Unique values: [np.int64(0)]

Fwd URG Flags:
Number of unique values: 1
Unique values: [np.int64(0)]

Bwd URG Flags:
Number of unique values: 1
Unique values: [np.int64(0)]

FIN Flag Count:
Number of unique values: 4
Unique values: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]

SYN Flag Count:
Number of unique values: 9
Unique values: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(6), np.int64(8), np.int64(10), np.int64(12)]

RST Flag Count:
Number of unique values: 2
Unique values: [np.int64(0), np.int64(1)]

URG Flag Count:
Number of unique values: 1
Unique values: [np.int64(0)]

CWR Flag Count:
Number of unique values: 1
Unique values: [np.int64(0)]

ECE Flag Count:
Number of unique values: 1
Unique values: [np.int64(0)]

Down/Up Ratio:
Number of unique values: 9
Unique values: [np.float64(0

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical-like features
categorical_features = [f['column'] for f in categorical_like]

# Create OneHotEncoder object
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit encoder on training data and transform both training and test data
encoder.fit(X_train[categorical_features])

X_train_encoded = encoder.transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# Get feature names after encoding
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Convert encoded data to dataframes
X_train_encoded = pd.DataFrame(X_train_encoded, index=X_train.index, columns=encoded_feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded, index=X_test.index, columns=encoded_feature_names)

# Concatenate encoded features with original dataframes
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# Remove original categorical features
X_train = X_train.drop(categorical_features, axis=1)
X_test = X_test.drop(categorical_features, axis=1)

print("Categorical features encoded successfully")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Categorical features encoded successfully
Training set shape: (358332, 103)
Testing set shape: (89583, 103)


In [7]:
from sklearn.preprocessing import StandardScaler, RobustScaler
import numpy as np

# Create copy of data to avoid modifying original
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Separate numerical features that need scaling
# Exclude categorical-like features and flags
features_to_scale = [col for col in X_train.columns if col not in [f['column'] for f in categorical_like]]

# Create scalers
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

# Apply log transformation to highly skewed features with large ranges
skewed_features = ['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 
                  'Flow IAT Mean', 'Flow IAT Max', 'Packet Length Variance']

for feature in skewed_features:
    # Add small constant to handle zeros
    X_train_scaled[feature] = np.log1p(X_train_scaled[feature].replace(0, 1e-6))
    X_test_scaled[feature] = np.log1p(X_test_scaled[feature].replace(0, 1e-6))

# Apply robust scaling to features with outliers
outlier_features = ['Packet Length Max', 'Fwd Packet Length Max', 
                   'Bwd Packet Length Max', 'Flow IAT Std']

# Fit on training data and transform both training and test data
X_train_scaled[outlier_features] = robust_scaler.fit_transform(X_train_scaled[outlier_features])
X_test_scaled[outlier_features] = robust_scaler.transform(X_test_scaled[outlier_features])

# Apply standard scaling to remaining numerical features
remaining_features = [f for f in features_to_scale 
                     if f not in skewed_features + outlier_features]

# Fit on training data and transform both training and test data
X_train_scaled[remaining_features] = standard_scaler.fit_transform(X_train_scaled[remaining_features])
X_test_scaled[remaining_features] = standard_scaler.transform(X_test_scaled[remaining_features])

print("Features scaled successfully")
print(f"Number of features transformed: {len(features_to_scale)}")

Features scaled successfully
Number of features transformed: 103


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from time import time


# Expand models and parameter grids to include more classifiers

# Update models dictionary with new classifiers
models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=20, class_weight='balanced', random_state=42),
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Linear SVC': LinearSVC(random_state=42, max_iter=2000)
}

# Update parameter grids for all models
param_grids = {
    'Decision Tree': {
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced']
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    },
    'Logistic Regression': {
        'C': [0.1, 1.0, 10.0],
        'solver': ['lbfgs', 'liblinear'],
        'class_weight': ['balanced']
    },
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced']
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    },
    'Linear SVC': {
        'C': [0.1, 1.0, 10.0],
        'class_weight': ['balanced'],
        'dual': [False]
    }
}

# Dictionary to store best models
best_models = {}

# Perform GridSearch for each model
for name, model in models.items():
    print(f"\nPerforming GridSearch for {name}...")
    start_time = time()
    
    # Create GridSearch object
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
        verbose=1
    )
    
    # Fit GridSearch
    grid_search.fit(X_train_scaled, y_train)
    
    # Make predictions with best model
    y_pred = grid_search.predict(X_test_scaled)
    
    # Store results
    best_models[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'test_accuracy': accuracy_score(y_test, y_pred),
        'training_time': time() - start_time,
        'report': classification_report(y_test, y_pred)
    }
    
    print(f"\n{name} Best Results:")
    print(f"Best parameters: {best_models[name]['best_params']}")
    print(f"Best cross-validation score: {best_models[name]['best_score']:.4f}")
    print(f"Test accuracy: {best_models[name]['test_accuracy']:.4f}")
    print(f"Training time: {best_models[name]['training_time']:.2f} seconds")
    print("\nClassification Report:")
    print(best_models[name]['report'])


Performing GridSearch for Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
