In [2]:
import pandas as pd

# Read the Label.csv file
labels = pd.read_csv('../data/CICD/Label.csv')
# Read the Data.csv file
data = pd.read_csv('../data/CICD/Data.csv')




In [3]:
# Check for missing values in data
missing_values = data.isnull().sum()

# Display features with missing values (if any)
print("Features with missing values:")
print(missing_values[missing_values > 0])

# Calculate percentage of missing values
missing_percentage = (data.isnull().sum() / len(data)) * 100

# Display features with missing values percentage
print("\nPercentage of missing values:")
print(missing_percentage[missing_percentage > 0])

# Get total number of missing values
total_missing = data.isnull().sum().sum()
print(f"\nTotal number of missing values: {total_missing}")

Features with missing values:
Series([], dtype: int64)

Percentage of missing values:
Series([], dtype: float64)

Total number of missing values: 0


In [4]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical-like features based on the number of unique values
categorical_like = []
for column in data.columns:
    unique_values = data[column].unique()
    num_unique = len(unique_values)
    if num_unique < 15:  # Adjust threshold as needed
        categorical_like.append({'column': column, 'unique_values': unique_values, 'count': num_unique})

print("Potential categorical features:")
print(categorical_like)

# One-Hot Encode the identified categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # handle_unknown to avoid errors

# Extract column names from categorical_like
categorical_columns = [item['column'] for item in categorical_like]

# Fit and transform the selected columns
encoder.fit(data[categorical_columns])
encoded_data = encoder.transform(data[categorical_columns])

# Create a new DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the encoded DataFrame with the original DataFrame
data = pd.concat([data, encoded_df], axis=1)

# Remove original categorical columns
data = data.drop(columns=categorical_columns)

print("\nDataFrame after one-hot encoding:")
print(data.head())

Potential categorical features:
[{'column': 'Fwd PSH Flags', 'unique_values': array([0, 1]), 'count': 2}, {'column': 'Bwd PSH Flags', 'unique_values': array([0]), 'count': 1}, {'column': 'Fwd URG Flags', 'unique_values': array([0]), 'count': 1}, {'column': 'Bwd URG Flags', 'unique_values': array([0]), 'count': 1}, {'column': 'FIN Flag Count', 'unique_values': array([2, 1, 0, 3]), 'count': 4}, {'column': 'SYN Flag Count', 'unique_values': array([ 4,  0,  2,  1,  3,  8,  6, 10, 12]), 'count': 9}, {'column': 'RST Flag Count', 'unique_values': array([0, 1]), 'count': 2}, {'column': 'URG Flag Count', 'unique_values': array([0]), 'count': 1}, {'column': 'CWR Flag Count', 'unique_values': array([0]), 'count': 1}, {'column': 'ECE Flag Count', 'unique_values': array([0]), 'count': 1}, {'column': 'Down/Up Ratio', 'unique_values': array([  2.,   0.,   1.,   4.,   3.,   5.,   6., 398., 400.]), 'count': 9}, {'column': 'Fwd Bytes/Bulk Avg', 'unique_values': array([0]), 'count': 1}, {'column': 'Fwd P

In [5]:
import numpy as np

# Define a function to identify outliers based on IQR
def find_outliers_iqr(data, threshold=1.5):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

# Identify numerical features
numerical_features = data.select_dtypes(include=np.number).columns.tolist()

# Calculate outlier counts for each numerical feature
outlier_counts = {}
for feature in numerical_features:
    outliers = find_outliers_iqr(data[feature])
    outlier_counts[feature] = len(outliers)

# Convert the dictionary to a pandas Series for easier sorting
outlier_counts_series = pd.Series(outlier_counts)

# Sort the Series in descending order
outlier_counts_series = outlier_counts_series.sort_values(ascending=False)

# Display the top features with the most outliers
print("Top features with the most outliers:")
print(outlier_counts_series.head(10))

# Define a threshold for outlier count to remove features
outlier_threshold = 0.05 * len(data)  # e.g., remove if more than 5% are outliers

# Identify features to remove
features_to_remove = outlier_counts_series[outlier_counts_series > outlier_threshold].index.tolist()

print("\nFeatures to remove due to excessive outliers:")
print(features_to_remove)

# Remove the identified features from the DataFrame
data = data.drop(columns=features_to_remove)

print("\nDataFrame after removing features with excessive outliers:")
print(data.head())

Top features with the most outliers:
Total Length of Bwd Packet    100596
Bwd Packet Length Mean         97665
Bwd Segment Size Avg           97665
Bwd IAT Std                    93985
Bwd IAT Max                    93353
Fwd Packet Length Std          92847
Bwd IAT Mean                   92249
Fwd Seg Size Min_20            85505
Total Bwd packets              84920
Fwd IAT Total                  84830
dtype: int64

Features to remove due to excessive outliers:
['Total Length of Bwd Packet', 'Bwd Packet Length Mean', 'Bwd Segment Size Avg', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd Packet Length Std', 'Bwd IAT Mean', 'Fwd Seg Size Min_20', 'Total Bwd packets', 'Fwd IAT Total', 'Bwd IAT Total', 'PSH Flag Count', 'Flow Duration', 'Bwd Header Length', 'ACK Flag Count', 'Bwd Packet Length Std', 'Fwd Header Length', 'Fwd IAT Max', 'Flow IAT Max', 'Flow Bytes/s', 'Total Fwd Packet', 'Bwd Bytes/Bulk Avg', 'FIN Flag Count_0', 'Fwd IAT Mean', 'Fwd Packet Length Min', 'Bwd Bulk Rate Avg', 'Packet Leng

In [6]:
from scipy.stats import skew

# Calculate skewness for each numerical feature
skewness = data.apply(lambda x: skew(x))

# Display skewness for each feature
print("Skewness for each feature:")
print(skewness)

# Identify highly skewed features (e.g., skewness > 5)
highly_skewed = skewness[abs(skewness) > 5]
print("\nHighly skewed features:")
print(highly_skewed)

# Apply log transformation to highly skewed features
for feature in highly_skewed.index:
    data[feature] = np.log1p(data[feature])

print("\nDataFrame after log transformation:")
print(data.head())

# Display skewness after transformation
skewness_after_log = data.apply(lambda x: skew(x))

# Display skewness for each feature
print("\nSkewness after log transformation:")
print(skewness_after_log[highly_skewed.index])

  skewness = data.apply(lambda x: skew(x))


Skewness for each feature:
Bwd Packet Length Max      1.150831
Flow IAT Min             302.925627
Bwd IAT Min              575.966283
Fwd Packets/s             13.617951
Bwd Packets/s              1.802532
Packet Length Max          0.749315
Packet Length Mean         2.075151
Packet Length Std          1.013695
Average Packet Size        2.347307
Subflow Fwd Packets      101.369531
Subflow Fwd Bytes        123.818656
Subflow Bwd Packets       80.615522
Subflow Bwd Bytes         87.507410
Active Mean               82.309694
Active Std                85.949141
Active Max                59.072627
Active Min               111.350921
Idle Mean                 34.374054
Idle Std                 217.291315
Idle Max                  33.604444
Idle Min                  34.894681
Fwd PSH Flags_0          -38.345864
Fwd PSH Flags_1           38.345864
Bwd PSH Flags_0                 NaN
Fwd URG Flags_0                 NaN
Bwd URG Flags_0                 NaN
FIN Flag Count_1          -1.095284
F

  skewness_after_log = data.apply(lambda x: skew(x))


In [7]:
from sklearn.preprocessing import StandardScaler

# Get numerical features that haven't been one-hot encoded or removed
numerical_cols = data.select_dtypes(include=['float64']).columns.tolist()

# Create StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical features
scaled_data = scaler.fit_transform(data[numerical_cols])

# Convert back to DataFrame with proper column names
scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols)

# Print sample of scaled data
print("Sample of scaled features:")
print(scaled_df.head())

# Display basic statistics of scaled data
print("\nScaled data statistics:")
print(scaled_df.describe().round(3))

Sample of scaled features:
   Bwd Packet Length Max  Flow IAT Min  Bwd IAT Min  Fwd Packets/s  \
0               1.970087     -0.117054    -0.191225      -1.074074   
1              -0.656720      2.261261     2.976213      -1.968879   
2              -0.656720     -0.117054     1.818932      -0.835661   
3              -0.656720      2.261261     2.256976      -1.130665   
4              -0.656720      2.261261     2.380707      -1.127333   

   Bwd Packets/s  Packet Length Max  Packet Length Mean  Packet Length Std  \
0      -0.813415           1.656481            3.095354           2.092032   
1      -0.843245          -0.473198           -0.378224          -0.353715   
2      -0.836588           1.656481            1.864967           1.981271   
3      -0.838018          -0.143366           -0.289165          -0.066181   
4      -0.837970          -0.453107           -0.461203          -0.377610   

   Average Packet Size  Subflow Fwd Packets  ...  Down/Up Ratio_398.0  \
0         

In [8]:
# Apply the scaled data to the original data
data[numerical_cols] = scaled_df[numerical_cols]

# Display the first few rows of the updated DataFrame
print("DataFrame after scaling:")
print(data.head())

DataFrame after scaling:
   Bwd Packet Length Max  Flow IAT Min  Bwd IAT Min  Fwd Packets/s  \
0               1.970087     -0.117054    -0.191225      -1.074074   
1              -0.656720      2.261261     2.976213      -1.968879   
2              -0.656720     -0.117054     1.818932      -0.835661   
3              -0.656720      2.261261     2.256976      -1.130665   
4              -0.656720      2.261261     2.380707      -1.127333   

   Bwd Packets/s  Packet Length Max  Packet Length Mean  Packet Length Std  \
0      -0.813415           1.656481            3.095354           2.092032   
1      -0.843245          -0.473198           -0.378224          -0.353715   
2      -0.836588           1.656481            1.864967           1.981271   
3      -0.838018          -0.143366           -0.289165          -0.066181   
4      -0.837970          -0.453107           -0.461203          -0.377610   

   Average Packet Size  Subflow Fwd Packets  ...  Down/Up Ratio_398.0  \
0           

In [9]:
from sklearn.preprocessing import RobustScaler

# Re-identify numerical features after previous transformations
numerical_cols = data.select_dtypes(include=['float64']).columns.tolist()

# Fit RobustScaler
robust_scaler = RobustScaler()

# Fit and transform the numerical features
robust_scaled_data = robust_scaler.fit_transform(data[numerical_cols])

# Convert back to DataFrame with proper column names
robust_scaled_df = pd.DataFrame(robust_scaled_data, columns=numerical_cols)

# Print sample of scaled data
print("Sample of Robust Scaled features:")
print(robust_scaled_df.head())

# Display basic statistics of Robust Scaled data
print("\nRobust Scaled data statistics:")
print(robust_scaled_df.describe().round(3))

Sample of Robust Scaled features:
   Bwd Packet Length Max  Flow IAT Min  Bwd IAT Min  Fwd Packets/s  \
0               1.931601      0.000000     0.000000      -0.962511   
1              -0.065663      1.512942     2.070435      -1.520710   
2              -0.065663      0.000000     1.313964      -0.813784   
3              -0.065663      1.512942     1.600297      -0.997814   
4              -0.065663      1.512942     1.681175      -0.995735   

   Bwd Packets/s  Packet Length Max  Packet Length Mean  Packet Length Std  \
0      -0.312227           1.076154            3.501396           1.623147   
1      -0.329591           0.097692            0.061907           0.190693   
2      -0.325716           1.076154            2.283083           1.558276   
3      -0.326549           0.249231            0.150092           0.359100   
4      -0.326521           0.106923           -0.020258           0.176698   

   Average Packet Size  Subflow Fwd Packets  ...  Down/Up Ratio_398.0  \
0  

In [10]:
# Apply the robust scaled data to the original data
data[numerical_cols] = robust_scaled_df[numerical_cols]

# Display the first few rows of the updated DataFrame
print("DataFrame after Robust Scaling:")
print(data.head())

DataFrame after Robust Scaling:
   Bwd Packet Length Max  Flow IAT Min  Bwd IAT Min  Fwd Packets/s  \
0               1.931601      0.000000     0.000000      -0.962511   
1              -0.065663      1.512942     2.070435      -1.520710   
2              -0.065663      0.000000     1.313964      -0.813784   
3              -0.065663      1.512942     1.600297      -0.997814   
4              -0.065663      1.512942     1.681175      -0.995735   

   Bwd Packets/s  Packet Length Max  Packet Length Mean  Packet Length Std  \
0      -0.312227           1.076154            3.501396           1.623147   
1      -0.329591           0.097692            0.061907           0.190693   
2      -0.325716           1.076154            2.283083           1.558276   
3      -0.326549           0.249231            0.150092           0.359100   
4      -0.326521           0.106923           -0.020258           0.176698   

   Average Packet Size  Subflow Fwd Packets  ...  Down/Up Ratio_398.0  \
0    

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create X (features) and y (target)
X = data
y = labels['Label']

# Create train/test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Create and train Random Forest with class weights to handle imbalance
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': data.columns,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     71666
           1       0.24      0.90      0.37        77
           2       0.48      0.52      0.50        90
           3       0.41      0.25      0.31       894
           4       0.80      0.68      0.73      6190
           5       0.59      0.87      0.70      5923
           6       0.66      0.57      0.61       927
           7       0.77      0.69      0.73      3347
           8       0.26      0.45      0.32       420
           9       0.08      0.29      0.12        49

    accuracy                           0.92     89583
   macro avg       0.53      0.62      0.54     89583
weighted avg       0.94      0.92      0.93     89583


Top 10 Most Important Features:
                  feature  importance
5       Packet Length Max    0.129096
6      Packet Length Mean    0.127660
8     Average Packet Size    0.127537
4           Bwd Packets/s    0.11

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd

# Create X (features) and y (target)
X = data
y = labels['Label']


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}

# Grid search with cross-validation
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='f1_weighted', verbose=1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best model
best_rf = grid_search.best_estimator_

# Make predictions
y_pred = best_rf.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     71666
           1       0.26      0.79      0.39        77
           2       0.08      0.49      0.14        90
           3       0.26      0.36      0.30       894
           4       0.78      0.63      0.70      6190
           5       0.66      0.67      0.67      5923
           6       0.55      0.60      0.57       927
           7       0.69      0.70      0.69      3347
           8       0.15      0.42      0.22       420
           9       0.02      0.35      0.03        49

    accuracy                           0.91     89583
   macro avg       0.44      0.60      0.47     89583
weighted avg       0.93      0.91      0.92     89583

Best Parameters: {'class_weight': 'balanced', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


: 