# Loading the CICDDoS2019 dataset

In [None]:
import glob
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from joblib import dump

In [None]:
dfs = []
data_dir = 'data'
# Sort the file list: glob order depends on the filesystem, and the row order
# of the combined frame determines what the seeded train/test split selects.
csv_files = sorted(glob.glob(os.path.join(data_dir, '**', '*.csv'), recursive=True))

# Cache of the merged dataset; if it already exists it is loaded instead of
# re-reading every raw CSV.
combined_csv = os.path.join(data_dir, 'combined_data.csv')

if os.path.exists(combined_csv):
    print(f"Loading existing {combined_csv}")
    # low_memory=False matches how the raw files are read below and avoids
    # chunk-wise dtype guessing on columns with mixed content.
    combined_df = pd.read_csv(combined_csv, low_memory=False)
else:
    print(f"Creating {combined_csv}")
    if not csv_files:
        # Fail with an actionable message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(
            f"No CSV files found under '{data_dir}'; cannot build {combined_csv}"
        )

    for data_file in csv_files:
        print("Loading dataset:", data_file)
        df = pd.read_csv(data_file, low_memory=False)
        # Strip surrounding whitespace from the header names so columns can
        # be addressed by their plain names.
        df.columns = df.columns.str.strip()

        # Encode labels as binary: 1 for 'BENIGN', 0 for every other value.
        df['Label'] = (df['Label'] == 'BENIGN').astype('int64')
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(combined_csv, index=False)


In [None]:
# Encode the IP address columns as integer codes. Each column gets its own
# encoder so that both fitted mappings stay available after this cell.
ip_encoders = {}
for ip_column in ('Source IP', 'Destination IP'):
    ip_encoder = LabelEncoder()
    combined_df[ip_column] = ip_encoder.fit_transform(combined_df[ip_column])
    ip_encoders[ip_column] = ip_encoder
# Backward compatibility: the single shared encoder used previously ended up
# fitted on 'Destination IP'.
label_encoder = ip_encoders['Destination IP']

# Split the data into features and target
X = combined_df.drop(columns=['Label'])
y = combined_df['Label']

# Mean imputation is only defined for numeric data, so any remaining
# non-numeric columns are removed before imputing.
X = X.select_dtypes(include=[np.number])

# Replace infinite values with NaN so they are imputed as well
X = X.replace([np.inf, -np.inf], np.nan)

# SimpleImputer discards columns without a single observed value, which would
# leave the imputed array narrower than X.columns; drop them explicitly first.
X = X.dropna(axis=1, how='all')

# Impute NaN values with the mean of each feature
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Features used in FlowGuard

selected_features = [
    'Source IP', 'Source Port', 'Destination IP', 'Destination Port',
    'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
    'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
    'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
    'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
    'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
    'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
    'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
    'Bwd Header Length',
]

X = X[selected_features]

# The train/test split was taken before this selection, so restrict the
# existing splits too; otherwise models fitted on X_train would still see
# every column rather than the subset chosen here.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
def classify(predictions, threshold=0.5):
    """Convert continuous scores into binary labels.

    Args:
        predictions: Iterable of numeric scores.
        threshold: Cut-off; scores greater than or equal to it map to 1.

    Returns:
        A list with one 0/1 label per score, in input order.
    """
    labels = []
    for score in predictions:
        if score >= threshold:
            labels.append(1)
        else:
            labels.append(0)
    return labels

def evaluate_model(model, X_test, y_test):
    """Print accuracy and a classification report for a score-producing model.

    The output of ``model.predict(X_test)`` is turned into 0/1 labels with
    ``classify`` (using its default threshold) before being compared against
    ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.
    """
    predicted_labels = classify(model.predict(X_test))

    print("Accuracy:", accuracy_score(y_test, predicted_labels))
    print(classification_report(y_test, predicted_labels))

def evaluate_classifier(classifier, X_test, y_test):
    """Print accuracy and a classification report for a fitted classifier.

    Unlike ``evaluate_model``, the predictions are used as-is, without any
    thresholding.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.
    """
    predicted = classifier.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, predicted))
    print(classification_report(y_test, predicted))

# Train each baseline model and report its performance on the held-out split.
# NOTE: scikit-learn's DecisionTreeClassifier splits on Gini impurity by
# default; pass criterion="entropy" for ID3-style information-gain splits.
classifiers = {
    "ID3 Classifier": DecisionTreeClassifier(),
    "Naive Bayes Classifier": GaussianNB(),
    # "Random Forest Classifier": RandomForestClassifier(),
    "Linear Regression": LinearRegression(),
}

for name, classifier in classifiers.items():
    print("Training", name, "...")
    classifier.fit(X_train, y_train)
    # Evaluate the model fitted in this iteration. evaluate_model thresholds
    # the predictions at 0.5, which turns LinearRegression's continuous output
    # into labels and leaves the 0/1 output of the real classifiers unchanged.
    evaluate_model(classifier, X_test, y_test)
    print('-----------------------------------\n')

    # Save the model
    # dump(classifier, f'{name}_combined_model.joblib')
    # print("Model saved.")

In [None]:


# Assuming you have your dataset loaded into a DataFrame called 'data'
# # Separate features and target variable
# X = data.drop(columns=['target_column'])
# y = data['target_column']

# Fit a random forest on the full feature matrix to rank the features
rf = RandomForestClassifier().fit(X, y)

# Importance score of every feature, in the column order of X
importances = rf.feature_importances_
feature_names = X.columns

# Column positions ordered from most to least important, and the matching names
indices = np.argsort(importances)[::-1]
sorted_feature_names = [feature_names[position] for position in indices]

# Bar chart of the importances, highest first
n_features = X.shape[1]
bar_positions = range(n_features)

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()

# Applying ML on the dataset

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import pandas as pd

# # Assuming you have your data loaded into a pandas DataFrame called 'data'
# # with the selected features and labels
# selected_features = [
#     'Source IP', 'Source Port', 'Destination IP', 'Destination Port',
#     'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
#     'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
#     'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
#     'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min',
#     'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
#     'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
#     'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
#     'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
#     'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
#     'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
#     'Bwd Header Length'
# ]

# # Splitting data into features and labels
# X = data[selected_features].values
# y = data['Label'].values

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X and y are pandas objects at this point; a DataFrame has no reshape(), so
# convert the splits to plain NumPy arrays before shaping them for Keras.
X_train = np.asarray(X_train, dtype='float32')
X_test = np.asarray(X_test, dtype='float32')
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Reshape features to the 3D LSTM input (samples, timesteps, features),
# treating every row as a sequence with a single timestep.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Building the LSTM model: one LSTM layer, dropout, and a sigmoid output unit
# for the binary label.
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent hold-out estimate.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
