In [26]:
# External Imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Notebook-wide constants

# Dataset files are read from DATA_DIR + <dataset name> + NAME_SUFFIX
DATA_DIR = "data/"
NAME_SUFFIX = ".csv"
DDOS_NAME, PHISHING_NAME, ANDROID_NAME = (
    "ddos_data",
    "phishing_data",
    "android_malware_data",
)

# Algorithm identifiers understood by train_model()
ALG_RF, ALG_LR = "RandomForest", "LogisticRegression"

# Keys of the per-dataset train/test split dictionaries
X_TRAIN, X_TEST = "x_train", "x_test"
Y_TRAIN, Y_TEST = "y_train", "y_test"

# Keys of the per-model evaluation dictionaries
ACCURACY = "accuracy"
CLASS_REPORT = "class_report"
CM = "confusion_matrix"

In [27]:
# Register every dataset name up front; each None is replaced by a DataFrame below
dataset_dictionary = dict.fromkeys((DDOS_NAME, PHISHING_NAME, ANDROID_NAME))

# Read each CSV; the file path is DATA_DIR followed by the dictionary key and NAME_SUFFIX
# NOTE(review): the saved output of this cell echoes the read_csv line, which looks
# like the tail of a truncated pandas warning (possibly a DtypeWarning) - confirm,
# and consider passing an explicit dtype= mapping if so.
for key in dataset_dictionary:
    dataset_dictionary[key] = pd.read_csv("".join((DATA_DIR, key, NAME_SUFFIX)))

  dataset_dictionary[key] = pd.read_csv(DATA_DIR + key + NAME_SUFFIX)


In [28]:
# Summarize every dataset: column names, non-null counts and dtypes
for key, dataset in dataset_dictionary.items():
    print(key)
    # DataFrame.info() prints its report itself and returns None, so it is called
    # directly; wrapping it in print() would append a stray "None" line.
    dataset.info()

ddos_data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip.src           151200 non-null  object
 1   ip.dst           151200 non-null  object
 2   tcp.srcport      151200 non-null  int64 
 3   tcp.dstport      151200 non-null  int64 
 4   ip.proto         151200 non-null  int64 
 5   frame.len        151200 non-null  int64 
 6   tcp.flags.syn    151200 non-null  int64 
 7   tcp.flags.reset  151200 non-null  int64 
 8   tcp.flags.push   151200 non-null  int64 
 9   tcp.flags.ack    151200 non-null  int64 
 10  ip.flags.mf      151200 non-null  int64 
 11  ip.flags.df      151200 non-null  int64 
 12  ip.flags.rb      151200 non-null  int64 
 13  tcp.seq          151200 non-null  int64 
 14  tcp.ack          151200 non-null  int64 
 15  frame.time       151200 non-null  object
 16  Packets          151200 non-null  int64 
 17  

In [29]:
# The phishing dataset names its target column 'CLASS_LABEL'; rename it to 'Label',
# the default target_column of preprocessing(), so the support functions work out of the box.
# rename() returns a new frame that is stored back in the dictionary instead of
# mutating the loaded frame with inplace=True; re-running the cell is a no-op.
dataset_dictionary[PHISHING_NAME] = dataset_dictionary[PHISHING_NAME].rename(
    columns={'CLASS_LABEL': 'Label'}
)

In [30]:
# Support functions

def preprocessing(data, target_column="Label"):
    """Clean, encode, scale and split a dataset into train and test sets.

    The input DataFrame is never modified. Rows whose target value is missing
    are discarded rather than given an imputed label. The imputers and the
    scaler are fitted on the training rows only and then applied to the test
    rows, so no statistic of the test set influences the training features.

    Args:
        data: pandas DataFrame holding the features and the target column.
        target_column: Name of the column to predict.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``. ``x_train`` and
        ``x_test`` are scaled NumPy arrays; ``y_train`` and ``y_test`` are
        pandas Series. An object/category target is label-encoded to integers;
        a numeric target keeps its original values and dtype.

    Raises:
        ValueError: If ``target_column`` is not a column of ``data``.
    """
    # Check if the target column exists
    if target_column not in data.columns:
        raise ValueError(f"The target column '{target_column}' does not exist in the dataset.")

    # Work on a copy so the caller's DataFrame is left untouched
    data = data.copy()

    # Drop columns that are never used as features *before* imputing/encoding
    # them: 'id', plus the IP addresses and timestamps of the ddos dataset.
    unused_cols = ['id']
    if 'ip.src' in data.columns:
        unused_cols += ['ip.src', 'ip.dst', 'frame.time']
    unused_cols = [col for col in unused_cols if col != target_column]
    data = data.drop(columns=unused_cols, errors='ignore')

    # A row without a label cannot be used for supervised learning; imputing
    # the label would invent ground truth, so such rows are removed instead.
    data = data.dropna(subset=[target_column])

    # Separate features and target
    x = data.drop(columns=[target_column])
    y = data[target_column]

    # Encode a non-numeric target as integer class ids
    target_is_categorical = not data[[target_column]].select_dtypes(
        include=['object', 'category']
    ).columns.empty
    if target_is_categorical:
        y = pd.Series(
            LabelEncoder().fit_transform(y.astype(str)),
            index=y.index,
            name=target_column,
        )

    # Identify categorical and numerical feature columns
    numerical_cols = x.select_dtypes(include='number').columns
    categorical_cols = x.select_dtypes(include=['object', 'category']).columns

    # Split first, so every fitted transformation below only sees training rows
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    # Explicit copies: the column assignments below must not touch `x`
    x_train, x_test = x_train.copy(), x_test.copy()

    # Numerical features: impute missing values with the training mean
    if not numerical_cols.empty:
        imputer_num = SimpleImputer(strategy='mean')
        x_train[numerical_cols] = imputer_num.fit_transform(x_train[numerical_cols])
        x_test[numerical_cols] = imputer_num.transform(x_test[numerical_cols])

    # Categorical features: impute with the most frequent training value
    if not categorical_cols.empty:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        x_train[categorical_cols] = imputer_cat.fit_transform(x_train[categorical_cols])
        x_test[categorical_cols] = imputer_cat.transform(x_test[categorical_cols])

    # Integer-encode categorical features. The code table is built from the
    # values of both splits: it is a pure relabelling (no statistic is learned)
    # and this avoids "unseen label" errors when transforming the test rows.
    for col in categorical_cols:
        train_values = x_train[col].astype(str)
        test_values = x_test[col].astype(str)
        encoder = LabelEncoder().fit(pd.concat([train_values, test_values]))
        x_train[col] = encoder.transform(train_values)
        x_test[col] = encoder.transform(test_values)

    # Normalize features with the mean/variance of the training rows
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    print("Data preprocessing completed successfully.")

    return x_train, x_test, y_train, y_test

def train_model(algorithm, x_train, y_train):
    """Fit a classifier of the requested kind on the training data.

    Args:
        algorithm: ALG_RF for a RandomForestClassifier or ALG_LR for a
            LogisticRegression model.
        x_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted scikit-learn estimator.

    Raises:
        ValueError: If ``algorithm`` is neither ALG_RF nor ALG_LR.
    """
    if algorithm == ALG_RF:
        # Fixed seed so repeated runs build the same forest
        model = RandomForestClassifier(random_state=42)
    elif algorithm == ALG_LR:
        model = LogisticRegression(max_iter=1000)
    else:
        # Without this branch `model` would be unbound below and the caller
        # would get an UnboundLocalError that does not name the real problem.
        raise ValueError(
            f"Unsupported algorithm '{algorithm}'. Expected '{ALG_RF}' or '{ALG_LR}'."
        )

    model.fit(x_train, y_train)

    return model

def evaluate_model(model, x_test, y_test):
    """Score a fitted model on the test split and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Test feature matrix.
        y_test: True labels of the test rows.

    Returns:
        A tuple ``(accuracy, class_report, cm)``: the accuracy score, the text
        classification report and the confusion matrix.
    """
    predictions = model.predict(x_test)

    # Compute all three metrics from the same set of predictions
    metrics = (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )
    score, report, matrix = metrics

    # Header, accuracy, report and the confusion-matrix caption: one per line
    print(
        "Model Performance:",
        f"Accuracy: {score:.4f}",
        report,
        "Explicitly show TP, TN, FP, FN metrics",
        sep="\n",
    )
    print(matrix, "\n")

    return metrics

In [31]:
# One entry per dataset; each None is replaced by that dataset's train/test split below
preprocessed_data_dictionary = dict.fromkeys((DDOS_NAME, PHISHING_NAME, ANDROID_NAME))

# Run the preprocessing helper on every loaded dataset
for key in preprocessed_data_dictionary:
    # preprocessing() returns the four parts in this order
    train_x, test_x, train_y, test_y = preprocessing(dataset_dictionary[key])

    # Label the parts with the shared split keys and store them per dataset
    pp_data = {X_TRAIN: train_x, X_TEST: test_x, Y_TRAIN: train_y, Y_TEST: test_y}
    preprocessed_data_dictionary[key] = pp_data

Data preprocessing completed successfully.
Data preprocessing completed successfully.
Data preprocessing completed successfully.


In [32]:
# One entry per dataset; each None is replaced by that dataset's trained models below
models_data_dictionary = dict.fromkeys((DDOS_NAME, PHISHING_NAME, ANDROID_NAME))

# Train every supported algorithm on every preprocessed dataset.
# A single inner loop replaces the former copy-pasted stanza per algorithm,
# so adding an algorithm only means extending the tuple below.
for key, splits in preprocessed_data_dictionary.items():
    # Trained models of this dataset, keyed by algorithm name
    models = {}

    for algorithm in (ALG_RF, ALG_LR):
        print(f'Training {algorithm} model on {key} dataset...')
        models[algorithm] = train_model(algorithm, splits[X_TRAIN], splits[Y_TRAIN])
        print('Training done!')

    # Save the trained models in the dictionary
    models_data_dictionary[key] = models

Training RandomForest model on ddos_data dataset...
Training done!
Training LogisticRegression model on ddos_data dataset...
Training done!
Training RandomForest model on phishing_data dataset...
Training done!
Training LogisticRegression model on phishing_data dataset...
Training done!
Training RandomForest model on android_malware_data dataset...
Training done!
Training LogisticRegression model on android_malware_data dataset...
Training done!


In [37]:
# Helper dictionary to hold evaluations: dataset name -> algorithm name -> metrics
model_evaluation_dictionary = {
    DDOS_NAME: {},
    PHISHING_NAME: {},
    ANDROID_NAME: {}
}

# Evaluate every trained model on the test split of the dataset it was trained on.
# Iterating over the stored models replaces the former copy-pasted stanza per algorithm.
for key, trained_models in models_data_dictionary.items():
    splits = preprocessed_data_dictionary[key]

    for algorithm, model in trained_models.items():
        print(f'Evaluating {algorithm} model trained on {key} dataset...')
        accuracy, class_report, cm = evaluate_model(model, splits[X_TEST], splits[Y_TEST])

        # Keep the metrics so later cells can use them without re-evaluating
        model_evaluation_dictionary[key][algorithm] = {
            ACCURACY: accuracy,
            CLASS_REPORT: class_report,
            CM: cm,
        }

        print('Evaluation done!\n')

Evaluating RandomForest model trained on ddos_data dataset...
Model Performance:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15169
           1       1.00      1.00      1.00      7608
           2       1.00      1.00      1.00      7463

    accuracy                           1.00     30240
   macro avg       1.00      1.00      1.00     30240
weighted avg       1.00      1.00      1.00     30240

Explicitly show TP, TN, FP, FN metrics
[[15169     0     0]
 [    0  7608     0]
 [    0     0  7463]] 

Evaluation done!

Evaluating LogisticRegression model trained on ddos_data dataset...
Model Performance:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15169
           1       1.00      1.00      1.00      7608
           2       1.00      1.00      1.00      7463

    accuracy                           1.00     30240
   macro avg       1.0

In [None]:
# Show graphics
# TODO: Plot the evaluation results.
# Iterate over model_evaluation_dictionary and plot the stored metrics for each dataset/model pair.