In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
def string2numeric_hash(text):
    """Map a string to a deterministic 32-bit integer derived from its MD5 digest.

    Parameters
    ----------
    text : str or bytes-like
        Value to hash. ``str`` input is encoded as UTF-8 first, because
        ``hashlib.md5`` only accepts bytes-like objects on Python 3.

    Returns
    -------
    int
        The first 8 hex digits of the MD5 digest interpreted as an integer
        (range 0 .. 2**32 - 1).

    Notes
    -----
    MD5 is used here only as a stable bucketing hash, not for security.
    """
    import hashlib

    if isinstance(text, str):
        text = text.encode("utf-8")
    return int(hashlib.md5(text).hexdigest()[:8], 16)

In [3]:
def data_cleaning(data):
    """Sanitise the flow-rate columns and integer-encode the label column.

    Steps performed:
      * every ``'Infinity'`` string in the frame is replaced by ``'0'`` and
        every float ``+inf`` / ``-inf`` by ``0``;
      * ``' Flow Packets/s'`` is converted to a numeric dtype;
      * missing values in ``'Flow Bytes/s'`` are filled with 0 and the column
        is converted to a numeric dtype;
      * ``' Label'`` is replaced by the integer codes produced by
        ``LabelEncoder`` (classes are numbered in sorted order).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``' Flow Packets/s'``,
        ``'Flow Bytes/s'`` and ``' Label'`` (the leading spaces are part of
        the column names).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. All edits are applied to the new frame returned by
        ``DataFrame.replace``, not to the object passed in.
    """
    # Flow Packets/s and Flow Bytes/s: replace infinity by 0.
    data = data.replace('Infinity', '0')
    # Cover both signs; a leftover -inf would make downstream model fitting fail.
    data = data.replace([np.inf, -np.inf], 0)
    data[' Flow Packets/s'] = pd.to_numeric(data[' Flow Packets/s'])

    data['Flow Bytes/s'] = pd.to_numeric(data['Flow Bytes/s'].fillna(0))

    # Encode the class names as integers; only the encoded column is kept.
    data[' Label'] = LabelEncoder().fit_transform(data[' Label'])

    return data

In [4]:
def feature_selection(data, n_features=50):
    """Keep the most important features according to a random forest.

    A ``RandomForestClassifier`` (100 trees, ``random_state=42``) is fitted on
    all rows of ``data`` and the features are ranked by
    ``feature_importances_``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is the target and whose other columns are
        the candidate features. The target is also read by name as
        ``' Label'`` when building the result.
    n_features : int, default 50
        Maximum number of top-ranked features to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected feature columns in descending order of
        importance, followed by ``' Label'``.

    Notes
    -----
    NOTE(review): the forest is fitted on every row it is given. This
    notebook calls it before the train/test split, so rows that later end up
    in the test set influence which features are kept.
    """
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    # Fit on the entire frame purely to obtain the importances.
    clf.fit(X, y)

    # Pair importances with the columns the model was actually fitted on.
    # sorted() is stable, so ties keep their original column order.
    ranked = sorted(
        zip(X.columns, clf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_features = [name for name, _ in ranked[:n_features]]

    # Explicit copy so that adding the label column writes to an independent
    # frame instead of a possible view of `data`.
    data_important = data[top_features].copy()
    data_important[' Label'] = data[' Label']
    return data_important

In [5]:
def data_preprocessing(csv_path=None):
    """Load the CIC DDoS CSV, drop identifier columns, clean it and select features.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the raw CSV. Defaults to ``Dataset/CIC_DDoS.csv`` relative
        to the working directory, built with ``os.path.join`` so the same
        notebook runs on Windows and POSIX systems.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``feature_selection`` after ``data_cleaning``
        has been applied.
    """
    if csv_path is None:
        csv_path = os.path.join("Dataset", "CIC_DDoS.csv")
    data = pd.read_csv(csv_path)

    # Columns removed before cleaning and modelling (names must match the CSV
    # header exactly, including leading spaces).
    cols = [' Source IP', ' Destination IP', 'Flow ID', 'SimillarHTTP',
            'Unnamed: 0', ' Source Port', ' Timestamp', ' Inbound']
    data = data.drop(columns=cols)

    data = data_cleaning(data)
    data_important = feature_selection(data)
    return data_important

In [6]:
# Run the load -> drop columns -> clean -> feature-selection pipeline; the
# result holds the selected feature columns followed by ' Label'.
data = data_preprocessing()

In [7]:
# The label is the last column; every other column is a selected feature.
# Slicing relative to the end keeps all of them, however many were selected.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# Stratified 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [8]:
# Re-attach the labels to the training features and save them as one CSV.
train_data = pd.concat([X_train, y_train], axis=1)
# Build the path portably and make sure the target folder exists first.
train_dir = os.path.join('Dataset', 'Train')
os.makedirs(train_dir, exist_ok=True)
train_data.to_csv(os.path.join(train_dir, 'train_data.csv'), index=False)

In [9]:
# Re-attach the labels to the held-out features and save them as one CSV.
test_data = pd.concat([X_test, y_test], axis=1)
# Build the path portably and make sure the target folder exists first.
test_dir = os.path.join('Dataset', 'Test')
os.makedirs(test_dir, exist_ok=True)
test_data.to_csv(os.path.join(test_dir, 'test_data.csv'), index=False)