In [2]:
import pandas as pd
import numpy as np

# Read the raw flow records (file name suggests the Friday-afternoon DDoS capture).
df = pd.read_csv(
    "../TrafficLabelling/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
)

# Report how many records were loaded.
print("Number of rows:", len(df.index))

Number of rows: 225745


In [None]:
# Normalise the header: strip surrounding whitespace from every column name,
# then lower-case it, so later cells can refer to columns by a predictable key.
df.columns = df.columns.str.strip().str.lower()

In [None]:
# Report the per-column null counts, then remove every row with a missing value.
# DataFrame.dropna() returns a NEW frame and leaves the original untouched, so
# the result has to be assigned back for the rows to actually be removed.
print(df.isnull().sum())  # null count per column, before cleaning
df = df.dropna()

# Report how many fully duplicated rows remain, then keep only the first
# occurrence of each. DataFrame.drop_duplicates() also returns a new frame,
# hence the assignment.
print(df.duplicated().sum())  # duplicate-row count among the remaining rows
df = df.drop_duplicates()

In [None]:
# Encode the target as an integer flag: 0 when the label text contains
# "BENIGN", 1 for anything else. The column is then renamed to 'is attack'.
# NOTE: this cell is not re-runnable on its own - after the rename there is no
# 'label' column left to read.
df["label"] = df["label"].map(lambda text: int("BENIGN" not in text))
df = df.rename(columns={"label": "is attack"})

In [None]:
# Derive four extra columns from the existing flow statistics:
#   - total packet length: forward + backward payload length
#   - packet length ratio: forward / backward length; the 0.1 added to the
#     denominator avoids division by zero when no backward bytes were seen
#   - packet rate: forward + backward packet count
#     NOTE(review): despite the name this is a packet *count*, not a rate -
#     it is never divided by the flow duration.
#   - flow duration (ms): flow duration divided by 1000
#     (presumably the source column is in microseconds - TODO confirm)
df = df.assign(
    **{
        "total packet length": lambda d: (
            d["total length of fwd packets"] + d["total length of bwd packets"]
        ),
        "packet length ratio": lambda d: (
            d["total length of fwd packets"]
            / (d["total length of bwd packets"] + 0.1)
        ),
        "packet rate": lambda d: d["total fwd packets"] + d["total backward packets"],
        "flow duration (ms)": lambda d: d["flow duration"] / 1000,
    }
)

In [None]:
# Correlation of every numeric feature with the target, sorted ascending.
# The frame is restricted to numeric/bool columns first: since pandas 2.0
# DataFrame.corr() no longer silently skips non-numeric columns and raises if
# any column (e.g. an ID, address or timestamp string, if present) cannot be
# converted to float.
# NOTE: despite its name, corr_matrix is a Series - one coefficient per column,
# indexed by column name. Columns whose correlation is undefined show up as NaN.
corr_matrix = (
    df.select_dtypes(include=["number", "bool"])
    .corr()["is attack"]
    .sort_values()
)

In [None]:
# Minimum absolute correlation with the target a feature needs to be kept.
corr_threshold = 0.2
# Names of the columns whose correlation with the target came out as NaN.
nan_columns = corr_matrix.index[corr_matrix.isna()].tolist()

In [None]:
# Columns whose absolute correlation with the target is below corr_threshold.
low_corr_cols = corr_matrix.index[corr_matrix.abs() < corr_threshold].tolist()
# A NaN correlation never satisfies the "<" comparison above, so the NaN
# columns collected earlier are added to the drop list explicitly.
low_corr_cols += nan_columns
# Remove all weakly correlated / undefined-correlation columns from the frame.
df = df.drop(columns=low_corr_cols)

In [None]:
# Persist the reduced frame to disk; the row index is not written to the file.
df.to_csv(path_or_buf="preprocessed_dataset.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many rows fall into each value of the target column.
ax = sns.countplot(data=df, x="is attack")
ax.set_title("Distribution of the target variable")
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numerical columns.
# Only numeric/bool columns are passed to corr(): since pandas 2.0
# DataFrame.corr() raises instead of silently skipping columns that cannot be
# converted to float, and any non-numeric column still present in df would
# abort the plot.
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr(),
    annot=False,
    cmap="coolwarm",
)
plt.title("Correlation matrix of the numerical columns")
plt.show()

In [None]:
# One box plot per column on a single wide figure; the x tick labels are
# rotated by 90 degrees so the column names stay readable.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=df, ax=ax)
ax.set_title("Box plots of the numerical columns")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Re-read the preprocessed data from disk so this cell does not depend on the
# in-memory frame built by the earlier cells.
df = pd.read_csv('preprocessed_dataset.csv')

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both partitions to CSV without the row index.
for frame, path in ((train_df, 'training.csv'), (test_df, 'test.csv')):
    frame.to_csv(path, index=False)