In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv("./sampled_sdn_dataset.csv")

In [3]:
train_df.columns = train_df.columns.str.strip()
print(train_df.columns)


Index(['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Siz

In [4]:
#selected features
features = ['TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Mean', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Pkts/s', 'Flow IAT Max',
       'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd Header Len', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'PSH Flag Cnt',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
        'Subflow Fwd Byts', 'Subflow Bwd Byts',
                   'Init Fwd Win Byts', 'Init Bwd Win Byts']

In [5]:
train_df["Label"].value_counts()

Label
1    115238
0     84762
Name: count, dtype: int64

In [6]:
features_to_keep = ['TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Mean', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Pkts/s', 'Flow IAT Max',
       'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd Header Len', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'PSH Flag Cnt',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
        'Subflow Fwd Byts', 'Subflow Bwd Byts',
                   'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Label']

# Keep only the desired columns
df = train_df[features_to_keep]

In [7]:
print(train_df['Label'])
y = train_df["Label"]
X = train_df[features_to_keep]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

0         0
1         1
2         1
3         0
4         1
         ..
199995    1
199996    1
199997    1
199998    1
199999    0
Name: Label, Length: 200000, dtype: int64
Shape of X: (200000, 28)
Shape of y: (200000,)


In [8]:
df["Label"].value_counts()

Label
1    115238
0     84762
Name: count, dtype: int64

In [9]:
# Seperating into different dataframes

In [10]:
# List of class labels you want to keep
selected_classes = [1,0]

# Filter the original dataframe to include only the selected classes
filtered_df = df[df['Label'].isin(selected_classes)]
print(filtered_df.head())
# Export the filtered dataframe to a CSV file
filtered_df.to_csv('High_samples_RF.csv', index=False)

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Mean  \
0             97.0            231.0             97.0         24.250000   
1             20.0            964.0             20.0          6.666667   
2             20.0            964.0             20.0          6.666667   
3             37.0             53.0             37.0         37.000000   
4              0.0              0.0              0.0          0.000000   

   Bwd Pkt Len Max  Bwd Pkt Len Min  Bwd Pkt Len Mean  Bwd Pkt Len Std  \
0            231.0              0.0             57.75            115.5   
1            964.0              0.0            241.00            482.0   
2            964.0              0.0            241.00            482.0   
3             53.0             53.0             53.00              0.0   
4              0.0              0.0              0.00              0.0   

   Flow Pkts/s  Flow IAT Max  ...    Pkt Len Var  PSH Flag Cnt  Pkt Size Avg  \
0     1.473726      100000.0  

In [11]:
# List of class labels you want to keep
selected_classes = [0,1]

# Filter the original dataframe to include only the selected classes
filtered_df2 = df[df['Label'].isin(selected_classes)]

# Export the filtered dataframe to a CSV file
filtered_df2.to_csv('Mid_samples_RF.csv', index=False)

In [12]:
## Smote experiment on minority class in Low_samples file 

In [13]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Load the data from CSV
df = pd.read_csv('Mid_samples_RF.csv')

# Clean the dataset
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame)
    
    # Drop rows with missing values
    df.dropna(inplace=True)
    
    # Drop rows with infinity or negative infinity
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df = df[indices_to_keep]

    return df

cleaned_df = clean_dataset(df)

# Encode categorical labels
label_encoder = LabelEncoder()
cleaned_df['Label'] = label_encoder.fit_transform(cleaned_df['Label'])


In [14]:
# Separate features and labels
X = cleaned_df.drop(columns=['Label'])
y = cleaned_df['Label']
print(y)
# Apply SMOTE to balance classes with desired ratios
smote = SMOTE(sampling_strategy=0.9, random_state=42)


0         0
1         1
2         1
3         0
4         1
         ..
199995    1
199996    1
199997    1
199998    1
199999    0
Name: Label, Length: 200000, dtype: int64


In [15]:
print("Class distribution before SMOTE:")
print(y.value_counts())


Class distribution before SMOTE:
Label
1    115238
0     84762
Name: count, dtype: int64


In [16]:
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame for the resampled data
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['Label'] = y_resampled

# Add a 'data_type' column to indicate whether the data is original or augmented
cleaned_df['data_type'] = 'Original'
resampled_df['data_type'] = 'Augmented'

# Concatenate the original and augmented dataframes
final_df = pd.concat([cleaned_df, resampled_df], ignore_index=True)

# Decode the categorical labels back to original values
final_df['Label'] = label_encoder.inverse_transform(final_df['Label'])

In [17]:
# Cheking number of augmented and original data per class
augmented_heartbleed_count = final_df[(final_df['data_type'] == 'Augmented') & (final_df['Label'] == 0)].shape[0]
print("Number of Augmented instances with Label '0':", augmented_heartbleed_count)

augmented_heartbleed_count = final_df[(final_df['data_type'] == 'Augmented') & (final_df['Label'] == 1)].shape[0]
print("Number of Augmented instances with Label '1':", augmented_heartbleed_count)


Number of Augmented instances with Label '0': 103714
Number of Augmented instances with Label '1': 115238


In [18]:
#Now I'll drop the augmented data from majority class to match expected number of samples

In [19]:
final_df = final_df.drop(final_df[(final_df['Label'] == 1) & (final_df['data_type'] == 'Augmented')].index)


In [20]:
df["Label"].value_counts()

Label
1    115238
0     84762
Name: count, dtype: int64

In [21]:
final_df["Label"].value_counts()

Label
0    188476
1    115238
Name: count, dtype: int64

In [22]:
final_df.to_csv('Low_samples_Aug_RF.csv', index=False)

In [23]:
#Working with High Samples

In [24]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Load the data from CSV
df = pd.read_csv('High_samples_RF.csv')

# Clean the dataset
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame)
    
    # Drop rows with missing values
    df.dropna(inplace=True)
    
    # Drop rows with infinity or negative infinity
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df = df[indices_to_keep]

    return df

cleaned_df = clean_dataset(df)

# Encode categorical labels
label_encoder = LabelEncoder()
cleaned_df['Label'] = label_encoder.fit_transform(cleaned_df['Label'])

In [25]:
#Downsample Benign Samples

In [26]:
from sklearn.utils import resample

In [27]:
majority_class = 1
minority_classes = [0]


In [28]:
majority_samples = df[df['Label'] == majority_class]
minority_samples = df[df['Label'].isin(minority_classes)]

In [29]:
# Downsample majority class
minority_class_size = len(minority_samples)
downsampled_majority = resample(majority_samples,
                                replace=False,  # Without replacement
                                n_samples=minority_class_size,  # Desired number of samples
                                random_state=42)  # For reproducibility


In [30]:
# Combine minority and downsampled majority samples
balanced_dataset = pd.concat([downsampled_majority, minority_samples])

In [31]:
# Shuffle the dataset to ensure randomness
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42)

In [32]:
balanced_dataset["Label"].value_counts()

Label
1    84762
0    84762
Name: count, dtype: int64

In [33]:
#No need of augmenting the data for this class. I'll just export it in a csv file

In [34]:
balanced_dataset.to_csv('High_samples_Aug_RF.csv', index=False)

In [35]:
#Now I'll make one final copy of all augmented data. 

In [36]:
df1 = pd.read_csv("Low_samples_Aug_RF.csv")
#df2 = pd.read_csv("Mid_samples_Aug_RF.csv")  # in actual training, original distribution was used
df3 = pd.read_csv("High_samples_Aug_RF.csv")

In [37]:
df = pd.concat([df1, df3], ignore_index=True)

In [38]:
df.to_csv('Train_Aug_RF.csv', index=False)