In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

In [43]:
dataframe = pd.read_csv('CICIDS_DoS_2.csv')
dataframe.columns = dataframe.columns.str.strip()

In [44]:
selectedColumns = [
    'Dst Port', 'Total Length of Fwd Packet', 'Fwd Packet Length Max',
    'Fwd Packet Length Mean', 'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Bwd IAT Total',
    'Bwd IAT Max', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
    'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'PSH Flag Count', 'URG Flag Count',
    'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg',
    'Subflow Fwd Bytes', 'Label'
]

In [45]:
benignDataframe = dataframe[dataframe['Label'] == 'BENIGN']
benignDataframe = benignDataframe.iloc[:100000]

In [46]:
# check for duplicates in the subset of selected features
duplicates_mask = benignDataframe.duplicated(keep='first')
num_duplicates = duplicates_mask.sum()
print(f"Number of duplicate rows based on selected features: {num_duplicates}")

# remove duplicates directly
benignDataframe = benignDataframe.drop_duplicates(keep='first')
print(f"Shape of dataset after removing duplicates: {benignDataframe.shape}")

# select relevant columns 
benignDataframe = benignDataframe[selectedColumns]
benignDataframe

Number of duplicate rows based on selected features: 0
Shape of dataset after removing duplicates: (100000, 91)


Unnamed: 0,Dst Port,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Bwd IAT Total,Bwd IAT Max,...,Packet Length Std,Packet Length Variance,SYN Flag Count,PSH Flag Count,URG Flag Count,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,Label
0,443,0,0,0.000000,0,0,0.000000,0.000000,0,0,...,0.000000,0.000000,1,0,0,0.000,0.000000,0.000000,0,BENIGN
1,443,0,0,0.000000,0,0,0.000000,0.000000,0,0,...,0.000000,0.000000,1,0,0,0.000,0.000000,0.000000,0,BENIGN
2,443,0,0,0.000000,0,0,0.000000,0.000000,0,0,...,0.000000,0.000000,1,0,0,0.000,0.000000,0.000000,0,BENIGN
3,443,0,0,0.000000,0,0,0.000000,0.000000,0,0,...,0.000000,0.000000,1,0,0,0.000,0.000000,0.000000,0,BENIGN
4,443,0,0,0.000000,0,0,0.000000,0.000000,0,0,...,0.000000,0.000000,1,0,0,0.000,0.000000,0.000000,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263533,53,84,42,42.000000,95,95,95.000000,0.000000,3,3,...,30.599564,936.333333,0,0,0,68.500,42.000000,95.000000,21,BENIGN
263534,443,11583,1654,609.631579,1034,0,393.904762,514.532205,119994789,10272554,...,676.486765,457634.342949,0,15,0,496.375,609.631579,393.904762,289,BENIGN
263535,123,48,48,48.000000,48,48,48.000000,0.000000,0,0,...,0.000000,0.000000,0,0,0,48.000,48.000000,48.000000,24,BENIGN
263536,53,68,34,34.000000,50,50,50.000000,0.000000,3,3,...,9.237604,85.333333,0,0,0,42.000,34.000000,50.000000,17,BENIGN


In [47]:
benignDataframe.shape

(100000, 23)

In [None]:
import random

# Create a new DataFrame with 100k rows, all values set to 0
new_rows = pd.DataFrame(0, index=range(100000), columns=benignDataframe.columns)

new_rows['Dst Port'] = random.randrange(0, 65535)
new_rows['SYN Flag Count'] = 1
new_rows['Bwd IAT Total'] = np.random.uniform(0.05, 2.5, 100000)
new_rows['Bwd IAT Max'] = np.random.uniform(0.05, 2.5, 100000)
new_rows['Label'] = 'DoS'

df = pd.concat([benignDataframe, new_rows], ignore_index=True)
df.tail(10) 

Unnamed: 0,Dst Port,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Bwd IAT Total,Bwd IAT Max,...,Packet Length Std,Packet Length Variance,SYN Flag Count,PSH Flag Count,URG Flag Count,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,Label
199990,99991,0,0,0.0,0,0,0.0,0.0,1.065703,1.797706,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199991,99992,0,0,0.0,0,0,0.0,0.0,0.079261,1.937256,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199992,99993,0,0,0.0,0,0,0.0,0.0,0.809124,1.992616,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199993,99994,0,0,0.0,0,0,0.0,0.0,1.370376,1.575269,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199994,99995,0,0,0.0,0,0,0.0,0.0,0.424415,0.381265,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199995,99996,0,0,0.0,0,0,0.0,0.0,0.682916,2.213969,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199996,99997,0,0,0.0,0,0,0.0,0.0,2.443729,1.712988,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199997,99998,0,0,0.0,0,0,0.0,0.0,0.061574,0.44841,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199998,99999,0,0,0.0,0,0,0.0,0.0,1.68334,1.045264,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS
199999,100000,0,0,0.0,0,0,0.0,0.0,2.330913,1.51837,...,0.0,0.0,1,0,0,0.0,0.0,0.0,0,DoS


In [54]:
df.shape

(200000, 23)

In [55]:
shuffled_df = df.sample(frac=1).reset_index(drop=True)

In [56]:
# Save the DataFrame to a CSV file
shuffled_df.to_csv('only_zeros.csv', index=False)
print("DataFrame saved to 'output.csv'")

DataFrame saved to 'output.csv'
