In [1]:
import pandas as pd
import os

In [2]:
# Define the folder path where your CSV files are stored
folder_path = "/kaggle/input/svagdataset/Test/Test"  # Replace with your actual folder path

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Initialize an empty list to store DataFrames
dataframes = []

# Iterate through each CSV file
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    # Add a 'Label' column and set its value to the file name (without the extension)
    df['Label'] = os.path.splitext(csv_file)[0]
    dataframes.append(df)

# Concatenate all DataFrames into one
test = pd.concat(dataframes, ignore_index=True)

In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614182 entries, 0 to 1614181
Data columns (total 46 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Header_Length    1614182 non-null  float64
 1   Protocol Type    1614182 non-null  float64
 2   Duration         1614182 non-null  float64
 3   Rate             1614182 non-null  float64
 4   Srate            1614182 non-null  float64
 5   Drate            1614182 non-null  float64
 6   fin_flag_number  1614182 non-null  float64
 7   syn_flag_number  1614182 non-null  float64
 8   rst_flag_number  1614182 non-null  float64
 9   psh_flag_number  1614182 non-null  float64
 10  ack_flag_number  1614182 non-null  float64
 11  ece_flag_number  1614182 non-null  float64
 12  cwr_flag_number  1614182 non-null  float64
 13  ack_count        1614182 non-null  float64
 14  syn_count        1614182 non-null  float64
 15  fin_count        1614182 non-null  float64
 16  rst_count        1

In [4]:
print(test['Label'].value_counts())

Label
TCP_IP-DDoS-UDP1_test.pcap           205654
TCP_IP-DDoS-ICMP2_test.pcap          195692
TCP_IP-DDoS-TCP_test.pcap            182598
TCP_IP-DDoS-SYN_test.pcap            172397
TCP_IP-DDoS-UDP2_test.pcap           156416
TCP_IP-DDoS-ICMP1_test.pcap          154007
TCP_IP-DoS-UDP_test.pcap             137553
TCP_IP-DoS-SYN_test.pcap              98595
TCP_IP-DoS-ICMP_test.pcap             98432
TCP_IP-DoS-TCP_test.pcap              82096
MQTT-DDoS-Connect_Flood_test.pcap     41916
Benign_test.pcap                      37607
Recon-Port_Scan_test.pcap             22622
MQTT-DoS-Publish_Flood_test.pcap       8505
MQTT-DDoS-Publish_Flood_test.pcap      8416
Recon-OS_Scan_test.pcap                3834
MQTT-DoS-Connect_Flood_test.pcap       3131
MQTT-Malformed_Data_test.pcap          1747
ARP_Spoofing_test.pcap                 1744
Recon-VulScan_test.pcap                1034
Recon-Ping_Sweep_test.pcap              186
Name: count, dtype: int64


In [5]:
test['Label'] = test['Label'].str.replace('_test.pcap', '', regex=False)
test['Label'] = test['Label'].str.replace(r'\d+', '', regex=True)

In [6]:
# Define the mapping dictionary
label_mapping = {
    'MQTT-Malformed_Data': 'Malformed_Data',
    'TCP_IP-DDoS-ICMP': 'DDoS-ICMP',
    'TCP_IP-DoS-ICMP': 'DoS-ICMP',
    'TCP_IP-DoS-TCP': 'DoS-TCP',
    'MQTT-DDoS-Connect_Flood': 'DDoS-Connect_Flood',
    'MQTT-DoS-Connect_Flood': 'DoS-Connect_Flood',
    'Benign': 'Benign',
    'TCP_IP-DoS-UDP': 'DoS-UDP',
    'Recon-Ping_Sweep': 'Ping_Sweep',
    'MQTT-DDoS-Publish_Flood': 'DDoS-Publish_Flood',
    'TCP_IP-DDoS-SYN': 'DDoS-SYN',
    'TCP_IP-DDoS-TCP': 'DDoS-TCP',
    'TCP_IP-DoS-SYN': 'DoS-SYN',
    'TCP_IP-DDoS-UDP': 'DDoS-UDP',
    'Recon-Port_Scan': 'Port_Scan',
    'Recon-OS_Scan': 'OS_Scan',
    'MQTT-DoS-Publish_Flood': 'DoS-Publish_Flood',
    'Recon-VulScan': 'VulScan',
    'ARP_Spoofing': 'ARP_Spoofing'
}

# Apply the mapping to the 'Label' column
test['Label'] = test['Label'].replace(label_mapping)

In [7]:
# Verify the changes
print(test['Label'].value_counts())

Label
DDoS-UDP              362070
DDoS-ICMP             349699
DDoS-TCP              182598
DDoS-SYN              172397
DoS-UDP               137553
DoS-SYN                98595
DoS-ICMP               98432
DoS-TCP                82096
DDoS-Connect_Flood     41916
Benign                 37607
Port_Scan              22622
DoS-Publish_Flood       8505
DDoS-Publish_Flood      8416
OS_Scan                 3834
DoS-Connect_Flood       3131
Malformed_Data          1747
ARP_Spoofing            1744
VulScan                 1034
Ping_Sweep               186
Name: count, dtype: int64


In [8]:
test.isnull().sum()

Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
Label              0
dtype: int64

In [9]:
test.duplicated().sum()

2065

In [10]:
test =  test.drop_duplicates(keep="first")

In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1612117 entries, 0 to 1614181
Data columns (total 46 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Header_Length    1612117 non-null  float64
 1   Protocol Type    1612117 non-null  float64
 2   Duration         1612117 non-null  float64
 3   Rate             1612117 non-null  float64
 4   Srate            1612117 non-null  float64
 5   Drate            1612117 non-null  float64
 6   fin_flag_number  1612117 non-null  float64
 7   syn_flag_number  1612117 non-null  float64
 8   rst_flag_number  1612117 non-null  float64
 9   psh_flag_number  1612117 non-null  float64
 10  ack_flag_number  1612117 non-null  float64
 11  ece_flag_number  1612117 non-null  float64
 12  cwr_flag_number  1612117 non-null  float64
 13  ack_count        1612117 non-null  float64
 14  syn_count        1612117 non-null  float64
 15  fin_count        1612117 non-null  float64
 16  rst_count        161211

In [12]:
# Optionally, save the merged DataFrame to a new CSV file
test.to_csv('Test.csv', index=False)