In [14]:
import pandas as pd
import os

In [4]:
# Folder containing the CSV files to combine (machine-specific path; adjust as needed)
folder_path = 'C:/Users/GCU/Downloads/CICIoT2023'

# List to hold all individual DataFrames
dataframes = []

# Iterate over the files in sorted order: os.listdir() returns entries in
# arbitrary order, which would make the row order of the combined frame
# (and therefore which duplicate counts as "first" later on) vary between runs.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        # Parse the file exactly once.
        df_32 = pd.read_csv(file_path)
        # Pick the float64 columns from the fully parsed frame rather than from
        # a one-row probe: a column whose first value looks like an integer but
        # holds decimals further down would otherwise be missed, and a column
        # that only looks numeric in its first row would make astype() fail.
        float64_cols = df_32.select_dtypes(include=['float64']).columns
        # Downcast to float32 to halve the memory used by those columns;
        # rebinding df_32 releases the float64 copy straight away.
        df_32 = df_32.astype({col: 'float32' for col in float64_cols})
        dataframes.append(df_32)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the folder holds no CSV files.
if not dataframes:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Concatenate all the DataFrames into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(dataframes, ignore_index=True)

# Now df contains all the data from the CSV files with float64 columns converted to float32

In [5]:
# Turn text (object) columns into the 'category' dtype when they repeat a lot.
row_count = df.shape[0]
for name in df.select_dtypes(include=['object']).columns:
    distinct_values = df[name].nunique()
    # Threshold for conversion: fewer distinct values than half the row count.
    if distinct_values < row_count * 0.5:
        df[name] = df[name].astype('category')

In [6]:
# Print a summary of df (index range, column names and dtypes) as it stands
# after loading and the dtype conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 47 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   Drate            float32 
 7   fin_flag_number  float32 
 8   syn_flag_number  float32 
 9   rst_flag_number  float32 
 10  psh_flag_number  float32 
 11  ack_flag_number  float32 
 12  ece_flag_number  float32 
 13  cwr_flag_number  float32 
 14  ack_count        float32 
 15  syn_count        float32 
 16  fin_count        float32 
 17  urg_count        float32 
 18  rst_count        float32 
 19  HTTP             float32 
 20  HTTPS            float32 
 21  DNS              float32 
 22  Telnet           float32 
 23  SMTP             float32 
 24  SSH              float32 
 25  IRC              float32 
 26  TCP         

In [7]:
# Print the number of rows per value of the 'label' column, most frequent first.
print(df['label'].value_counts())

label
DDoS-ICMP_Flood            7200504
DDoS-UDP_Flood             5412287
DDoS-TCP_Flood             4497667
DDoS-PSHACK_Flood          4094755
DDoS-SYN_Flood             4059190
DDoS-RSTFINFlood           4045285
DDoS-SynonymousIP_Flood    3598138
DoS-UDP_Flood              3318595
DoS-TCP_Flood              2671445
DoS-SYN_Flood              2028834
BenignTraffic              1098195
Mirai-greeth_flood          991866
Mirai-udpplain              890576
Mirai-greip_flood           751682
DDoS-ICMP_Fragmentation     452489
MITM-ArpSpoofing            307593
DDoS-UDP_Fragmentation      286925
DDoS-ACK_Fragmentation      285104
DNS_Spoofing                178911
Recon-HostDiscovery         134378
Recon-OSScan                 98259
Recon-PortScan               82284
DoS-HTTP_Flood               71864
VulnerabilityScan            37382
DDoS-HTTP_Flood              28790
DDoS-SlowLoris               23426
DictionaryBruteForce         13064
BrowserHijacking              5859
CommandInjecti

In [8]:
# Per-column share of cells that are exactly zero, in percent
is_zero = (df == 0)
percentage_zeros = is_zero.mean() * 100

# Columns in which more than 99% of the values are zero
mostly_zero = percentage_zeros > 99
columns_to_drop = percentage_zeros.index[mostly_zero]

# Remove those columns from df itself (in place)
df.drop(columns=columns_to_drop, inplace=True)

# From here on df no longer has the columns that were more than 99% zeros

In [9]:
# Count the missing (null/NaN) values in each remaining column.
df.isnull().sum()

flow_duration      0
Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ack_count          0
syn_count          0
fin_count          0
urg_count          0
rst_count          0
HTTP               0
HTTPS              0
TCP                0
UDP                0
ICMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
label              0
dtype: int64

In [10]:
# Keep the first occurrence of every fully identical row and discard the rest.
# The surviving rows keep their original index labels (the index is not reset).
# NOTE(review): float columns were downcast to float32 when the CSVs were loaded,
# so rows that differed only beyond float32 precision compare equal here and are
# dropped — confirm this is intended.
repeated_row = df.duplicated(keep="first")
df = df[~repeated_row]

In [11]:
# Print the summary of df again to see its size and columns at this point
# in the notebook (after column removal and de-duplication when run in order).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28098546 entries, 0 to 46686577
Data columns (total 37 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   fin_flag_number  float32 
 7   syn_flag_number  float32 
 8   rst_flag_number  float32 
 9   psh_flag_number  float32 
 10  ack_flag_number  float32 
 11  ack_count        float32 
 12  syn_count        float32 
 13  fin_count        float32 
 14  urg_count        float32 
 15  rst_count        float32 
 16  HTTP             float32 
 17  HTTPS            float32 
 18  TCP              float32 
 19  UDP              float32 
 20  ICMP             float32 
 21  IPv              float32 
 22  LLC              float32 
 23  Tot sum          float32 
 24  Min              float32 
 25  Max              float32 
 26  AVG              

In [12]:
# Print the rows-per-label counts again so they can be compared with the
# counts printed before duplicates were removed.
print(df['label'].value_counts())

label
DDoS-UDP_Flood             5412287
DDoS-SynonymousIP_Flood    3065966
DoS-UDP_Flood              2959733
DDoS-SYN_Flood             1933447
DDoS-ICMP_Flood            1809173
DoS-TCP_Flood              1778908
DDoS-PSHACK_Flood          1647084
DoS-SYN_Flood              1629596
DDoS-TCP_Flood             1569605
BenignTraffic              1098177
DDoS-RSTFINFlood           1071959
Mirai-udpplain              890576
Mirai-greeth_flood          673232
Mirai-greip_flood           550402
DDoS-ICMP_Fragmentation     443979
MITM-ArpSpoofing            307591
DDoS-UDP_Fragmentation      286925
DDoS-ACK_Fragmentation      274933
DNS_Spoofing                178873
Recon-HostDiscovery         134345
Recon-OSScan                 98112
Recon-PortScan               82124
DoS-HTTP_Flood               71786
VulnerabilityScan            37382
DDoS-HTTP_Flood              28772
DDoS-SlowLoris               23426
DictionaryBruteForce         13064
BrowserHijacking              5859
CommandInjecti

In [13]:
# Write the current df to a single CSV file; the row index is not written out.
output_csv_path = 'C:/Users/GCU/Downloads/ciciot-2023.csv'
df.to_csv(output_csv_path, index=False)