In [12]:
import os
import pandas as pd

# Define the folder containing the dataset
folder_path = "E:\\01-12"

# Define TLS-related ports and Protocol (TCP = 6)
tls_ports = [443, 993, 995, 8443]
protocol_value = 6  # TCP

# Define chunk size for large files
chunk_size = 100000  # 100k rows per chunk
large_file_threshold = 1.5 * (1024 ** 3)  # 1.5GB in bytes

# List to store TLS traffic DataFrames
tls_attacks_list = []

# Iterate through all files in the folder
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    
    # Check if it's a CSV file
    if file_name.endswith(".csv"):
        file_size = os.path.getsize(file_path)

        print(f"Processing: {file_name} | Size: {round(file_size / (1024 ** 3), 2)} GB")
        
        if file_size > large_file_threshold:
            # Process large files in chunks
            print(f" {file_name} is large. Processing in chunks...")
            for chunk in pd.read_csv(file_path, chunksize=chunk_size, low_memory=False):
                tls_chunk = chunk[(chunk[' Destination Port'].isin(tls_ports))]
                if not tls_chunk.empty:
                    tls_attacks_list.append(tls_chunk)
        else:
            # Process small files normally
            df = pd.read_csv(file_path, low_memory=False)
            tls_attacks = df[(df[' Destination Port'].isin(tls_ports))]
            if not tls_attacks.empty:
                tls_attacks_list.append(tls_attacks)

# Combine all extracted TLS attack data
if tls_attacks_list:
    combined_tls_attacks = pd.concat(tls_attacks_list, ignore_index=True)

    # Save the final dataset
    output_path = os.path.join(folder_path, "TLS_Combined_Attacks2.csv")
    combined_tls_attacks.to_csv(output_path, index=False)
    
    print(f"\n Final TLS attack dataset saved at: {output_path}")
    print("TLS Attack Class Distribution:\n", combined_tls_attacks[' Label'].value_counts())
else:
    print("\n No TLS-based attack traffic found in any file.")

Processing: DrDoS_DNS.csv | Size: 1.99 GB
 DrDoS_DNS.csv is large. Processing in chunks...
Processing: DrDoS_LDAP.csv | Size: 0.85 GB
Processing: DrDoS_MSSQL.csv | Size: 1.76 GB
 DrDoS_MSSQL.csv is large. Processing in chunks...
Processing: DrDoS_NetBIOS.csv | Size: 1.58 GB
 DrDoS_NetBIOS.csv is large. Processing in chunks...
Processing: DrDoS_NTP.csv | Size: 0.6 GB
Processing: DrDoS_SNMP.csv | Size: 2.02 GB
 DrDoS_SNMP.csv is large. Processing in chunks...
Processing: DrDoS_SSDP.csv | Size: 1.17 GB
Processing: DrDoS_UDP.csv | Size: 1.4 GB
Processing: Syn.csv | Size: 0.59 GB
Processing: TFTP.csv | Size: 8.66 GB
 TFTP.csv is large. Processing in chunks...
Processing: TLS_Combined_Attacks.csv | Size: 0.01 GB
Processing: UDPLag.csv | Size: 0.15 GB

 Final TLS attack dataset saved at: E:\01-12\TLS_Combined_Attacks2.csv
TLS Attack Class Distribution:
 BENIGN           30450
TFTP              1180
DrDoS_SNMP         340
DrDoS_DNS          327
DrDoS_MSSQL        279
DrDoS_NetBIOS      264
Syn

In [15]:
import os
import pandas as pd

# Define the new folder containing CICDDoS2019 dataset
folder_path = "E:\\03-11"

# Define TLS-related ports and Protocol (TCP = 6)
tls_ports = [443, 993, 995, 8443]

# Define chunk size for large files
chunk_size = 100000  # 100k rows per chunk
large_file_threshold = 1.5 * (1024 ** 3)  # 1.5GB in bytes

# List to store TLS traffic DataFrames
tls_attacks_list = []

# Iterate through all files in the folder
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    
    # Check if it's a CSV file
    if file_name.endswith(".csv"):
        file_size = os.path.getsize(file_path)

        print(f"Processing: {file_name} | Size: {round(file_size / (1024 ** 3), 2)} GB")
        
        if file_size > large_file_threshold:
            # Process large files in chunks
            print(f" {file_name} is large. Processing in chunks...")
            for chunk in pd.read_csv(file_path, chunksize=chunk_size, low_memory=False):
                tls_chunk = chunk[(chunk[' Destination Port'].isin(tls_ports))]
                if not tls_chunk.empty:
                    tls_attacks_list.append(tls_chunk)
        else:
            # Process small files normally
            df = pd.read_csv(file_path, low_memory=False)
            tls_attacks = df[(df[' Destination Port'].isin(tls_ports))]
            if not tls_attacks.empty:
                tls_attacks_list.append(tls_attacks)

# Combine all extracted TLS attack data
if tls_attacks_list:
    combined_tls_attacks = pd.concat(tls_attacks_list, ignore_index=True)

    # Merge with previous dataset (TLS_Combined_Attacks2.csv)
    prev_dataset_path = "E:\\TLS_Combined_Attacks2.csv"
    if os.path.exists(prev_dataset_path):
        prev_data = pd.read_csv(prev_dataset_path, low_memory=False)
        combined_tls_attacks = pd.concat([prev_data, combined_tls_attacks], ignore_index=True)
        print("Merged with previous dataset: TLS_Combined_Attacks2.csv")

    # Save the final merged dataset
    output_path = "E:\\TLS_Final_Merged_Attacks.csv"
    combined_tls_attacks.to_csv(output_path, index=False)
    
    print(f"\nFinal TLS attack dataset saved at: {output_path}")
    print("TLS Attack Class Distribution:\n", combined_tls_attacks[' Label'].value_counts())
else:
    print("\n No TLS-based attack traffic found in any file.")

Processing: LDAP.csv | Size: 0.81 GB
Processing: MSSQL.csv | Size: 2.22 GB
 MSSQL.csv is large. Processing in chunks...
Processing: NetBIOS.csv | Size: 1.32 GB
Processing: Portmap.csv | Size: 0.07 GB
Processing: Syn.csv | Size: 1.75 GB
 Syn.csv is large. Processing in chunks...
Processing: UDP.csv | Size: 1.67 GB
 UDP.csv is large. Processing in chunks...
Processing: UDPLag.csv | Size: 0.3 GB

Final TLS attack dataset saved at: E:\TLS_Final_Merged_Attacks.csv
TLS Attack Class Distribution:
 BENIGN     21500
MSSQL        355
Syn          335
NetBIOS      245
LDAP         119
UDP           59
Portmap       20
Name:  Label, dtype: int64
