In [None]:
import pandas as pd
import random

def load_and_sample_data(file_paths, benign_samples=4000, attack_samples=1004):
    """Load several CSV files, label their rows, and draw a fixed-size sample from each.

    Every file is read with ``pd.read_csv``; a ``label`` column is added whose
    value is the URL-decoded file name without its ``.csv`` suffix (so
    ``Benign%20Traffic.csv`` becomes ``Benign Traffic``).

    Args:
        file_paths: Iterable of paths or URLs accepted by ``pd.read_csv``.
        benign_samples: Maximum number of rows to keep from a file whose
            name contains ``'Benign'``.
        attack_samples: Maximum number of rows to keep from any other file.

    Returns:
        A single DataFrame with the sampled rows of all files concatenated
        and a fresh 0..n-1 index.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import unquote

    all_data = []

    for file_path in file_paths:
        print(f"Loading {file_path}...")
        df = pd.read_csv(file_path)

        # Decode percent-escapes (e.g. %20) so the label is human readable,
        # and strip only a trailing '.csv' rather than every occurrence.
        label = unquote(file_path.split('/')[-1])
        if label.endswith('.csv'):
            label = label[:-len('.csv')]
        df['label'] = label

        # Decide on the file name only: a parent directory such as
        # 'Benign&Bruteforce/' must not turn an attack file into a benign one.
        quota = benign_samples if 'Benign' in label else attack_samples

        # Cap at the number of available rows for both classes; sampling
        # without replacement raises ValueError when n > len(df).
        sampled_df = df.sample(n=min(quota, len(df)), random_state=42, replace=False)

        all_data.append(sampled_df)

    final_df = pd.concat(all_data, ignore_index=True)
    print("Data sampling completed successfully!")
    return final_df

# CSV files of the CIC-BCCC-NRC-IoMT-2024 dataset: one benign capture followed
# by one file per attack type. Each URL must appear only once: the sampler uses
# a fixed random_state, so a repeated URL would contribute the same rows twice.
file_paths = [
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Benign%20Traffic.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DDoS%20ICMP%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DDoS%20UDP%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DoS%20ICMP%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DoS%20UDP%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MITM%20ARP%20Spoofing.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MQTT%20DDoS%20Publish%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MQTT%20DoS%20Connect%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MQTT%20DoS%20Publish%20Flood.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MQTT%20Malformed.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Recon%20OS%20Scan.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Recon%20Ping%20Sweep.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Recon%20Port%20Scan.csv",
    "http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Recon%20Vulnerability%20Scan.csv"
]

# Download, label and sample every file, then persist the combined frame
# (relative path: the file lands in the notebook's working directory).
balanced_data = load_and_sample_data(file_paths)
balanced_data.to_csv("balanced_data.csv", index=False)
print("Balanced dataset saved as 'balanced_data.csv'")


Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/Benign%20Traffic.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DDoS%20ICMP%20Flood.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DDoS%20UDP%20Flood.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DoS%20ICMP%20Flood.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DoS%20UDP%20Flood.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/DoS%20UDP%20Flood.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularIoTAttacks-2024/Dataset/CIC-BCCC-NRC-IoMT-2024/MITM%20ARP%20Spoofing.csv...
Loading http://205.174.165.80/IOTDataset/CIC-BCCC-NRC-TabularI

In [None]:
import pandas as pd

# Reload the sampled dataset from the same relative path the sampling cell
# wrote it to, instead of a hard-coded absolute '/content/...' location that
# only exists when the working directory happens to be /content.
df = pd.read_csv("balanced_data.csv")

# Distinct values of the 'Attack Name' column read from the CSV; the bare
# last expression makes the notebook display the array.
unique_attacks = df['Attack Name'].unique()
unique_attacks


array(['Benign Traffic', 'DDoS ICMP Flood', 'DDoS UDP Flood',
       'DoS ICMP Flood', 'DoS UDP Flood', 'MITM ARP Spoofing',
       'MQTT DDoS Publish Flood', 'MQTT DoS Connect Flood',
       'MQTT DoS Publish Flood', 'MQTT Malformed', 'Recon OS Scan',
       'Recon Ping Sweep', 'Recon Port Scan', 'Recon Vulnerability Scan'],
      dtype=object)

# 3. CIC IoT-IDAD 2024 dataset
**Sample the flow-based CSV files and save the result as `3_DATA.csv`.**

In [None]:
import pandas as pd
import random

def load_and_sample_data(file_paths, benign_samples=4000, attack_samples=1004):
    """Read each CSV, tag its rows with the source file's name, and sample it.

    The ``label`` column holds the last path segment of the file, with URL
    percent-escapes decoded and a trailing ``.csv`` removed (for example
    ``DDoS-ICMP_Flood4.pcap_Flow.csv`` gives ``DDoS-ICMP_Flood4.pcap_Flow``).

    Args:
        file_paths: Iterable of paths or URLs that ``pd.read_csv`` can open.
        benign_samples: Upper bound on rows sampled from a file whose name
            contains ``'Benign'``.
        attack_samples: Upper bound on rows sampled from every other file.

    Returns:
        One DataFrame containing the sampled rows of all files, re-indexed
        from 0.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    # Imported locally so the cell does not depend on another cell's imports.
    from urllib.parse import unquote

    all_data = []

    for file_path in file_paths:
        print(f"Loading {file_path}...")
        df = pd.read_csv(file_path)

        # Use the decoded file name as label; drop only the final '.csv'.
        label = unquote(file_path.split('/')[-1])
        if label.endswith('.csv'):
            label = label[:-len('.csv')]
        df['label'] = label

        # Test the file name, not the whole path: these URLs contain a
        # 'Benign&Bruteforce' directory that would otherwise match too.
        quota = benign_samples if 'Benign' in label else attack_samples

        # Never request more rows than exist; sampling without replacement
        # would raise ValueError for short files.
        sampled_df = df.sample(n=min(quota, len(df)), random_state=42, replace=False)

        all_data.append(sampled_df)

    final_df = pd.concat(all_data, ignore_index=True)
    print("Data sampling completed successfully!")
    return final_df

# Flow-based CSV files of the CIC IoT-IDAD 2024 dataset: the benign capture
# first, then one entry per attack type.
file_paths = [
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/Benign&Bruteforce/benign/BenignTraffic.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS-HTTP%20Flood/DDoS-HTTP_Flood-.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS%20ACK%20Fragmentation/DDoS-ACK_Fragmentation11.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS%20ICMP%20Flood/DDoS-ICMP_Flood4.pcap_Flow.csv",
    # NOTE(review): the next four entries end in '/' and therefore name
    # directories, not CSV files. load_and_sample_data derives an empty label
    # from them (nothing follows the last '/'). TODO: replace each with the
    # URL of a concrete *.csv file inside that directory.
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DoS/DoS-HTTP_Flood/",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DoS/DoS-TCP_Flood/",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DoS/DoS-UDP_Flood/",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DoS/DoS%20SYN%20Flood/",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/Mirai/Mirai-greeth_flood6.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/VulnerabilityScan/VulnerabilityScan.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/spoofing/ARP%20Spoofing/MITM-ArpSpoofing.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/spoofing/DNS%20Spoofing/DNS_Spoofing.pcap_Flow.csv",
    "http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/sqlinjection/SqlInjection.pcap_Flow.csv",
]

# Sample every file and write the combined frame next to the notebook.
balanced_data = load_and_sample_data(file_paths)
balanced_data.to_csv("3_DATA.csv", index=False)
# Report the file name that was actually written above.
print("Balanced dataset saved as '3_DATA.csv'")


Loading http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/Benign&Bruteforce/benign/BenignTraffic.pcap_Flow.csv...
Loading http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS-HTTP%20Flood/DDoS-HTTP_Flood-.pcap_Flow.csv...
Loading http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS%20ACK%20Fragmentation/DDoS-ACK_Fragmentation11.pcap_Flow.csv...
Loading http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DDoS/DDoS%20ICMP%20Flood/DDoS-ICMP_Flood4.pcap_Flow.csv...
Loading http://205.174.165.80/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/Anomaly%20Detection%20-%20Flow%20Based%20features/FlowmeterResult/DoS/DoS-HTTP_Flood/