In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_parquet_files(folder_path):
    parquet_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.parquet')]
    df_list = [pd.read_parquet(file) for file in parquet_files]
    concatenated_df = pd.concat(df_list, ignore_index=True)
    return concatenated_df

In [3]:
df = load_parquet_files("C:/Users/shubham/Desktop/IP-Spoofing/Datasets")

In [4]:
df.shape

(431371, 78)

In [5]:
len(df.columns)

78

In [9]:
df.to_csv("data.csv", index=False)

In [6]:
df.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Fla

In [7]:
df["Label"].value_counts()

Label
DrDoS_NTP        121368
TFTP              98917
Benign            97831
Syn               49373
UDP               18090
DrDoS_UDP         10420
UDP-lag            8872
MSSQL              8523
DrDoS_MSSQL        6212
DrDoS_DNS          3669
DrDoS_SNMP         2717
LDAP               1906
DrDoS_LDAP         1440
Portmap             685
NetBIOS             644
DrDoS_NetBIOS       598
UDPLag               55
WebDDoS              51
Name: count, dtype: int64

In [2]:
from scapy.all import sniff, IP, TCP, UDP
import pandas as pd
import numpy as np
from collections import defaultdict
import time

# Dictionary to store flow information
flows = defaultdict(lambda: {
    'packets': [],
    'timestamps': [],
    'fwd_packets': 0,
    'bwd_packets': 0,
    'fwd_bytes': 0,
    'bwd_bytes': 0,
    'fwd_packet_lengths': [],
    'bwd_packet_lengths': [],
    'fwd_iat': [],
    'bwd_iat': [],
    'start_time': None,
    'end_time': None,
    'protocol': None,
    'src_ip': None,
    'dst_ip': None,
    'src_port': None,
    'dst_port': None,
})

# Function to process each packet


def process_packet(packet):
    if IP in packet:
        src_ip = packet[IP].src
        dst_ip = packet[IP].dst
        protocol = packet[IP].proto
        src_port = None
        dst_port = None

        if TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport

        # Define flow key
        flow_key = (src_ip, dst_ip, src_port, dst_port, protocol)

        # Update flow information
        flows[flow_key]['packets'].append(packet)
        flows[flow_key]['timestamps'].append(packet.time)
        flows[flow_key]['protocol'] = protocol
        flows[flow_key]['src_ip'] = src_ip
        flows[flow_key]['dst_ip'] = dst_ip
        flows[flow_key]['src_port'] = src_port
        flows[flow_key]['dst_port'] = dst_port

        if flows[flow_key]['start_time'] is None:
            flows[flow_key]['start_time'] = packet.time
        flows[flow_key]['end_time'] = packet.time

        packet_length = len(packet)
        if src_ip == flow_key[0]:  # Forward direction
            flows[flow_key]['fwd_packets'] += 1
            flows[flow_key]['fwd_bytes'] += packet_length
            flows[flow_key]['fwd_packet_lengths'].append(packet_length)
            if len(flows[flow_key]['fwd_packet_lengths']) > 1:
                flows[flow_key]['fwd_iat'].append(
                    packet.time - flows[flow_key]['timestamps'][-2])
        else:  # Backward direction
            flows[flow_key]['bwd_packets'] += 1
            flows[flow_key]['bwd_bytes'] += packet_length
            flows[flow_key]['bwd_packet_lengths'].append(packet_length)
            if len(flows[flow_key]['bwd_packet_lengths']) > 1:
                flows[flow_key]['bwd_iat'].append(
                    packet.time - flows[flow_key]['timestamps'][-2])


# Capture packets for 10 seconds
print("Capturing packets for 10 seconds...")
sniff(prn=process_packet, timeout=5)

# Calculate flow statistics
flow_data = []
for flow_key, flow_info in flows.items():
    if len(flow_info['packets']) == 0:
        continue

    # Basic flow information
    flow_duration = flow_info['end_time'] - flow_info['start_time']
    total_fwd_packets = flow_info['fwd_packets']
    total_bwd_packets = flow_info['bwd_packets']
    total_fwd_bytes = flow_info['fwd_bytes']
    total_bwd_bytes = flow_info['bwd_bytes']

    # Packet length statistics
    fwd_packet_lengths = flow_info['fwd_packet_lengths']
    bwd_packet_lengths = flow_info['bwd_packet_lengths']
    fwd_packet_length_max = max(
        fwd_packet_lengths) if fwd_packet_lengths else 0
    fwd_packet_length_min = min(
        fwd_packet_lengths) if fwd_packet_lengths else 0
    fwd_packet_length_mean = np.mean(
        fwd_packet_lengths) if fwd_packet_lengths else 0
    fwd_packet_length_std = np.std(
        fwd_packet_lengths) if fwd_packet_lengths else 0

    bwd_packet_length_max = max(
        bwd_packet_lengths) if bwd_packet_lengths else 0
    bwd_packet_length_min = min(
        bwd_packet_lengths) if bwd_packet_lengths else 0
    bwd_packet_length_mean = np.mean(
        bwd_packet_lengths) if bwd_packet_lengths else 0
    bwd_packet_length_std = np.std(
        bwd_packet_lengths) if bwd_packet_lengths else 0

    # Inter-arrival time statistics
    fwd_iat = flow_info['fwd_iat']
    bwd_iat = flow_info['bwd_iat']
    fwd_iat_mean = np.mean(fwd_iat) if fwd_iat else 0
    fwd_iat_std = np.std(fwd_iat) if fwd_iat else 0
    fwd_iat_max = max(fwd_iat) if fwd_iat else 0
    fwd_iat_min = min(fwd_iat) if fwd_iat else 0

    bwd_iat_mean = np.mean(bwd_iat) if bwd_iat else 0
    bwd_iat_std = np.std(bwd_iat) if bwd_iat else 0
    bwd_iat_max = max(bwd_iat) if bwd_iat else 0
    bwd_iat_min = min(bwd_iat) if bwd_iat else 0

    # Flow bytes/s and packets/s
    flow_bytes_per_sec = (total_fwd_bytes + total_bwd_bytes) / \
        flow_duration if flow_duration > 0 else 0
    flow_packets_per_sec = (total_fwd_packets + total_bwd_packets) / \
        flow_duration if flow_duration > 0 else 0

    # Store flow statistics
    flow_data.append({
        'src_ip': flow_info['src_ip'],
        'dst_ip': flow_info['dst_ip'],
        'protocol': flow_info['protocol'],
        'flow_duration': flow_duration,
        'total_fwd_packets': total_fwd_packets,
        'total_bwd_packets': total_bwd_packets,
        'fwd_packets_length_total': total_fwd_bytes,
        'bwd_packets_length_total': total_bwd_bytes,
        'fwd_packet_length_max': fwd_packet_length_max,
        'fwd_packet_length_min': fwd_packet_length_min,
        'fwd_packet_length_mean': fwd_packet_length_mean,
        'fwd_packet_length_std': fwd_packet_length_std,
        'bwd_packet_length_max': bwd_packet_length_max,
        'bwd_packet_length_min': bwd_packet_length_min,
        'bwd_packet_length_mean': bwd_packet_length_mean,
        'bwd_packet_length_std': bwd_packet_length_std,
        'flow_bytes_per_sec': flow_bytes_per_sec,
        'flow_packets_per_sec': flow_packets_per_sec,
        'fwd_iat_mean': fwd_iat_mean,
        'fwd_iat_std': fwd_iat_std,
        'fwd_iat_max': fwd_iat_max,
        'fwd_iat_min': fwd_iat_min,
        'bwd_iat_mean': bwd_iat_mean,
        'bwd_iat_std': bwd_iat_std,
        'bwd_iat_max': bwd_iat_max,
        'bwd_iat_min': bwd_iat_min,
    })

# Convert to DataFrame
flow_df = pd.DataFrame(flow_data)
flow_df.head()
flow_df.to_csv("flow_data.csv", index=False)

Capturing packets for 10 seconds...


In [3]:
df = pd.read_csv("flow_data.csv")
df.head()

Unnamed: 0,src_ip,dst_ip,protocol,flow_duration,total_fwd_packets,total_bwd_packets,fwd_packets_length_total,bwd_packets_length_total,fwd_packet_length_max,fwd_packet_length_min,...,flow_bytes_per_sec,flow_packets_per_sec,fwd_iat_mean,fwd_iat_std,fwd_iat_max,fwd_iat_min,bwd_iat_mean,bwd_iat_std,bwd_iat_max,bwd_iat_min
0,20.42.65.84,192.168.101.18,6,1.193877,20,0,1704,0,521,54,...,1427.282447,16.752141,0.062836,0.187374,0.738441,0.0,0,0,0,0
1,192.168.101.18,20.42.65.84,6,0.456262,6,0,13561,0,7523,54,...,29721.950724,13.150336,0.091252,0.154376,0.39704,5.3e-05,0,0,0,0
2,20.42.65.84,192.168.101.18,6,0.443504,7,0,946,0,519,54,...,2133.014146,15.783403,0.073917,0.165284,0.443504,0.0,0,0,0,0
3,192.168.101.18,20.42.65.84,6,0.000398,3,0,186,0,78,54,...,467429.924506,7539.192331,0.000199,0.000193,0.000392,6e-06,0,0,0,0
4,192.168.101.18,52.168.117.168,6,1.173154,5,0,19306,0,16460,54,...,16456.49422,4.262015,0.293288,0.355299,0.870261,5e-05,0,0,0,0


In [6]:
df.columns

Index(['src_ip', 'dst_ip', 'protocol', 'flow_duration', 'total_fwd_packets',
       'total_bwd_packets', 'fwd_packets_length_total',
       'bwd_packets_length_total', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes_per_sec', 'flow_packets_per_sec',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min'],
      dtype='object')