In [6]:
import pandas as pd
import numpy as np

# Placeholder flows -- swap these three records for your real network flow data.
# Columns are listed first, then one list of per-flow values for each column.
data = dict(zip(
    ('Flow Start Time', 'Flow End Time', 'Packet Lengths'),
    (
        ['2023-10-01 08:00:00', '2023-10-01 08:00:01', '2023-10-01 08:00:02'],
        ['2023-10-01 08:00:03', '2023-10-01 08:00:04', '2023-10-01 08:00:05'],
        [[500, 600, 400], [300, 200, 100], [700, 800, 900]],
        # Add more data columns as needed (and a matching name above)
    ),
))

# Tabulate the records: one row per flow, one column per key of `data`
df = pd.DataFrame(data=data)

# Function to calculate features for a single flow


def calculate_features(flow_data):
    """Derive the feature dictionary for a single flow record.

    Args:
        flow_data: Mapping-like record (for example a DataFrame row) that can
            be indexed with 'Flow Start Time' and 'Flow End Time'. Both values
            are parsed with ``pd.to_datetime``.

    Returns:
        dict: ``{'Flow Duration': <seconds between start and end as float>}``.
    """
    # Parse the two timestamps in start, end order.
    started, ended = (
        pd.to_datetime(flow_data[column])
        for column in ('Flow Start Time', 'Flow End Time')
    )

    # Further features (Active Std, Tot Fwd Pkts, ...) would be added to this
    # dictionary once their calculations are defined.
    return {'Flow Duration': (ended - started).total_seconds()}


# Run the feature calculation over every row of df, keeping row order
all_features = []
all_features.extend(calculate_features(row) for _, row in df.iterrows())

# Tabulate the per-flow dictionaries: one row per flow, one column per feature
feature_df = pd.DataFrame(all_features)

# Show the resulting feature table
print(feature_df)


   Flow Duration
0            3.0
1            3.0
2            3.0


In [18]:
import pyshark
import pandas as pd
import numpy as np

# Path of the .pcap capture to analyse. It is a relative path, so it is looked
# up relative to the working directory of the running kernel/process.
pcap_file = 'test.pcap'

# Function to calculate features for a flow
def calculate_features(flow_packets):
    """Compute summary features for one flow of pyshark packets.

    The forward direction is defined by the source IP of the first packet;
    every IP/TCP packet with a different source counts as backward.

    Args:
        flow_packets: Non-empty, time-ordered sequence of packets. Each packet
            must expose ``sniff_timestamp``, support ``'LAYER' in packet`` and
            provide ``ip``/``tcp`` (and optionally ``data``) layers.
            Assumes ``packet.tcp.flags`` is the hexadecimal flags string
            (e.g. '0x0018') -- TODO confirm against your pyshark/tshark version.

    Returns:
        dict: Feature name -> value, with these keys (in this order):
        'Flow Duration', 'Active Std', 'Tot Fwd Pkts', 'Pkt Len Max',
        'CWE Flag Count', 'Pkt Len Min', 'Fwd Pkts/s', 'ECE Flag Cnt',
        'Fwd Seg Size Min', 'Tot Bwd Pkts', 'Pkt Len Std',
        'Fwd Pkt Len Mean', 'Flow IAT Max', 'Fwd IAT Tot'.
        Features that were never observed are reported as 0 rather than inf.

    Raises:
        IndexError: If ``flow_packets`` is empty.
    """
    # TCP header flag bits (RFC 3168): CWR = 0x80, ECE = 0x40.
    cwr_bit = 0x80
    ece_bit = 0x40

    features = {}

    # Flow duration: last capture timestamp minus first, in seconds.
    first_packet = flow_packets[0]
    start_time = float(first_packet.sniff_timestamp)
    end_time = float(flow_packets[-1].sniff_timestamp)
    flow_duration = end_time - start_time
    features['Flow Duration'] = flow_duration

    # Source address that defines the forward direction.
    fwd_src = first_packet.ip.src if 'IP' in first_packet else None

    tot_fwd_pkts = 0
    tot_bwd_pkts = 0
    cwe_flag_count = 0
    ece_flag_count = 0
    fwd_seg_size_min = float('inf')
    flow_iat_max = 0.0
    fwd_iat_tot = 0.0
    pkt_lens = []        # payload length of every packet carrying a data layer
    fwd_pkt_lens = []    # same, forward packets only
    prev_ts = None       # timestamp of the previous IP/TCP packet
    prev_fwd_ts = None   # timestamp of the previous forward packet

    for packet in flow_packets:
        has_tcp = 'TCP' in packet
        is_ip_tcp = has_tcp and 'IP' in packet
        is_fwd = is_ip_tcp and packet.ip.src == fwd_src

        if is_ip_tcp:
            if is_fwd:
                tot_fwd_pkts += 1
            else:
                tot_bwd_pkts += 1

            # Payload lengths are only available when a data layer exists.
            if 'data' in packet:
                pkt_len = int(packet.data.len)
                pkt_lens.append(pkt_len)
                if is_fwd:
                    fwd_pkt_lens.append(pkt_len)

            # Inter-arrival times are gaps between consecutive packets, not
            # the elapsed time since the start of the flow.
            timestamp = float(packet.sniff_timestamp)
            if prev_ts is not None:
                flow_iat_max = max(flow_iat_max, timestamp - prev_ts)
            prev_ts = timestamp
            if is_fwd:
                if prev_fwd_ts is not None:
                    fwd_iat_tot += timestamp - prev_fwd_ts
                prev_fwd_ts = timestamp

        if has_tcp:
            tcp_layer = packet.tcp

            # Count a flag only when its bit is actually set; the mere
            # presence of a flag attribute says nothing about its value.
            try:
                flag_bits = int(str(tcp_layer.flags), 16)
            except (AttributeError, ValueError):
                flag_bits = 0
            if flag_bits & cwr_bit:
                cwe_flag_count += 1
            if flag_bits & ece_bit:
                ece_flag_count += 1

            # NOTE(review): as in the original, this tracks the smallest
            # window-scale option seen on any TCP packet -- confirm that this
            # is the intended meaning of "Fwd Seg Size Min".
            if hasattr(tcp_layer, 'options_wscale'):
                fwd_seg_size_min = min(fwd_seg_size_min, int(tcp_layer.options_wscale))

    # Forward packet rate over the whole flow (0.0 for zero-length flows).
    fwd_pkts_per_s = tot_fwd_pkts / flow_duration if flow_duration > 0 else 0.0

    # Active Std (You may need to adapt this based on your data)
    active_std = 0.0
    if tot_fwd_pkts > 1:
        active_std = float(np.std([float(packet.sniff_timestamp) for packet in flow_packets]))

    features['Active Std'] = active_std
    features['Tot Fwd Pkts'] = tot_fwd_pkts
    features['Pkt Len Max'] = max(pkt_lens) if pkt_lens else 0
    features['CWE Flag Count'] = cwe_flag_count
    features['Pkt Len Min'] = min(pkt_lens) if pkt_lens else 0
    features['Fwd Pkts/s'] = fwd_pkts_per_s
    features['ECE Flag Cnt'] = ece_flag_count
    features['Fwd Seg Size Min'] = 0 if fwd_seg_size_min == float('inf') else fwd_seg_size_min
    features['Tot Bwd Pkts'] = tot_bwd_pkts
    # Population standard deviation (ddof=0) over all payload lengths.
    features['Pkt Len Std'] = float(np.std(pkt_lens)) if len(pkt_lens) > 1 else 0.0
    features['Fwd Pkt Len Mean'] = float(np.mean(fwd_pkt_lens)) if fwd_pkt_lens else 0.0
    features['Flow IAT Max'] = flow_iat_max
    features['Fwd IAT Tot'] = fwd_iat_tot

    return features

# Read the .pcap file
# NOTE(review): the recorded output of this cell is
# "RuntimeError: Cannot run the event loop while another loop is running".
# Presumably pyshark's own asyncio loop conflicts with the loop already running
# in the notebook kernel -- confirm. The commonly suggested workaround
# (nest_asyncio) is a third-party package and is deliberately not added here.
cap = pyshark.FileCapture(pcap_file)

# Initialize a list to store features for each flow
all_features = []

# Initialize variables to track the current flow
current_flow_packets = []
current_flow_key = None

try:
    # Process each packet in the .pcap file
    for packet in cap:
        if 'IP' in packet and 'tcp' in packet:
            # Direction-independent flow key: sorting the two endpoints makes
            # A->B and B->A packets share one key, so replies stay in the same
            # flow and can be counted as backward packets.
            flow_key = tuple(sorted((
                (packet.ip.src, packet.tcp.srcport),
                (packet.ip.dst, packet.tcp.dstport),
            )))

            # If the flow key changes, calculate features for the previous flow
            if flow_key != current_flow_key and current_flow_packets:
                all_features.append(calculate_features(current_flow_packets))

                # Reset for the next flow
                current_flow_packets = []

            # Append the packet to the current flow
            current_flow_packets.append(packet)
            current_flow_key = flow_key
finally:
    # Release the capture (and the tshark process behind it) even on errors.
    cap.close()

# Calculate features for the last flow in the pcap file
if current_flow_packets:
    flow_features = calculate_features(current_flow_packets)
    all_features.append(flow_features)

# Convert the list of feature dictionaries to a DataFrame
feature_df = pd.DataFrame(all_features)

# Display the calculated features
print(feature_df)

RuntimeError: Cannot run the event loop while another loop is running

In [32]:
import dpkt
import pandas as pd
import numpy as np

# Path of the .pcap capture to analyse. It is a relative path, so open() looks
# it up relative to the working directory of the running kernel/process.
pcap_file = 'test.pcap'

# Function to calculate features for a flow


def calculate_features(flow_packets):
    """Compute summary features for one flow of dpkt-decoded TCP packets.

    The forward direction is defined by the source address of the first
    packet; every packet with a different source counts as backward.

    Args:
        flow_packets: Non-empty, time-ordered sequence of
            ``(timestamp, ip, tcp)`` tuples. ``ip`` must expose ``src``;
            ``tcp`` must expose ``data`` and may expose ``flags`` (integer
            flag bits) and ``opts`` (raw TCP option bytes).

    Returns:
        dict: Feature name -> value, with these keys (in this order):
        'Flow Duration', 'Active Std', 'Tot Fwd Pkts', 'Pkt Len Max',
        'CWE Flag Count', 'Pkt Len Min', 'Fwd Pkts/s', 'ECE Flag Cnt',
        'Fwd Seg Size Min', 'Tot Bwd Pkts', 'Pkt Len Std',
        'Fwd Pkt Len Mean', 'Flow IAT Max', 'Fwd IAT Tot'.
        'Fwd Seg Size Min' is 0 (not inf) when no window-scale option is seen.

    Raises:
        IndexError: If ``flow_packets`` is empty.
    """
    # TCP header flag bits (RFC 3168) and the window-scale option kind.
    cwr_bit = 0x80
    ece_bit = 0x40
    wscale_option = 3

    features = {}

    # Flow duration: last timestamp minus first, in seconds.
    start_time = flow_packets[0][0]
    end_time = flow_packets[-1][0]
    flow_duration = float(end_time - start_time)
    features['Flow Duration'] = flow_duration

    # Source address that defines the forward direction.
    fwd_src = flow_packets[0][1].src

    tot_fwd_pkts = 0
    tot_bwd_pkts = 0
    cwe_flag_count = 0
    ece_flag_count = 0
    fwd_seg_size_min = float('inf')
    flow_iat_max = 0.0
    fwd_iat_tot = 0.0
    pkt_lens = []        # TCP payload length of every packet
    fwd_pkt_lens = []    # same, forward packets only
    prev_ts = None       # timestamp of the previous packet
    prev_fwd_ts = None   # timestamp of the previous forward packet

    for timestamp, ip, tcp in flow_packets:
        is_fwd = ip.src == fwd_src
        pkt_len = len(tcp.data)
        pkt_lens.append(pkt_len)

        if is_fwd:
            tot_fwd_pkts += 1
            fwd_pkt_lens.append(pkt_len)
        else:
            tot_bwd_pkts += 1

        # Test the actual flag bits; an attribute-existence check can never
        # tell whether CWR/ECE is set on this segment.
        flag_bits = getattr(tcp, 'flags', 0)
        if flag_bits & cwr_bit:
            cwe_flag_count += 1
        if flag_bits & ece_bit:
            ece_flag_count += 1

        # NOTE(review): as in the original, this tracks the smallest
        # window-scale option value seen on any packet -- confirm that this is
        # the intended meaning of "Fwd Seg Size Min".
        raw_opts = getattr(tcp, 'opts', b'')
        if raw_opts:
            for option in dpkt.tcp.parse_opts(raw_opts):
                if option is None:
                    # Defensive: tolerate a placeholder entry for an option
                    # that could not be parsed.
                    continue
                option_type, option_value = option
                if option_type == wscale_option:
                    fwd_seg_size_min = min(
                        fwd_seg_size_min,
                        int.from_bytes(option_value, byteorder='big'))
                    break

        # Inter-arrival times are gaps between consecutive packets, not the
        # elapsed time since the start of the flow.
        if timestamp is not None:
            if prev_ts is not None:
                flow_iat_max = max(flow_iat_max, timestamp - prev_ts)
            prev_ts = timestamp
            if is_fwd:
                if prev_fwd_ts is not None:
                    fwd_iat_tot += timestamp - prev_fwd_ts
                prev_fwd_ts = timestamp

    # Forward packet rate over the whole flow (0.0 for zero-length flows).
    fwd_pkts_per_s = tot_fwd_pkts / flow_duration if flow_duration > 0 else 0.0

    # Active Std (You may need to adapt this based on your data)
    active_std = 0.0
    if tot_fwd_pkts > 1:
        active_std = float(np.std([timestamp for timestamp, _, _ in flow_packets]))

    features['Active Std'] = active_std
    features['Tot Fwd Pkts'] = tot_fwd_pkts
    features['Pkt Len Max'] = max(pkt_lens)
    features['CWE Flag Count'] = cwe_flag_count
    features['Pkt Len Min'] = min(pkt_lens)
    features['Fwd Pkts/s'] = fwd_pkts_per_s
    features['ECE Flag Cnt'] = ece_flag_count
    features['Fwd Seg Size Min'] = 0 if fwd_seg_size_min == float('inf') else fwd_seg_size_min
    features['Tot Bwd Pkts'] = tot_bwd_pkts
    # Population standard deviation (ddof=0) over all payload lengths.
    features['Pkt Len Std'] = float(np.std(pkt_lens)) if len(pkt_lens) > 1 else 0.0
    features['Fwd Pkt Len Mean'] = float(np.mean(fwd_pkt_lens)) if fwd_pkt_lens else 0.0
    features['Flow IAT Max'] = flow_iat_max
    features['Fwd IAT Tot'] = fwd_iat_tot

    return features


# Open the .pcap file
with open(pcap_file, 'rb') as file:
    pcap = dpkt.pcap.Reader(file)

    # Initialize a list to store features for each flow
    all_features = []

    # Packets of the flow currently being assembled, and that flow's key.
    # The key must persist across loop iterations: resetting it per packet
    # would flush every TCP packet as its own single-packet flow.
    current_flow_packets = []
    current_flow_key = None

    # Process each packet in the .pcap file
    for timestamp, packet_data in pcap:
        packet = dpkt.ethernet.Ethernet(packet_data)
        if isinstance(packet.data, dpkt.ip.IP) and isinstance(packet.data.data, dpkt.tcp.TCP):
            ip = packet.data
            tcp = ip.data

            # Direction-independent flow key: sorting the two endpoints makes
            # A->B and B->A packets share one key, so replies stay in the same
            # flow and can be counted as backward packets.
            flow_key = tuple(sorted(((ip.src, tcp.sport), (ip.dst, tcp.dport))))

            # If the flow key changes, calculate features for the previous flow
            if flow_key != current_flow_key and current_flow_packets:
                flow_features = calculate_features(current_flow_packets)
                all_features.append(flow_features)

                # Reset for the next flow
                current_flow_packets = []

            # Append the packet to the current flow
            current_flow_packets.append((timestamp, ip, tcp))
            current_flow_key = flow_key

    # Calculate features for the last flow in the pcap file
    if current_flow_packets:
        flow_features = calculate_features(current_flow_packets)
        all_features.append(flow_features)

# Create a DataFrame from the extracted features
feature_df = pd.DataFrame(all_features)

# Display the names of the calculated feature columns
print(feature_df.columns)


Index(['Flow Duration', 'Active Std', 'Tot Fwd Pkts', 'Pkt Len Max',
       'CWE Flag Count', 'Pkt Len Min', 'Fwd Pkts/s', 'ECE Flag Cnt',
       'Fwd Seg Size Min', 'Tot Bwd Pkts', 'Pkt Len Std', 'Fwd Pkt Len Mean',
       'Flow IAT Max', 'Fwd IAT Tot'],
      dtype='object')
