In [1]:
from scapy.all import *
import csv
import os
import pandas as pd
import numpy as np



In [3]:
def run_for_all(pcap_file):
    packets=rdpcap(pcap_file)
    if 'netflix' in pcap_file or 'youtube' in pcap_file:
        label = 'video_stream'
    elif 'skype' in pcap_file:
        label = 'messaging'
    elif 'sftp' in pcap_file:
        label = 'file_transfer'
    elif 'regular_browsing' in pcap_file:
        label = 'regular_browsing'
    else:
        print(pcap_file)
        
    print(label,pcap_file, len(packets))

    packet_list = []
    
    for packet in packets:
        if IP in packet:
            packet_info = {
                'label': label,
                'timestamp': packet.time,
                'source_ip': packet[IP].src,
                'destination_ip': packet[IP].dst,
                'protocol': packet[IP].proto,
                'source_port': packet[IP].sport if TCP in packet or UDP in packet else None,
                'destination_port': packet[IP].dport if TCP in packet or UDP in packet else None,
                'packet_length': packet[IP].len
            }
            packet_list.append(packet_info)
    
    df = pd.DataFrame(packet_list)
    return df

In [9]:
df = run_for_all("dataset/nonvpn_sftp_capture2.pcap")

file_transfer dataset/nonvpn_sftp_capture2.pcap 338865


In [10]:
print(df.head)

<bound method NDFrame.head of                 label          timestamp     source_ip destination_ip  \
0       file_transfer  1563285946.421604    10.113.1.2     10.115.1.2   
1       file_transfer  1563285946.421654    10.113.1.2     10.115.1.2   
2       file_transfer  1563285946.421993    10.115.1.2     10.113.1.2   
3       file_transfer  1563285946.422040    10.115.1.2     10.113.1.2   
4       file_transfer  1563285946.422849  10.113.1.150   10.115.1.123   
...               ...                ...           ...            ...   
338860  file_transfer  1563285955.549798    10.115.1.2     10.113.1.2   
338861  file_transfer  1563285955.550136    10.113.1.2     10.115.1.2   
338862  file_transfer  1563285955.550200    10.113.1.2     10.115.1.2   
338863  file_transfer  1563285955.551482  10.115.1.123   10.113.1.150   
338864  file_transfer  1563285955.551855  10.113.1.150   10.115.1.123   

        protocol  source_port  destination_port  packet_length  
0             17        1892

In [11]:
max_timestamp = df['timestamp'].max()
min_timestamp = df['timestamp'].min()

In [16]:
print(max_timestamp)
print(min_timestamp)
difference = max_timestamp - min_timestamp
print("difference = ", difference)
print()

1563285955.551855
1563285946.421604
difference =  9.130251



In [17]:
print(len(df)/difference)

37114.53277680975035626074245


In [19]:
unique_protocols = df['protocol'].unique()
print(unique_protocols)

[17  6]


In [20]:
df = run_for_all("dataset/vpn_youtube_capture1.pcap")

video_stream dataset/vpn_youtube_capture1.pcap 119639


In [4]:
def analyse_flow(df):
    df = df.sort_values(by='timestamp')
    print(df.head)
    max_timestamp = df['timestamp'].max()
    min_timestamp = df['timestamp'].min()
    print(max_timestamp)
    print(min_timestamp)
    difference = max_timestamp - min_timestamp
    print("difference = ", difference)
    print()
    print(len(df))
    print(len(df)/difference)
    print()

In [33]:
analyse_flow(df)

<bound method NDFrame.head of                   label          timestamp       source_ip destination_ip  \
0      regular_browsing  1710712073.207083    192.168.0.24  192.168.0.255   
1      regular_browsing  1710712074.232191    192.168.0.24  192.168.0.255   
2      regular_browsing  1710712075.254705    192.168.0.24  192.168.0.255   
3      regular_browsing  1710712076.176512    192.168.0.24  192.168.0.255   
4      regular_browsing  1710712077.200380    192.168.0.24  192.168.0.255   
...                 ...                ...             ...            ...   
82515  regular_browsing  1710712363.188150   192.168.0.189  54.159.138.33   
82516  regular_browsing  1710712363.205631    192.168.0.24  192.168.0.255   
82517  regular_browsing  1710712363.206057  100.20.199.161  192.168.0.189   
82518  regular_browsing  1710712363.206702  54.184.255.164  192.168.0.189   
82519  regular_browsing  1710712363.294630   54.159.138.33  192.168.0.189   

       protocol  source_port  destination_por

In [34]:
df = run_for_all("dataset/nonvpn_netflix_capture1.pcap")

video_stream dataset/nonvpn_netflix_capture1.pcap 240855


In [35]:
analyse_flow(df)

<bound method NDFrame.head of                label          timestamp     source_ip destination_ip  \
0       video_stream  1561555171.088646    10.122.1.2     10.124.1.2   
1       video_stream  1561555171.089023    10.124.1.2     10.122.1.2   
2       video_stream  1561555171.091051  10.122.1.103   10.124.1.194   
3       video_stream  1561555171.091322  10.124.1.194   10.122.1.103   
4       video_stream  1561555171.091661  10.122.1.103   10.124.1.194   
...              ...                ...           ...            ...   
240850  video_stream  1561556950.080565  10.122.1.103   10.124.1.194   
240851  video_stream  1561556950.080592  10.124.1.194   10.122.1.103   
240852  video_stream  1561556950.080601  10.124.1.194   10.122.1.103   
240853  video_stream  1561556950.080609  10.124.1.194   10.122.1.103   
240854  video_stream  1561556950.080631  10.124.1.194   10.122.1.103   

        protocol  source_port  destination_port  packet_length  
0             17        45509           

In [36]:
df = run_for_all("dataset/nonvpn_skype-chat_capture3.pcap")

messaging dataset/nonvpn_skype-chat_capture3.pcap 21194


In [37]:
analyse_flow(df)

<bound method NDFrame.head of            label          timestamp     source_ip destination_ip  protocol  \
0      messaging  1563463092.850291  10.116.1.162   10.118.1.100         6   
1      messaging  1563463095.912290  10.118.1.100   10.116.1.162         6   
2      messaging  1563463095.914080  10.116.1.162   10.118.1.100         6   
3      messaging  1563463095.969538  10.118.1.100   10.116.1.162         6   
4      messaging  1563463097.807768  10.116.1.162   10.118.1.100         6   
...          ...                ...           ...            ...       ...   
21189  messaging  1563477186.391517  10.116.1.162   10.118.1.100         6   
21190  messaging  1563477186.448646  10.118.1.100   10.116.1.162         6   
21191  messaging  1563477186.449170  10.116.1.162   10.118.1.100         6   
21192  messaging  1563477186.488271  10.118.1.100   10.116.1.162         6   
21193  messaging  1563477186.529076  10.116.1.162   10.118.1.100         6   

       source_port  destination_p

In [38]:
df = run_for_all("dataset/regular_browsing1.pcap")

regular_browsing dataset/regular_browsing1.pcap 82574


In [39]:
analyse_flow(df)


<bound method NDFrame.head of                   label          timestamp       source_ip destination_ip  \
0      regular_browsing  1710712073.207083    192.168.0.24  192.168.0.255   
1      regular_browsing  1710712074.232191    192.168.0.24  192.168.0.255   
2      regular_browsing  1710712075.254705    192.168.0.24  192.168.0.255   
3      regular_browsing  1710712076.176512    192.168.0.24  192.168.0.255   
4      regular_browsing  1710712077.200380    192.168.0.24  192.168.0.255   
...                 ...                ...             ...            ...   
82515  regular_browsing  1710712363.188150   192.168.0.189  54.159.138.33   
82516  regular_browsing  1710712363.205631    192.168.0.24  192.168.0.255   
82517  regular_browsing  1710712363.206057  100.20.199.161  192.168.0.189   
82518  regular_browsing  1710712363.206702  54.184.255.164  192.168.0.189   
82519  regular_browsing  1710712363.294630   54.159.138.33  192.168.0.189   

       protocol  source_port  destination_por

In [20]:
def aggregate_packets(df):
    # Assuming 'df' is your DataFrame
    # First, sort the DataFrame by the 'timestamp'
    print(df.columns)
    print(df.head())
    df = df.sort_values(by='timestamp')
    df['timestamp'] = df['timestamp'].astype(float)
    # Convert timestamps to a datetime format for easier manipulation
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    
    # Calculate time differences to find inter-arrival times
    df['time_diff'] = df['timestamp'].diff().dt.total_seconds().fillna(0)
    
    # Group by 0.25 second intervals. Assuming the first timestamp is the start.
    df['time_interval'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds() // 0.25
    
    # Group by 'time_interval' and calculate the required attributes
    aggregated_data = df.groupby('time_interval').agg(
        number_of_packets=('timestamp', 'size'),
        average_packet_length=('packet_length', 'mean'),
        port=('destination_port', 'first'),  # Assuming 'port' means destination_port here
        number_of_unique_dests=('destination_ip', 'nunique'),
        TCP_protocol_count=('protocol', lambda x: (x==6).sum()),
        UDP_protocol_count=('protocol', lambda x: (x==17).sum()),
        #Inter_arrival_time_variance=('time_diff', np.var),
        #Packet_length_variance=('packet_length', np.var)
        #Inter_arrival_time_variance=('time_diff', lambda x: np.var(x, ddof=1)),
        #Packet_length_variance=('packet_length', lambda x: np.var(x, ddof=1))
        Packet_length_variance=('packet_length', lambda x: x.var() if len(x) > 1 else 0),
        Inter_arrival_time_variance=('time_diff', lambda x: x.var() if len(x) > 1 else 0)
    ).reset_index(drop=True)
    
    return aggregated_data

In [46]:
df = run_for_all("dataset/nonvpn_sftp_capture2.pcap")

file_transfer dataset/nonvpn_sftp_capture2.pcap 338865


In [47]:
analyse_flow(df)

<bound method NDFrame.head of                 label          timestamp     source_ip destination_ip  \
0       file_transfer  1563285946.421604    10.113.1.2     10.115.1.2   
1       file_transfer  1563285946.421654    10.113.1.2     10.115.1.2   
2       file_transfer  1563285946.421993    10.115.1.2     10.113.1.2   
3       file_transfer  1563285946.422040    10.115.1.2     10.113.1.2   
4       file_transfer  1563285946.422849  10.113.1.150   10.115.1.123   
...               ...                ...           ...            ...   
338860  file_transfer  1563285955.549798    10.115.1.2     10.113.1.2   
338861  file_transfer  1563285955.550136    10.113.1.2     10.115.1.2   
338862  file_transfer  1563285955.550200    10.113.1.2     10.115.1.2   
338863  file_transfer  1563285955.551482  10.115.1.123   10.113.1.150   
338864  file_transfer  1563285955.551855  10.113.1.150   10.115.1.123   

        protocol  source_port  destination_port  packet_length  
0             17        1892

In [55]:
stfp_agg_data = aggregate_packets(df)

In [56]:
print(stfp_agg_data)

    number_of_packets  average_packet_length   port  number_of_unique_dests  \
0                  54             163.055556     53                       4   
1                5076             917.598109     53                       4   
2                7251             915.878362     22                       2   
3                7305             908.544011     22                       2   
4                7477             905.931523     22                       2   
5                7728             900.857660     22                       2   
6                9922             914.801653  45280                       2   
7               10402             900.618150     22                       2   
8                9729             913.475588  45280                       2   
9                9412             909.008075     22                       2   
10               9609             912.338016  45280                       2   
11              10164             908.276466     22 

In [5]:
messaging_df = run_for_all("dataset/nonvpn_skype-chat_capture3.pcap")
print(messaging_df.head())
print(messaging_df.columns)

messaging dataset/nonvpn_skype-chat_capture3.pcap 21194
       label          timestamp     source_ip destination_ip  protocol  \
0  messaging  1563463092.850291  10.116.1.162   10.118.1.100         6   
1  messaging  1563463095.912290  10.118.1.100   10.116.1.162         6   
2  messaging  1563463095.914080  10.116.1.162   10.118.1.100         6   
3  messaging  1563463095.969538  10.118.1.100   10.116.1.162         6   
4  messaging  1563463097.807768  10.116.1.162   10.118.1.100         6   

   source_port  destination_port  packet_length  
0      58739.0            5061.0             40  
1       5061.0           58739.0            275  
2      58739.0            5061.0            495  
3       5061.0           58739.0             40  
4      58739.0            5061.0            205  
Index(['label', 'timestamp', 'source_ip', 'destination_ip', 'protocol',
       'source_port', 'destination_port', 'packet_length'],
      dtype='object')


In [21]:
messaging_agg_data = aggregate_packets(messaging_df)

Index(['label', 'timestamp', 'source_ip', 'destination_ip', 'protocol',
       'source_port', 'destination_port', 'packet_length'],
      dtype='object')
       label          timestamp     source_ip destination_ip  protocol  \
0  messaging  1563463092.850291  10.116.1.162   10.118.1.100         6   
1  messaging  1563463095.912290  10.118.1.100   10.116.1.162         6   
2  messaging  1563463095.914080  10.116.1.162   10.118.1.100         6   
3  messaging  1563463095.969538  10.118.1.100   10.116.1.162         6   
4  messaging  1563463097.807768  10.116.1.162   10.118.1.100         6   

   source_port  destination_port  packet_length  
0      58739.0            5061.0             40  
1       5061.0           58739.0            275  
2      58739.0            5061.0            495  
3       5061.0           58739.0             40  
4      58739.0            5061.0            205  


In [22]:
print(messaging_agg_data)

      number_of_packets  average_packet_length     port  \
0                     1              40.000000   5061.0   
1                     3             270.000000  58739.0   
2                     2             204.500000   5061.0   
3                     1              40.000000   5061.0   
4                     3             194.333333  58739.0   
...                 ...                    ...      ...   
7254                  7             628.285714   5061.0   
7255                  1             466.000000   5061.0   
7256                  6             682.500000  58739.0   
7257                  2             554.000000  58739.0   
7258                  8             445.750000  58739.0   

      number_of_unique_dests  TCP_protocol_count  UDP_protocol_count  \
0                          1                   1                   0   
1                          2                   3                   0   
2                          2                   2                   0   
3  

In [23]:
nan_packet_length_variance = messaging_agg_data['Packet_length_variance'].isna().sum()
nan_interarrival_time_variance = messaging_agg_data['Inter_arrival_time_variance'].isna().sum()

print("Number of NaN in Packet_length_variance:", nan_packet_length_variance)
print("Number of NaN in Inter_arrival_time_variance:", nan_interarrival_time_variance)

Number of NaN in Packet_length_variance: 0
Number of NaN in Inter_arrival_time_variance: 0


In [24]:
nan_packet_length = messaging_df['packet_length'].isna().sum()
nan_timestamp = messaging_df['timestamp'].isna().sum()

print("Number of NaN in 'packet_length' column:", nan_packet_length)
print("Number of NaN in 'timestamp' column:", nan_timestamp)
print("total rows = ", len(messaging_df))

Number of NaN in 'packet_length' column: 0
Number of NaN in 'timestamp' column: 0
total rows =  21194


In [31]:
data = pd.read_csv('agg_flow_data.csv')
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [32]:
data.columns

Index(['number_of_packets', 'average_packet_length', 'number_of_unique_dests',
       'TCP_protocol_count', 'UDP_protocol_count', 'Packet_length_variance',
       'Inter_arrival_time_variance', 'label'],
      dtype='object')

In [34]:
average_packet_length = data.groupby('label')['number_of_packets'].mean()
print(average_packet_length)

label
file_transfer       1241.254545
messaging              2.917121
regular_browsing      43.604619
video_stream         271.257358
Name: number_of_packets, dtype: float64
