In [1]:
import numpy as np
import pandas as pd
from importlib import reload

In [2]:
# define constants
# All paths are relative to the notebook's working directory.
TRACE_FILE_NAME = 'data/youtube.pcapng' # raw trace (.pcapng/.pcap) passed to packet_feature.tcp_generate / udp_generate
TRACE_TCP_PACKET_FEATURE_FILE_NAME = 'data/youtube_tcp_pkt.csv' # TCP packet feature .csv: passed to tcp_generate, then loaded with pd.read_csv
TRACE_UDP_PACKET_FEATURE_FILE_NAME = 'data/youtube_udp_pkt.csv' # UDP packet feature .csv: passed to udp_generate, then loaded with pd.read_csv
TRACE_PACKET_FEATURE_FILE_NAME = 'data/youtube_pkt.csv' # combined TCP+UDP feature .csv written by the last cell
# NOTE(review): LOCAL_IP is not referenced by any cell visible in this notebook,
# and the recorded outputs show 172.16.44.192 as the private endpoint — confirm
# whether this constant is still needed and whether its value is up to date.
LOCAL_IP = '172.16.26.207' # your local ip

## Extract TCP packets

In [3]:
# convert the raw trace into the TCP packet feature csv file
# (this cell only runs the TCP conversion; the UDP conversion has its own cell)
from python import packet_feature
# reload so that edits made to packet_feature since the first import are picked up
reload(packet_feature)
# %time reports how long the conversion takes
%time packet_feature.tcp_generate(TRACE_FILE_NAME,TRACE_TCP_PACKET_FEATURE_FILE_NAME)

Conversion done
CPU times: user 3.71 ms, sys: 7.87 ms, total: 11.6 ms
Wall time: 7.47 s


In [4]:
# Read in the TCP packet feature csv file, keep the packets that involve at
# least one global (public) IPv4 endpoint, and derive the remote-side features:
#   remote_ip      - ip.dst when it is global, otherwise ip.src
#   remote_ip2num  - remote_ip as an integer
#   protocol / is_tcp / is_udp - constant protocol markers
# Finally 'tcp.len' is renamed to 'pkt_len' so TCP and UDP frames share a column name.
import ipaddress


def _is_global_ipv4(value):
    """Return True when `value` parses as an IPv4 address that is global.

    Missing values (NaN) and malformed strings make ipaddress.IPv4Address raise
    a ValueError subclass; such values are reported as "not global" instead of
    aborting the whole cell.
    """
    try:
        return ipaddress.IPv4Address(value).is_global
    except ValueError:
        return False


tcp_pkt_feature_raw_df = pd.read_csv(TRACE_TCP_PACKET_FEATURE_FILE_NAME)
# evaluate each endpoint once; astype(bool) keeps the masks boolean even when the csv has no rows
tcp_src_is_global = tcp_pkt_feature_raw_df['ip.src'].map(_is_global_ipv4).astype(bool)
tcp_dst_is_global = tcp_pkt_feature_raw_df['ip.dst'].map(_is_global_ipv4).astype(bool)
filterer = tcp_src_is_global | tcp_dst_is_global
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not write into a slice (no SettingWithCopyWarning)
tcp_pkt_feature_df = tcp_pkt_feature_raw_df[filterer].copy()
record_num = tcp_pkt_feature_df.shape[0]  # kept: notebook-level name that other cells may read
# every kept row has at least one global endpoint, so falling back to ip.src
# is safe whenever ip.dst is missing, malformed or not global
tcp_pkt_feature_df['remote_ip'] = tcp_pkt_feature_df['ip.dst'].where(tcp_dst_is_global[filterer], tcp_pkt_feature_df['ip.src'])
tcp_pkt_feature_df['remote_ip2num'] = tcp_pkt_feature_df['remote_ip'].map(lambda ip: int(ipaddress.IPv4Address(ip))).astype('int64')
# scalar assignment also works on an empty frame and keeps stable dtypes
tcp_pkt_feature_df['protocol'] = 'tcp'
tcp_pkt_feature_df['is_tcp'] = 1
tcp_pkt_feature_df['is_udp'] = 0
tcp_pkt_feature_df = tcp_pkt_feature_df.rename(columns={'tcp.len':'pkt_len'})

In [5]:
# view the shape of the filtered TCP frame: (number of records, number of features)
tcp_pkt_feature_df.shape

(65980, 17)

In [6]:
# view the data type of each column of the filtered TCP frame
tcp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [7]:
# view summary statistics (count, mean, std, quantiles) of each numerical TCP column
tcp_pkt_feature_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip2num,is_tcp,is_udp
count,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0,65980.0
mean,16986.905941,46160.021552,909.981176,231.568551,15203660.0,4687994.0,0.997166,0.003092,0.003789,32.692543,2835349000.0,1.0,0.0
std,27486.571467,27477.928085,581.245107,170.945878,16684040.0,11632340.0,0.053162,0.055519,0.061439,35.635168,411213800.0,0.0,0.0
min,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69408140.0,1.0,0.0
25%,443.0,443.0,0.0,76.302237,40613.0,11463.0,1.0,0.0,0.0,21.0,2915180000.0,1.0,0.0
50%,443.0,62685.0,1288.0,212.022055,8181101.0,31449.0,1.0,0.0,0.0,21.0,2915180000.0,1.0,0.0
75%,62649.0,62685.0,1288.0,371.201224,29401080.0,50960.0,1.0,0.0,0.0,21.0,2915180000.0,1.0,0.0
max,62776.0,62776.0,1288.0,550.894756,50617980.0,50617980.0,1.0,1.0,1.0,154.0,3757885000.0,1.0,0.0


In [8]:
# view the first 5 records of the filtered TCP frame
tcp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,pkt_len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,74.125.29.189,172.16.44.192,443,62381,59,0.0,1,1,1,0,0,0,74.125.29.189,1249713597,tcp,1,0
1,172.16.44.192,74.125.29.189,62381,443,0,5.7e-05,1,60,1,0,0,0,74.125.29.189,1249713597,tcp,1,0
2,172.16.44.192,104.107.38.42,62625,80,0,0.17588,1,1,1,0,1,1,104.107.38.42,1751852586,tcp,1,0
3,104.107.38.42,172.16.44.192,80,62625,0,0.182967,1,2,1,0,1,1,104.107.38.42,1751852586,tcp,1,0
4,172.16.44.192,104.107.38.42,62625,80,0,0.183044,2,2,1,0,0,1,104.107.38.42,1751852586,tcp,1,0


## Extract UDP packets

In [9]:
# convert the raw trace into the UDP packet feature csv file
# (this cell only runs the UDP conversion; the TCP conversion has its own cell)
from python import packet_feature
# reload so that edits made to packet_feature since the first import are picked up
reload(packet_feature)
# NOTE(review): the meaning of the third positional argument (True) is defined
# by packet_feature.udp_generate, which is not visible here — confirm and name it.
%time packet_feature.udp_generate(TRACE_FILE_NAME,TRACE_UDP_PACKET_FEATURE_FILE_NAME,True)

Conversion done


No error


CPU times: user 3.44 ms, sys: 7.16 ms, total: 10.6 ms
Wall time: 6.78 s


In [10]:
# load the raw (unfiltered) UDP packet feature csv and preview its first rows;
# only a bounded preview is displayed instead of dumping the whole frame
udp_pkt_feature_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
udp_pkt_feature_df.head(10)

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,udp.length,frame.time_relative,udp.stream
0,172.16.44.192,128.122.0.11,51287,53,57,3.832918,0
1,128.122.0.11,172.16.44.192,53,51287,498,3.844097,0
2,172.16.44.192,128.122.0.11,50095,53,46,6.527524,1
3,128.122.0.11,172.16.44.192,53,50095,375,6.605329,1
4,172.16.44.192,255.255.255.255,17500,17500,177,13.740094,2
5,172.16.44.192,172.16.47.255,17500,17500,177,13.741026,3
6,172.16.44.192,128.122.0.11,64640,53,39,15.824121,4
7,128.122.0.11,172.16.44.192,53,64640,338,15.838859,4
8,172.16.44.192,128.122.0.11,60627,53,59,17.170264,5
9,128.122.0.11,172.16.44.192,53,60627,360,17.172587,5


In [11]:
# read in packet feature csv file and do some transformation
import ipaddress
def filter_illegal(row):
    """Decide whether a packet row should be kept.

    The row is kept (True) only when both row['ip.src'] and row['ip.dst']
    parse as IPv4 addresses and at least one of them is a global address.
    When either value cannot be parsed (for example NaN or a comma-joined
    pair of addresses) the parser's message is printed and False is returned.
    """
    try:
        # parse source first, then destination, so the first bad field is the one reported
        endpoints = [ipaddress.IPv4Address(row[field]) for field in ('ip.src', 'ip.dst')]
    except ValueError as parse_error:
        print(parse_error)
        return False
    return any(endpoint.is_global for endpoint in endpoints)
                                                                                                
# Load the UDP packet feature csv, drop the rows rejected by filter_illegal,
# and derive the same remote-side features as for TCP:
#   remote_ip      - ip.dst when it is global, otherwise ip.src
#   remote_ip2num  - remote_ip as an integer
#   protocol / is_tcp / is_udp - constant protocol markers
# Finally 'udp.length' is renamed to 'pkt_len' so TCP and UDP frames share a column name.
udp_pkt_feature_raw_df = pd.read_csv(TRACE_UDP_PACKET_FEATURE_FILE_NAME)
filterer = udp_pkt_feature_raw_df.apply(filter_illegal,axis=1)
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not write into a slice (no SettingWithCopyWarning)
udp_pkt_feature_df = udp_pkt_feature_raw_df[filterer].copy()
record_num = udp_pkt_feature_df.shape[0]  # kept: notebook-level name that other cells may read
# filter_illegal only keeps rows whose two addresses both parse, so IPv4Address cannot raise here
udp_dst_is_global = udp_pkt_feature_df['ip.dst'].map(lambda ip: ipaddress.IPv4Address(ip).is_global).astype(bool)
udp_pkt_feature_df['remote_ip'] = udp_pkt_feature_df['ip.dst'].where(udp_dst_is_global, udp_pkt_feature_df['ip.src'])
udp_pkt_feature_df['remote_ip2num'] = udp_pkt_feature_df['remote_ip'].map(lambda ip: int(ipaddress.IPv4Address(ip))).astype('int64')
# scalar assignment also works on an empty frame and keeps stable dtypes
udp_pkt_feature_df['protocol'] = 'udp'
udp_pkt_feature_df['is_tcp'] = 0
udp_pkt_feature_df['is_udp'] = 1
udp_pkt_feature_df = udp_pkt_feature_df.rename(columns={'udp.length':'pkt_len'})

Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in 'nan'
Expected 4 octets in '172.16.44.192,172.19.218.241'
Expected 4 octets in '172.16.44.192,172.19.218.241'
Expected 4 octets in '172.16.44.192,172.19.218.241'
Expected 4 octets in '172.16.44.192,172.19.218.241'
Expected 4 octets in '172.16.44.192,172.19.218.241'
Expected 4 octets in 'nan'


In [12]:
# view the shape of the filtered UDP frame: (number of records, number of features)
udp_pkt_feature_df.shape

(148, 12)

In [13]:
# view the data type of each column of the filtered UDP frame
udp_pkt_feature_df.dtypes

ip.src                  object
ip.dst                  object
udp.srcport              int64
udp.dstport              int64
pkt_len                  int64
frame.time_relative    float64
udp.stream               int64
remote_ip               object
remote_ip2num            int64
protocol                object
is_tcp                   int64
is_udp                   int64
dtype: object

In [14]:
# view summary statistics (count, mean, std, quantiles) of each numerical UDP column
udp_pkt_feature_df.describe()

Unnamed: 0,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip2num,is_tcp,is_udp
count,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0
mean,30500.628378,24119.858108,214.912162,395.519799,37.918919,2424540000.0,0.0,1.0
std,28783.37488,28241.002863,175.69779,170.897268,20.719544,647559000.0,0.0,0.0
min,53.0,53.0,35.0,3.832918,0.0,2155479000.0,0.0,1.0
25%,53.0,53.0,49.75,347.057401,21.0,2155479000.0,0.0,1.0
50%,50027.5,1900.0,136.5,491.469643,38.0,2155479000.0,0.0,1.0
75%,56656.0,55405.25,358.25,492.689237,54.0,2155479000.0,0.0,1.0
max,65497.0,65497.0,517.0,550.88291,79.0,4026532000.0,0.0,1.0


In [15]:
# view the first 5 records of the filtered UDP frame
# (index labels are those of the csv rows, so filtered-out rows leave gaps)
udp_pkt_feature_df.head()

Unnamed: 0,ip.src,ip.dst,udp.srcport,udp.dstport,pkt_len,frame.time_relative,udp.stream,remote_ip,remote_ip2num,protocol,is_tcp,is_udp
0,172.16.44.192,128.122.0.11,51287,53,57,3.832918,0,128.122.0.11,2155479051,udp,0,1
1,128.122.0.11,172.16.44.192,53,51287,498,3.844097,0,128.122.0.11,2155479051,udp,0,1
2,172.16.44.192,128.122.0.11,50095,53,46,6.527524,1,128.122.0.11,2155479051,udp,0,1
3,128.122.0.11,172.16.44.192,53,50095,375,6.605329,1,128.122.0.11,2155479051,udp,0,1
6,172.16.44.192,128.122.0.11,64640,53,39,15.824121,4,128.122.0.11,2155479051,udp,0,1


## Combine TCP with UDP packets

In [16]:
# combine the TCP and UDP frames, keeping only the feature columns they share;
# pd.concat is used because DataFrame.append was deprecated and later removed from pandas
shared_feature_columns = ['remote_ip2num','is_tcp','is_udp','pkt_len']
pkt_feature_df = pd.concat(
    [tcp_pkt_feature_df[shared_feature_columns], udp_pkt_feature_df[shared_feature_columns]],
    ignore_index=True,  # renumber rows 0..n-1 instead of keeping the per-protocol index labels
)

In [17]:
# view the shape of the combined TCP+UDP frame: (number of records, number of features)
pkt_feature_df.shape

(66128, 4)

In [18]:
# view the data type of each column of the combined frame
pkt_feature_df.dtypes

remote_ip2num    int64
is_tcp           int64
is_udp           int64
pkt_len          int64
dtype: object

In [19]:
# view summary statistics (count, mean, std, quantiles) of each column of the combined frame
pkt_feature_df.describe()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
count,66128.0,66128.0,66128.0,66128.0
mean,2834430000.0,0.997762,0.002238,908.425553
std,412343800.0,0.047256,0.047256,581.581654
min,69408140.0,0.0,0.0,0.0
25%,2915180000.0,1.0,0.0,0.0
50%,2915180000.0,1.0,0.0,1288.0
75%,2915180000.0,1.0,0.0,1288.0
max,4026532000.0,1.0,1.0,1288.0


In [20]:
# view the first 5 records of the combined frame
pkt_feature_df.head()

Unnamed: 0,remote_ip2num,is_tcp,is_udp,pkt_len
0,1249713597,1,0,59
1,1249713597,1,0,0
2,1751852586,1,0,0
3,1751852586,1,0,0
4,1751852586,1,0,0


In [21]:
# write the combined feature frame to csv; index=False omits the row index column
pkt_feature_df.to_csv(TRACE_PACKET_FEATURE_FILE_NAME, index=False)