### Import Libraries

In [4]:
import sys
from scapy.all import *
import numpy as np
import pandas as pd

import pickle
import datetime
import sys
import os
import ipaddress
import netaddr

### Upload Files from Local Machine (for Google Colaboratory)

Here is code to upload a data file from your local machine. For the purposes of this notebook, you should upload a packet capture ("pcap") file. A pcap file is generated from a network traffic capture, using a tool such as [Wireshark](https://wireshark.org) or tcpdump.

In [10]:
from google.colab import files

# Open the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, contents in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents)))

Saving google_home.pcap to google_home.pcap
User uploaded file "google_home.pcap" with length 77630488 bytes


### Example of Using Scapy PcapReader to parse pcap.

This is a toy function. All it does is use Scapy's PcapReader to read the capture and print the destination IP address of the first IP packet it finds.

In [5]:
def load_pcap(pcap_file):
  '''Prints the destination IP address of the first IP packet in a pcap file.

  Arguments:
    pcap_file: string filepath of pcap file
  Returns:
    None. Nothing is printed if the capture contains no IP packet.
  '''
  # Use the reader as a context manager so the capture file is closed even
  # though we stop reading as soon as the first IP packet is found.
  with PcapReader(pcap_file) as pcap_reader:
    for pkt in pcap_reader:
      if IP in pkt:
        print(pkt[IP].dst)
        break

If you upload to Google Colaboratory as above, you can load the pcap by file name alone, without a directory.  Otherwise you have to tell Python/Jupyter where the file is.

In [6]:
# Directory holding the example capture, relative to the working directory.
# It is named `pcap_dir` rather than `dir` so the built-in dir() is not
# shadowed for the rest of the session.
pcap_dir = '../example_pcaps'
# `file` keeps its name because later cells read it.
file = "{}/google_home.pcap".format(pcap_dir)
print(file)

../example_pcaps/google_home.pcap


In [7]:
# Prints the destination address of the first IP packet found in the capture.
load_pcap(file)

255.255.255.255


### Create a Pandas Data Frame from a Pcap

In [9]:
def pcap_to_dict(pcap_file):
    '''Parses a pcap file into a list of dicts.

    Packets without an Ethernet layer are skipped. Parsing is best effort: a
    packet that raises an error while its fields are extracted is skipped as
    well, and one warning with the number of such packets is issued at the end.

    Arguments:
      pcap_file: string filepath of pcap file
    Returns:
      List of dicts with one dict per parsed packet in pcap file.
        The dicts have the following key/value pairs:
          "time"     : time the packet was received in seconds since epoch (float)
          "datetime" : time the packet was received as a datetime object
          "length"   : length of packet in bytes
          "mac_src"  : source MAC address
          "mac_dst"  : destination MAC address
          "ip_src"   : source IP address, or None if there is no IP layer
          "ip_dst"   : destination IP address, or None if there is no IP layer
          "protocol" : 'TCP', 'UDP', 'ICMP', or None
          "port_src" : source port (TCP/UDP only, else None)
          "port_dst" : destination port (TCP/UDP only, else None)
          "is_dns"   : True if packet has a DNS question or resource record, else False
          "dns_query" : qname of the DNS question record, else None
          "dns_resp" : rrname of the DNS resource record, else None'''
    import warnings  # function-scope import; only used to report skipped packets

    data = []
    skipped = 0
    with PcapReader(pcap_file) as pcap_reader:
        for pkt in pcap_reader:
            try:
                if Ether not in pkt:
                    continue

                # pkt.time is not guaranteed to be a plain float (recent Scapy
                # versions appear to use a Decimal subclass -- see Scapy docs),
                # and datetime.fromtimestamp() rejects such values on current
                # Python versions. Convert once and reuse the float.
                timestamp = float(pkt.time)

                # Start from a complete record so every dict has the same keys.
                pkt_dict = {
                    "time": timestamp,
                    "datetime": datetime.datetime.fromtimestamp(timestamp),
                    "length": len(pkt),
                    "mac_dst": pkt[Ether].dst,
                    "mac_src": pkt[Ether].src,
                    "ip_dst": None,
                    "ip_src": None,
                    "protocol": None,
                    "port_dst": None,
                    "port_src": None,
                    "is_dns": False,
                    "dns_query": None,
                    "dns_resp": None,
                }

                if IP in pkt:
                    pkt_dict["ip_dst"] = pkt[IP].dst
                    pkt_dict["ip_src"] = pkt[IP].src

                if TCP in pkt:
                    pkt_dict["port_dst"] = pkt[TCP].dport
                    pkt_dict["port_src"] = pkt[TCP].sport
                    pkt_dict["protocol"] = 'TCP'

                elif UDP in pkt:
                    pkt_dict["port_dst"] = pkt[UDP].dport
                    pkt_dict["port_src"] = pkt[UDP].sport
                    pkt_dict["protocol"] = 'UDP'

                elif ICMP in pkt:
                    pkt_dict["protocol"] = 'ICMP'

                if DNSQR in pkt:
                    pkt_dict["is_dns"] = True
                    pkt_dict["dns_query"] = pkt[DNSQR].qname
                if DNSRR in pkt:
                    pkt_dict["is_dns"] = True
                    pkt_dict["dns_resp"] = pkt[DNSRR].rrname

                data.append(pkt_dict)
            except Exception:
                # Best effort: drop the packet but keep count. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop a long-running parse.
                skipped += 1
                continue

    if skipped:
        warnings.warn(
            "pcap_to_dict skipped {} packet(s) that could not be parsed".format(skipped))
    return data
          
          

In [21]:
# load pcap file into a list of dicts (one dict per parsed packet)
pdict = pcap_to_dict(file)

# from the list of dicts, create a Pandas data frame (one row per packet)
ppd = pd.DataFrame(pdict)

In [22]:
# show the first ten rows of the data frame
ppd.head(10)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2017-12-07 14:48:41.156083,,,,,False,113,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512680000.0
1,2017-12-07 14:48:41.255304,,,,,False,135,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512680000.0
2,2017-12-07 14:48:41.256350,,,,,False,169,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,,1512680000.0
3,2017-12-07 14:48:41.300373,,,,,False,113,b8:27:eb:2d:24:15,a4:77:33:2f:e0:6e,,,,1512680000.0
4,2017-12-07 14:48:41.303218,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512680000.0
5,2017-12-07 14:48:41.412617,,,,,False,78,33:33:ff:2f:e0:6e,a4:77:33:2f:e0:6e,,,,1512680000.0
6,2017-12-07 14:48:41.432318,,,255.255.255.255,0.0.0.0,False,393,ff:ff:ff:ff:ff:ff,a4:77:33:2f:e0:6e,67.0,68.0,UDP,1512680000.0
7,2017-12-07 14:48:42.414223,,,,,False,70,33:33:00:00:00:02,a4:77:33:2f:e0:6e,,,,1512680000.0
8,2017-12-07 14:48:43.204444,,,,,False,90,33:33:00:00:00:16,a4:77:33:2f:e0:6e,,,,1512680000.0
9,2017-12-07 14:48:44.186461,,,172.24.1.51,172.24.1.1,False,62,a4:77:33:2f:e0:6e,b8:27:eb:2d:24:15,,,ICMP,1512680000.0


## Loading your own packet capture file

In the next part of the exercise, we will perform our own packet capture in Wireshark (https://wireshark.org) and analyze it using some basic Pandas manipulation tools.

Only use the next cell if you are using Google Colaboratory to upload a file.  Otherwise you can load it from the local filesystem.

In [32]:
# `files` is the google.colab module imported in the earlier upload cell.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, contents in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents)))

Saving example.pcap to example.pcap
User uploaded file "example.pcap" with length 2001696 bytes


In [10]:
# Make sure that this cell prints out a filename that exists! (The one for your Wireshark capture.)
# Edit `capture_dir` and `filename` so they point at your own capture.
# (`capture_dir` is used instead of `dir` so the built-in dir() is not shadowed.)
capture_dir = '/Users/feamster/Downloads'
filename = 'example-20191005'
# `file` keeps its name because the following cells read it.
file = "{}/{}.pcap".format(capture_dir, filename)
print(file)

/Users/feamster/Downloads/example-20191005.pcap


In [13]:
# Turn your pcap into a Pandas data frame.
# pcap_to_dict returns a list with one dict per parsed packet; each dict becomes one row.
ex_pdict = pcap_to_dict(file)
ex_ppd = pd.DataFrame(ex_pdict)

In [15]:
# display the first ten rows
ex_ppd.head(10)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2019-10-05 20:39:35.507428,,,192.204.13.35,192.168.1.13,False,651,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,62673.0,UDP,1570326000.0
1,2019-10-05 20:39:35.507429,,,192.204.13.35,192.168.1.13,False,651,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,62673.0,UDP,1570326000.0
2,2019-10-05 20:39:35.508774,,,192.168.1.13,192.204.13.35,False,60,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,62673.0,8801.0,UDP,1570326000.0
3,2019-10-05 20:39:35.508778,,,192.168.1.13,192.204.13.35,False,868,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,61420.0,8801.0,UDP,1570326000.0
4,2019-10-05 20:39:35.508984,,,192.204.13.35,192.168.1.13,False,157,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0
5,2019-10-05 20:39:35.519036,,,192.204.13.35,192.168.1.13,False,94,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,62673.0,UDP,1570326000.0
6,2019-10-05 20:39:35.523732,,,192.168.1.13,192.204.13.35,False,99,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,53207.0,8801.0,UDP,1570326000.0
7,2019-10-05 20:39:35.523897,,,192.204.13.35,192.168.1.13,False,292,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0
8,2019-10-05 20:39:35.523898,,,192.204.13.35,192.168.1.13,False,1282,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0
9,2019-10-05 20:39:35.534069,,,192.204.13.35,192.168.1.13,False,1282,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0


### Exploring the Dataset

Now you have some basic examples for taking a packet capture and loading it into a Pandas data frame.  You could then explore the dataset. First, you can use the "head" function to look at some entries in the data frame. For example head(1) looks at the first entry.

In [17]:
# head(1) returns only the first row of the data frame
ex_ppd.head(1)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
0,2019-10-05 20:39:35.507428,,,192.204.13.35,192.168.1.13,False,651,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,62673.0,UDP,1570326000.0


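You can also look at the shape and size of the packet capture. This should match the number of packets that Wireshark told you it captured for your trace. (`pcap_to_dict` skips packets that have no Ethernet layer, so the count can be lower for captures that contain such packets.)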

In [19]:
# (number of rows, number of columns): one row per packet kept by pcap_to_dict,
# one column per extracted field
ex_ppd.shape

(1209, 13)

describe() tells us some basic stats on numerical columns. Some of this is meaningless, because it doesn't make any sense to take a mean of the port values.  But the stats on the time and length columns can be useful sanity checks.

In [20]:
# Summary statistics for the numeric columns. The port statistics are not
# meaningful for packet data; this is mainly to show an example.
ex_ppd.describe()

Unnamed: 0,length,port_dst,port_src,time
count,1209.0,1205.0,1205.0,1209.0
mean,622.79239,21949.026556,45811.240664,1570326000.0
std,479.581695,23030.730322,23808.302508,1.390592
min,34.0,80.0,80.0,1570326000.0
25%,129.0,8801.0,8801.0,1570326000.0
50%,532.0,8801.0,61420.0,1570326000.0
75%,1122.0,53207.0,61420.0,1570326000.0
max,1326.0,63232.0,63232.0,1570326000.0


### Selecting Data

**Selecting Based on Index**

In [21]:
# first row, selected by integer position
ex_ppd.iloc[0]

datetime     2019-10-05 20:39:35.507428
dns_query                          None
dns_resp                           None
ip_dst                    192.204.13.35
ip_src                     192.168.1.13
is_dns                            False
length                              651
mac_dst               08:02:8e:92:27:27
mac_src               3c:15:c2:d9:d3:50
port_dst                           8801
port_src                          62673
protocol                            UDP
time                        1.57033e+09
Name: 0, dtype: object

Pandas allows you to "slice" the data structure. Here we slice by index positions ("12:14"), giving us the thirteenth and fourteenth entries in the data frame (positions start at 0, and the end of the slice is excluded).

In [24]:
# rows at integer positions 12 and 13 (the end of the slice is exclusive)
ex_ppd.iloc[12:14]

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
12,2019-10-05 20:39:35.544286,,,192.204.13.35,192.168.1.13,False,1281,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0
13,2019-10-05 20:39:35.556435,,,192.204.13.35,192.168.1.13,False,1281,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,8801.0,61420.0,UDP,1570326000.0


**Selecting Based on Conditionals**

Select all TCP packets.

In [26]:
# A boolean mask picks the TCP rows; show only the first ten of them.
ex_ppd.loc[ex_ppd['protocol'] == 'TCP'].head(10)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
29,2019-10-05 20:39:35.614173,,,18.211.118.21,192.168.1.13,False,129,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62096.0,TCP,1570326000.0
45,2019-10-05 20:39:35.644049,,,192.168.1.13,18.211.118.21,False,66,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,62096.0,443.0,TCP,1570326000.0
46,2019-10-05 20:39:35.644052,,,192.168.1.13,18.211.118.21,False,129,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,62096.0,443.0,TCP,1570326000.0
47,2019-10-05 20:39:35.644087,,,18.211.118.21,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62096.0,TCP,1570326000.0
92,2019-10-05 20:39:35.823151,,,192.204.13.35,192.168.1.13,False,229,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0
93,2019-10-05 20:39:35.823178,,,192.204.13.35,192.168.1.13,False,232,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0
94,2019-10-05 20:39:35.823232,,,192.204.13.35,192.168.1.13,False,234,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0
99,2019-10-05 20:39:35.830945,,,17.253.25.201,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,80.0,63232.0,TCP,1570326000.0
100,2019-10-05 20:39:35.840029,,,192.168.1.13,17.253.25.201,False,66,3c:15:c2:d9:d3:50,08:02:8e:92:27:27,63232.0,80.0,TCP,1570326000.0
101,2019-10-05 20:39:35.840091,,,17.253.25.201,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,80.0,63232.0,TCP,1570326000.0


Select all TCP packets destined for port 443.

In [28]:
# Build the two conditions separately, then combine them element-wise with &.
is_tcp = ex_ppd['protocol'] == 'TCP'
to_port_443 = ex_ppd['port_dst'] == 443
tls_packets = ex_ppd[is_tcp & to_port_443]
tls_packets.head(5)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
29,2019-10-05 20:39:35.614173,,,18.211.118.21,192.168.1.13,False,129,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62096.0,TCP,1570326000.0
47,2019-10-05 20:39:35.644087,,,18.211.118.21,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62096.0,TCP,1570326000.0
92,2019-10-05 20:39:35.823151,,,192.204.13.35,192.168.1.13,False,229,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0
93,2019-10-05 20:39:35.823178,,,192.204.13.35,192.168.1.13,False,232,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0
94,2019-10-05 20:39:35.823232,,,192.204.13.35,192.168.1.13,False,234,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,443.0,62250.0,TCP,1570326000.0


How many TLS packets are in the trace? (Here we treat TCP packets destined for port 443 as TLS.) Let's look at the shape of the resulting data frame.

In [29]:
# (rows, columns) of the filtered frame; the row count is the number of matching packets
tls_packets.shape

(58, 13)

### Compute Statistics

Compute Packet Statistics

In [31]:
# Pull the length column out once and summarise it.
pkt_lengths = ex_ppd['length']
print('Average Packet Length:', pkt_lengths.mean())
print('Minimum Packet Length:', pkt_lengths.min())
print('Maximum Packet Length:', pkt_lengths.max())
print('Median Packet Length:', pkt_lengths.median())

Average Packet Length: 622.7923904052936
Minimum Packet Length: 34
Maximum Packet Length: 1326
Median Packet Length: 532.0


Count the number of times a distinct value appears. For example, we can see how many packets were sent to each destination port below 2048.

In [32]:
# Keep the destination ports below 2048, then count packets per port.
ex_ppd.loc[ex_ppd['port_dst'] < 2048, 'port_dst'].value_counts()

443.0    58
80.0      2
Name: port_dst, dtype: int64

In [34]:
# First two packets whose destination port is 80.
ex_ppd.loc[ex_ppd['port_dst'] == 80].head(2)

Unnamed: 0,datetime,dns_query,dns_resp,ip_dst,ip_src,is_dns,length,mac_dst,mac_src,port_dst,port_src,protocol,time
99,2019-10-05 20:39:35.830945,,,17.253.25.201,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,80.0,63232.0,TCP,1570326000.0
101,2019-10-05 20:39:35.840091,,,17.253.25.201,192.168.1.13,False,66,08:02:8e:92:27:27,3c:15:c2:d9:d3:50,80.0,63232.0,TCP,1570326000.0


Find statistics grouped by particular categories.

Average length by protocol type.

In [35]:
# Mean packet length per protocol. Rows whose protocol is None form no group
# (pandas groupby drops missing keys by default), so they are not shown.
ex_ppd.groupby('protocol')['length'].mean()

protocol
TCP    111.333333
UDP    679.957721
Name: length, dtype: float64

In [36]:
# Largest packet length seen for each protocol.
ex_ppd.groupby('protocol')['length'].max()

protocol
TCP     817
UDP    1326
Name: length, dtype: int64

Average packet length for each destination port below 2048.

In [37]:
# Filter to destination ports below 2048 first, then average length per port.
low_port_packets = ex_ppd.loc[ex_ppd['port_dst'] < 2048]
low_port_packets.groupby('port_dst')['length'].mean()

port_dst
80.0      66.000000
443.0    111.431034
Name: length, dtype: float64

## Grouping Packets by Time

In [60]:
# One timestamp per second for 100 seconds, used as the index of a frame that
# has no columns yet. The lowercase 's' alias is used because current pandas
# deprecates the uppercase 'S' spelling of the "second" frequency; older
# versions accept the lowercase form as well.
time_index = pd.date_range('09/28/2019 19:51:21', periods=100, freq='1s')
time_df = pd.DataFrame(index=time_index)

In [61]:
# First ten timestamps of the one-second index.
time_df.head(10)

2019-09-28 19:51:21
2019-09-28 19:51:22
2019-09-28 19:51:23
2019-09-28 19:51:24
2019-09-28 19:51:25
2019-09-28 19:51:26
2019-09-28 19:51:27
2019-09-28 19:51:28
2019-09-28 19:51:29
2019-09-28 19:51:30


To be continued...