In [None]:
from IPython.display import display, HTML
# Inject CSS that stretches the notebook's `.container` element to the full page width.
display(
    HTML("<style>.container { width:100% !important; }</style>")
)

# Assignment 3
In this assignment you will train a simple classifier that detects which video service is being used, and then examine how the model makes its classification decisions.  

## Due Date: December 8th
Please submit a PDF version of this notebook with all the cells evaluated and answers displayed.

In [None]:
from trustee import ClassificationTrustee
import matplotlib.pyplot as plt
from scapy.all import *
import pandas as pd
import numpy as np
import binascii
import ipaddress
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree

# Remove the row limit so displayed DataFrames are never truncated
pd.options.display.max_rows = None

### Task 0
In discussion section we used CIC Flowmeter to take care of preprocessing our packet traces. In this assignment we will use CSVs created using the following tshark script.

```
tshark -r <pcap_name> -T fields -E separator=/t -e frame.time_epoch -e ip.src -e tcp.srcport -e udp.srcport -e ip.dst -e tcp.dstport -e udp.dstport -e ip.len -e ip.hdr_len -e ip.proto -e tcp.flags -e tcp.seq_raw -e tcp.ack_raw -e tcp.hdr_len -e udp.length -e tcp.analysis.retransmission >> <output_filepath>.csv
```

Therefore the columns in each CSV are as follows:
- Frame Timestamp (`frame.time_epoch`)
- Source IP Address (`ip.src`)
- TCP Source Port (`tcp.srcport`)
- UDP Source Port (`udp.srcport`)
- Destination IP Address (`ip.dst`)
- TCP Destination Port (`tcp.dstport`)
- UDP Destination Port (`udp.dstport`)
- IP Packet Length (`ip.len`)
- IP Header Length (`ip.hdr_len`)
- Transport Protocol (`ip.proto`)
- TCP Flags (`tcp.flags`)
- TCP Sequence Number (`tcp.seq_raw`)
- TCP Acknowledgement Number (`tcp.ack_raw`)
- TCP Header Length (`tcp.hdr_len`)
- UDP Length (`udp.length`)
- TCP Retransmission (`tcp.analysis.retransmission`)  

You can find the CSV files for Twitch and Vimeo respectively on the lab server at the paths:  
```/mnt/md0/cs190n/assignment3/twitch_csvs/```  
```/mnt/md0/cs190n/assignment3/vimeo_csvs/```  

In [None]:
# Paths of the four Vimeo capture CSVs (vimeo_capture1.csv ... vimeo_capture4.csv)
vimeo_csvs = [
    f'/mnt/md0/cs190n/assignment3/vimeo_csvs/vimeo_capture{capture_num}.csv'
    for capture_num in range(1, 5)
]

# Paths of the four Twitch capture CSVs (twitch_capture1.csv ... twitch_capture4.csv)
twitch_csvs = [
    f'/mnt/md0/cs190n/assignment3/twitch_csvs/twitch_capture{capture_num}.csv'
    for capture_num in range(1, 5)
]

In [None]:
def ip_to_int(ip):
    """Convert an IP address to its integer representation.

    The value is parsed with ``ipaddress.ip_address``; anything that call
    rejects with ``ValueError`` yields ``None`` instead of an exception.
    """
    try:
        parsed_address = ipaddress.ip_address(ip)
    except ValueError:
        # Not a parseable IPv4/IPv6 address.
        return None
    return int(parsed_address)

#### Pre-process Vimeo Packets

In [None]:
# Build one DataFrame per Vimeo capture and concatenate them once at the end.
# Growing `vimeo_packets` with pd.concat inside the loop re-copies every earlier
# row on each iteration, and concatenating onto an empty DataFrame triggers a
# FutureWarning on recent pandas versions.
vimeo_frames = []
for csv_path in vimeo_csvs:

    # Read in CSV: tab-separated, no header row, columns addressed by position.
    # Nullable 'Int64' lets integer columns hold <NA> where a field is absent.
    curr_df = pd.read_csv(csv_path,
                          sep='\t',
                          header=None,
                          dtype = {0: 'float',
                                   1: 'string',
                                   2: 'Int64',
                                   3: 'Int64',
                                   4: 'string',
                                   5: 'Int64',
                                   6: 'Int64',
                                   7: 'string',
                                   8: 'string',
                                   9: 'string',
                                   10: 'string',
                                   11: 'Int64',
                                   12: 'Int64',
                                   13: 'Int64',
                                   14: 'Int64',
                                   15: 'Int64'
                                   })

    # Give the positional columns descriptive names
    curr_df = curr_df.rename(columns={0: 'frame_timestamp',
                                      1: 'ip_src',
                                      2: 'tcp_srcport',
                                      3: 'udp_srcport',
                                      4: 'ip_dst',
                                      5: 'tcp_dstport',
                                      6: 'udp_dstport',
                                      7: 'ip_len',
                                      8: 'ip_header_len',
                                      9: 'protocol',
                                      10: 'tcp_flags',
                                      11: 'tcp_seq_num',
                                      12: 'tcp_ack_num',
                                      13: 'tcp_header_len',
                                      14: 'udp_len',
                                      15: 'tcp_retransmission',
                                     })

    # Get basename of file (file name without directory or extension) and
    # record it on every row so each packet can be traced back to its capture
    basename = csv_path.split('/')[-1].split('.')[0]
    curr_df['file'] = basename

    # Convert IPs to integers (ip_to_int returns None for unparseable values)
    curr_df['ip_src_int'] = curr_df['ip_src'].apply(ip_to_int)
    curr_df['ip_dst_int'] = curr_df['ip_dst'].apply(ip_to_int)

    # Replace the string IP columns with their integer versions; the integer
    # columns end up as the last two columns of the frame
    curr_df = curr_df.drop(columns=['ip_src', 'ip_dst']).rename(columns={'ip_src_int': 'ip_src',
                                                                         'ip_dst_int': 'ip_dst'})

    vimeo_frames.append(curr_df)

# Single concatenation; fall back to an empty frame if no CSVs were listed
vimeo_packets = pd.concat(vimeo_frames, ignore_index=True) if vimeo_frames else pd.DataFrame()

#### Pre-process Twitch Packets

In [None]:
# Build one DataFrame per Twitch capture and concatenate them once at the end.
# Growing `twitch_packets` with pd.concat inside the loop re-copies every earlier
# row on each iteration, and concatenating onto an empty DataFrame triggers a
# FutureWarning on recent pandas versions.
twitch_frames = []
for csv_path in twitch_csvs:

    # Read in CSV: tab-separated, no header row, columns addressed by position.
    # Nullable 'Int64' lets integer columns hold <NA> where a field is absent.
    curr_df = pd.read_csv(csv_path,
                          sep='\t',
                          header=None,
                          dtype = {0: 'float',
                                   1: 'string',
                                   2: 'Int64',
                                   3: 'Int64',
                                   4: 'string',
                                   5: 'Int64',
                                   6: 'Int64',
                                   7: 'string',
                                   8: 'string',
                                   9: 'string',
                                   10: 'string',
                                   11: 'Int64',
                                   12: 'Int64',
                                   13: 'Int64',
                                   14: 'Int64',
                                   15: 'Int64'
                                   })

    # Give the positional columns descriptive names
    curr_df = curr_df.rename(columns={0: 'frame_timestamp',
                                      1: 'ip_src',
                                      2: 'tcp_srcport',
                                      3: 'udp_srcport',
                                      4: 'ip_dst',
                                      5: 'tcp_dstport',
                                      6: 'udp_dstport',
                                      7: 'ip_len',
                                      8: 'ip_header_len',
                                      9: 'protocol',
                                      10: 'tcp_flags',
                                      11: 'tcp_seq_num',
                                      12: 'tcp_ack_num',
                                      13: 'tcp_header_len',
                                      14: 'udp_len',
                                      15: 'tcp_retransmission',
                                     })

    # Get basename of file (file name without directory or extension) and
    # record it on every row so each packet can be traced back to its capture
    basename = csv_path.split('/')[-1].split('.')[0]
    curr_df['file'] = basename

    # Convert IPs to integers (ip_to_int returns None for unparseable values)
    curr_df['ip_src_int'] = curr_df['ip_src'].apply(ip_to_int)
    curr_df['ip_dst_int'] = curr_df['ip_dst'].apply(ip_to_int)

    # Replace the string IP columns with their integer versions; the integer
    # columns end up as the last two columns of the frame
    curr_df = curr_df.drop(columns=['ip_src', 'ip_dst']).rename(columns={'ip_src_int': 'ip_src',
                                                                         'ip_dst_int': 'ip_dst'})

    twitch_frames.append(curr_df)

# Single concatenation; fall back to an empty frame if no CSVs were listed
twitch_packets = pd.concat(twitch_frames, ignore_index=True) if twitch_frames else pd.DataFrame()

Before moving on to the tasks for this assignment, make sure you have preprocessed the Twitch and Vimeo captures. It's okay if there are some `<NA>` values in your dataframes. The `dtype` parameter specifies the datatype of the values in each column. Note that the `ip_len`, `ip_header_len` and `protocol` fields are read as strings, but we can cast them to integers using `astype('Int64')`, and that the source and destination IPs are represented as integers.

In [None]:
# Preview the first 10 preprocessed Vimeo packets
vimeo_packets.head(n=10)

In [None]:
# Preview the first 10 preprocessed Twitch packets
twitch_packets.head(n=10)

### Task 1: Preprocess the Data
In this task we will preprocess our data so that each flow can be labelled as either 'twitch' or 'vimeo' for training our classifier. We will need to discard background traffic in our captures that does not represent Twitch or Vimeo traffic. We will use a simple methodology to identify relevant flows for Twitch and Vimeo: we will only consider TCP flows with at least 30 inbound OR outbound packets with a TCP length greater than 0 within the flow.

#### Task 1a
The client IP in these packet captures is 172.17.0.2, discard the background traffic and retain only the packets with a source or destination IP equal to 172.17.0.2.  

In [None]:
# Stack the Vimeo and Twitch packets into one frame with a fresh 0..n-1 index.
all_packets = pd.concat([vimeo_packets, twitch_packets], ignore_index=True)

# TODO: Filter for packets that have a source IP or destination equal to 172.17.0.2
# NOTE: the preprocessing cells replaced the `ip_src` / `ip_dst` strings with the
# integers produced by ip_to_int, so the comparison has to be made against the
# integer form of the client address (e.g. ip_to_int('172.17.0.2')), not the string.
client_condition = # <fill me in>

# Keep only the rows selected by client_condition.
all_packets = all_packets[client_condition]

#### Task 1b
Now we will filter these dataframes to only include TCP traffic (protocol 6), and calculate the TCP payload length. This new column will be called `tcp_len` which represents the length of the TCP payload. This is important because it helps us differentiate packets carrying data from the acknowledgement packets. The TCP length can be calculated as the difference between the IP Packet Length and the sum of the IP Header Length and TCP Header Length.    
```TCP_Payload_Length = IP_Packet_Length - (IP_Header_Length + TCP_Header_Length)```    
Be sure to note the datatypes of each column specified above to take care of any necessary string to Int64 conversions. Remember you can cast a string to an integer using `df['column_name'].astype('Int64')`

In [None]:
# TODO: Filter for packets that have a protocol of TCP (6)
# NOTE: `protocol` was read with dtype 'string' in the preprocessing cells, so it
# needs a cast (e.g. .astype('Int64')) before it can be compared with the number 6.
protocol_condition = # <fill me in>

# Keep only the rows selected by protocol_condition.
all_packets = all_packets[protocol_condition]

# TODO: Calculate TCP payload length for each packet
# NOTE: `ip_len` and `ip_header_len` were read as 'string' while `tcp_header_len`
# was read as 'Int64'; cast the string columns before doing arithmetic on them.
all_packets['tcp_len'] = # <fill me in>

#### Task 1c
We will only consider TCP flows with at least 30 inbound OR outbound packets with a TCP length greater than 0 within the flow and discard packets from the remaining flows.    
Remember a flow is a unique 5-tuple identifier consisting of Source IP, Source Port, Destination IP, Destination Port, and Protocol.   
Be sure to note the datatypes of each column specified above to take care of any necessary string to int conversions.

In [None]:
# Count, for every (protocol, src IP, src port, dst IP, dst port) combination, the
# packets whose TCP payload length is greater than 0. Because source and destination
# are part of the key, the two directions of a connection are counted separately.
# dropna=False keeps groups whose key columns contain missing values.
all_flows = (all_packets[all_packets['tcp_len'] > 0]
             .groupby(['protocol', 'ip_src', 'tcp_srcport', 'ip_dst', 'tcp_dstport'], dropna=False)
             .agg({'frame_timestamp': 'count'})
             .rename(columns = {'frame_timestamp':'pkt_count'}))
# Move the group keys out of the index and back into ordinary columns.
all_flows = all_flows.reset_index()

# TODO: filter for flows with at least 30 packets per flow
pkt_count_condition = # <fill me in>
all_flows = all_flows[pkt_count_condition]

# Reduce the surviving flows to their key columns and turn each row into a plain tuple.
all_flows = all_flows[['protocol', 'ip_src', 'tcp_srcport', 'ip_dst', 'tcp_dstport']]
all_flows_list = list(all_flows.itertuples(index=False, name=None))

# Attach the same 5-tuple (same field order as all_flows_list) to every packet so
# packets can be matched against the list of retained flows.
all_packets['5tuple'] = list(zip(all_packets['protocol'], all_packets['ip_src'], all_packets['tcp_srcport'], all_packets['ip_dst'], all_packets['tcp_dstport']))

# TODO: filter for flows that are in all_flows_list using the 5tuple key
flow_condition = # <fill me in>
all_packets = all_packets[flow_condition]

### Task 2: Feature Selection
In this task we will be computing the features that we want to use as input to our model. We will choose the following features for our model, (grouped by 5-tuple):
- Total packets
- Total bytes in TCP payload
- Inter-packet delay (avg)
- Inter-packet delay (max)
- Inter-packet delay (stddev)
- Direction (0 for outbound, 1 for inbound)

Compute these features for each flow. You'll want to use the groupby function to group each flow and the agg function to compute these metrics aggregated over each group. We've provided you with the functions to use to calculate inter-packet delay, and you should be able to use `count` and `sum` to compute the total packets and bytes per flow.

In [None]:
def inter_packet_delay_avg(series):
    """Return the mean difference between consecutive values of `series`."""
    consecutive_gaps = series.diff()
    return consecutive_gaps.mean()

def inter_packet_delay_stddev(series):
    """Return the standard deviation of the differences between consecutive values of `series`."""
    consecutive_gaps = series.diff()
    return consecutive_gaps.std()

def inter_packet_delay_max(series):
    """Return the largest difference between consecutive values of `series`."""
    consecutive_gaps = series.diff()
    return consecutive_gaps.max()

# TODO: Group the packets by 5-tuple and compute the features using the agg function
# NOTE: the inter_packet_delay_* helpers call .diff() on the Series they receive, so
# they are presumably meant to be applied to the `frame_timestamp` column of each group.
# NOTE: Task 3 labels flows from the `file` column, so that column needs to survive
# this aggregation.
all_flow_stats = # <fill me in>

# TODO: Label the direction of packets, 0 for outbound and 1 for inbound
all_flow_stats['direction'] = #TODO: <fill me in>

### Task 3: Labelling the Data
Now we will label our flows as twitch or vimeo. We included the `file` column above which has a label for the specific file the packet is from. You can use this field to label each flow as twitch or vimeo.

In [None]:
# TODO: Label the flows as twitch or vimeo
all_flows_stats['Label'] = # <fill me in>

### Task 4: Training a Model
Now that we have selected our features and labelled our data, we can train a simple classifier. Choose a classifier from [scikit-learn](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning). 

In [None]:
# drop any rows with NaN
all_flow_stats = all_flow_stats.dropna()

# separate the data frame to features and answers
target_variable = 'Label'
if 'file' in set(all_flow_stats.columns):
    train_features = list(set(all_flow_stats.columns) - {target_variable} - {'file'})
else:
    train_features = list(set(all_flow_stats.columns) - {target_variable})
x_train = all_flow_stats[train_features]
y_train = all_flow_stats[target_variable]

# TODO: define the classifier you want to test out
clf = # <fill me in>
clf.fit(x_train, y_train)

# Create a set of predictions based on our model and view the precision / recall
y_pred = clf.predict(x_train.values)
print(metrics.classification_report(y_train, y_pred))

### Task 5: Understanding the Model
You will likely observe a very high if not perfect precision/recall because we are using a small amount of data and we are training / testing using the same dataset. This means that we were able to fit a model that can classify all of our training data correctly. Now we will generate a report from Trustee as well as visualize our model to understand which features are being used to classify our data.

In [None]:

# Create and Train a Trustee Tree, using the fitted classifier as the expert model
trustee = ClassificationTrustee(expert=clf)
trustee.fit(x_train, y_train, num_samples=len(x_train) // 2, num_iter=20, train_size=0.99)

# Display Trustee Results
# explain() is unpacked into four values; only the second (the decision tree `dt`)
# and the fourth (its score) are used below.
_, dt, _, score = trustee.explain()
print(f"Training score of pruned DT: {score}")
dt_y_pred = dt.predict(x_train)
# Fidelity: how closely the extracted tree reproduces the classifier's own predictions
print("Model explanation global fidelity report:")
print(metrics.classification_report(clf.predict(x_train), dt_y_pred))
# Score: how well the extracted tree predicts the true labels
print("Model explanation score report:")
print(metrics.classification_report(y_train, dt_y_pred))

# plot a tree
fig = plt.figure(figsize=(25,20))
# Pass the feature names as a plain list: some scikit-learn releases reject a pandas
# Index for `feature_names` during parameter validation.
plot_tree(dt, feature_names=list(x_train.columns), class_names=sorted(all_flow_stats['Label'].unique()), filled=True, max_depth=7)
# Render the figure explicitly so the cell does not print the list of annotation objects.
plt.show()

Given the report from Trustee, which feature(s) do you think this classifier relies on most to predict whether a flow is Vimeo or Twitch? Is this an instance of shortcut learning? Try re-training the model without including the 5-tuple for the flow as part of the list of features and observe how the decision tree changes. Why might it be a bad idea to include IPs as features in this model?