#**Homework**

Complete the following tasks:

* Use a dataset of 21 Video Sessions
* Recognize the Video Server(s) IP and select video traffic (***if more than one Server is found, keep the dominant flow only***)
* Detect Video Client HTTP Requests (Uplink packets with size larger or equal to 100 Bytes)
* Compute features to predict:
 1.   When the next UL Request is sent by the Video Client 
 2.   How large is the response of the Server to the next UL Request

**N.B.**: Below, you can find a list of useful functions for the tasks at hand (introduced during class).

In [95]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

### Functions Ready-To-Use

In [96]:
def filter_traffic(data, domain):
   
    # Look in DNS Responses for googlevideo domain
    dns_data = data[data['Protocol']=='DNS']
    dns = dns_data[dns_data['Info'].apply(lambda x: 'googlevideo' in x and 'response' in x)]
    ips = dns.Address.values 
    server_names = dns.Name.values
    
    # Filtering on either "Source" or "Destination" IP, get the 
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].apply(lambda x: x in ips)].dropna(subset=['Length']) 

    uplink = data[data['Destination'].apply(lambda x: x in ips)].dropna(subset=['Length'])
    
    return ips, server_names, uplink, downlink

def find_dominant(uplink, downlink):

  # Expressed in MB

  # Order flows by cumulative DL Volume
  flows_DL = downlink.groupby(['Source','Destination'])['Length'].sum()/(10**6)
  print(flows_DL)
  
  # Get (Source,Destination) IPs of dominant flows
  dom_id = flows_DL[flows_DL==max(flows_DL)].index[0]

  # Filter traffic selecting the dominant flow
  dom_dl = downlink[downlink['Source']==dom_id[0]]
  dom_ul = uplink[(uplink['Source']==dom_id[1])]

  return dom_ul, dom_dl

def timebased_filter(data, length=None, min_time=None, max_time=None):
  '''
  :param data: pd dataframe to be filtered. Must contain columns: "Length" and "Time"
  :param length: all packets shorter than length [Bytes] will be discarded (default 0)
  :param min_time: all packets with timestamp smaller than min_time [s] will be discarded (default 0)
  :param max_time: all packets with timestamp larger than max_time [s] will be discarded (default 1000)
  '''

  if length is None:
    length=0
  if min_time is None:
    min_time = 0
  if max_time is None:
    max_time = 1000
  
  filtered_data = data.copy().reset_index()
  mask = (filtered_data['Length']>=length) & (filtered_data['Time']>=min_time) & (filtered_data['Time']<= max_time)
  filtered_data = filtered_data.loc[mask[mask ==True].index]

  return filtered_data

def find_next(array, value):
  '''
  :param array: np.array, array of floats
  :param value: float, reference value
  :return: position of the closest element of the array greater than "value"
  '''
  delta = np.asarray(array) - value
  idx = np.where(delta >= 0, delta, np.inf).argmin()

  return idx

def normalize_dataset(training_set, test_set):

  mean_train = training_set.mean()
  std_train = training_set.std()
  norm_train = (training_set - mean_train)/std_train
  norm_test = (test_set - mean_train)/std_train  

  return norm_train, norm_test

### Functions to be completed


In [97]:
def features_extraction(uplink, downlink):
  '''
  Complete this function to extract both features and groundtruth.

  NB: The features extraction process is the same as the one introduced during
  the lecture. 
  '''
  dataset = pd.DataFrame(columns=['Request_Size','Inter_RR_Time','DL_Time','DL_Vol','DL_Size','PB_Time'])
  # ****************************************************************************
  # Feature 1: Client Request Size
  dataset['Request_Size'] = list(uplink.Length.values)

  # ****************************************************************************
  # Feature 2: Inter Request-Response Time
  rr_time = []
  response_time = []
  for t in uplink.Time:
    response_time.append(find_next(downlink.Time, t)) #index of next DL packet timestamp 
    rr_time.append(downlink.Time.iloc[response_time[-1]] - t)

  dataset['Inter_RR_Time'] = rr_time

  # ****************************************************************************
  # Feature 3-4-5: Download Time, Download Volume, Download Size (# Packets) 
  dt = []
  dv = []
  ds = []

  for rt1, rt2 in zip(response_time[:-1], response_time[1:]):

    #Download Time
    dt.append(downlink.Time.iloc[rt2-1] - downlink.Time.iloc[rt1])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt1], downlink.Time.iloc[rt2-1])
    
    #Download Volume
    dv.append(temp.Length.sum())

    #Download Size (# Packets) 
    ds.append(temp.shape[0])

  # Last Iteration data might be corrupted due to drastic interruption of capture 
  # process. If it is so, an error would occur during the features extraction.
  # To avoid this, we skip last HTTP iteration data when an error is raised 
  # using the try-except logic below.
  try:
    # Consider also last HTTP iteration
    #Download Time
    dt.append(downlink.Time.iloc[-1] - downlink.Time.iloc[rt2])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt2], downlink.Time.iloc[-1])
    #Download Volume
    dv.append(temp.Length.sum())

    #Download Size (# Packets) 
    ds.append(temp.shape[0])
  except:
    print()

  dataset['DL_Time'] = dt
  dataset['DL_Vol'] = dv
  dataset['DL_Size'] = ds

  # ****************************************************************************
  # Feature 5: Playback Time
  pbt = list(uplink.Time.values)
  dataset['PB_Time'] = pbt
  # ****************************************************************************
  
  
  # Check Features Consistency
  dataset = dataset[(dataset > 0).all(1)]
  dataset = dataset[dataset['DL_Time']<20]

  ###############################################################
  # TO BE COMPLETED

  ### EXTRACT GROUNDTRUTH HERE
  groundtruth = pd.DataFrame(columns=['Next_Request_Time','Next_Response_Vol'])
  # ****************************************************************************
  # GT 1: Next Request Time

  nr_time =  dataset['PB_Time'].shift(periods=-1) - (dataset['Inter_RR_Time'] + dataset['DL_Time'] +  dataset['PB_Time'])
  groundtruth['Next_Request_Time'] = nr_time

  # ****************************************************************************
  # GT 2: Next Response Volume
  
  groundtruth['Next_Response_Vol'] = dataset['DL_Vol'].shift(periods=-1)

  ###############################################################

  # Check Ground Truth Consistency
  groundtruth.dropna(inplace=True)

  # Align Dataset and Groundtruth
  intersection = set(dataset.index).intersection(set(groundtruth.index))
  dataset = dataset.loc[intersection,:]
  groundtruth = groundtruth.loc[intersection,:]

  return dataset, groundtruth

### Write your code here



In [98]:
from os import listdir
from os.path import isfile, join

def create_dataset():
  path = '/content/Captures'
  dumpfiles = [f for f in listdir(path) if (isfile(join(path, f)))]

  dataset = pd.DataFrame()
  groundtruth = pd.DataFrame()
  for f in dumpfiles:
    print('Processing file: ', f)
    data = pd.read_csv(join(path, f), sep=',', encoding='latin-1')

    ips, server_names, uplink, downlink = filter_traffic(data, domain='googlevideo') 

    # Select dominant flow
    uplink_dominant, downlink_dominant = find_dominant(uplink, downlink)

    # Filter uplink packets larger or equal to 100 bytes
    uplink_dominant = timebased_filter(uplink_dominant, 100)

    temp_dataset, temp_groundtruth = features_extraction(uplink_dominant, downlink_dominant)
    dataset = dataset.append(temp_dataset, ignore_index=True)
    groundtruth = groundtruth.append(temp_groundtruth, ignore_index=True)

  return dataset, groundtruth

dataset, groundtruth = create_dataset()
print(dataset.shape)
display(dataset.head())
groundtruth.head()

Processing file:  Capture_v2_20.csv
Source           Destination
173.194.182.230  192.168.1.6    1.706543
173.194.188.230  192.168.1.6    0.853296
74.125.153.7     192.168.1.6    1.060220
74.125.99.108    192.168.1.6    1.938626
Name: Length, dtype: float64
Processing file:  Capture_v2_7.csv
Source          Destination
74.125.111.106  192.168.1.6     2.080363
91.81.217.141   192.168.1.6    32.572815
Name: Length, dtype: float64
Processing file:  Capture_v2_10.csv
Source           Destination
173.194.160.219  192.168.1.6    0.008240
173.194.188.72   192.168.1.6    1.974872
74.125.153.59    192.168.1.6    0.012162
74.125.99.108    192.168.1.6    0.015762
Name: Length, dtype: float64
Processing file:  Capture_v2_19.csv
Source           Destination
173.194.182.135  192.168.1.6    0.017631
74.125.99.168    192.168.1.6    0.199115
Name: Length, dtype: float64
Processing file:  Capture_v2_9.csv
Source           Destination
173.194.188.105  192.168.1.6    1.686488
Name: Length, dtype: float64


Unnamed: 0,Request_Size,Inter_RR_Time,DL_Time,DL_Vol,DL_Size,PB_Time
0,526,1.386408,9e-06,148,2,11.458677
1,583,0.005415,0.003963,11050,10,12.845593
2,592,0.00657,0.003175,2415,4,12.856409
3,729,0.023158,0.014913,70516,49,12.915444
4,600,0.0041,0.007433,70162,49,13.006727


Unnamed: 0,Next_Request_Time,Next_Response_Vol
0,0.000499,11050.0
1,0.001438,2415.0
2,0.04929,70516.0
3,0.053212,70162.0
4,0.049132,93454.0


## **Regression phase**

In [99]:
# Next_Request_Time
x = groundtruth.drop('Next_Request_Time', axis = 1).dropna()
y = groundtruth['Next_Request_Time']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('--- Stats Next_Request_Time ---')  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 'seconds')

--- Stats Next_Request_Time ---
Root Mean Squared Error: 6.393713809748829 seconds


In [100]:
# Next_Response_Vol
x = groundtruth.drop('Next_Response_Vol', axis = 1).dropna()
y = groundtruth['Next_Response_Vol']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 0)
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('--- Stats Next_Request_Time ---')  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 'bytes')

--- Stats Next_Request_Time ---
Root Mean Squared Error: 454190.93974613835 bytes
