In [5]:
def classifiers(data, seconds, save_dir):
    """Train SVM, Random Forest and KNN on the training CSV and score them on the test CSV.

    The training and test datasets are read from fixed paths (see the two
    ``pd.read_csv`` calls). Every column but the last is a feature and the
    last column is the class label; the training file must provide exactly
    the seven features named in ``columns``. Four plots are written through
    ``save_photo`` into the ``str(seconds) + "s"`` sub-folder: the feature
    correlation matrix, a 2-component PCA scatter, the Random Forest feature
    importances and a bar chart comparing the three test accuracies.

    The scaler and the label encoding are learned on the training set only
    and then re-used for the test set, so test features and label codes are
    expressed exactly as the models saw them during training.

    Args:
        data: dict updated in place; ``data[seconds]`` becomes
            ``{"SVM": acc, "RF": acc, "KNN": acc}`` with the test accuracies.
        seconds: window size being evaluated; used in the log line, as the
            plot sub-folder name and as the key in ``data``.
        save_dir: base directory forwarded to ``save_photo``.

    Raises:
        KeyError: if the test set contains a label absent from the training set.
    """
    warnings.filterwarnings("ignore")
    from sklearn import svm
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    import seaborn as sns
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier

    window_tag = str(seconds) + "s"
    colors = ['r', 'b']
    classes = [0, 1]  # 0 = Not RTP and 1 = RTP
    columns = ["interarrival_std", "interarrival_mean", "len_udp_std", "len_udp_mean",
               "udp_count", "len_udp_kbps", "interlength_udp_mean"]

    dataset = pd.read_csv("/home/det_tesi/sgarofalo/Window size analysis/train_dataset/train_dataset.csv", header=[0, 1])
    # np.unique returns the sorted label strings plus, for each row, the index
    # of its label in that sorted array; a 1-D label vector keeps y 1-D.
    classes_str, y = np.unique(dataset.iloc[:, -1].values, return_inverse=True)
    scaler = StandardScaler()
    X = scaler.fit_transform(dataset.iloc[:, :-1])
    print("Classifing with " + str(seconds) +" window size..")
    print("Training set classes distribution: %.2f%% RTP" % (100*len(y[y == 1])/len(y)))

    # #############################################################################
    # Correlation Matrix
    # #############################################################################
    corr = pd.DataFrame(data=X, columns=columns).corr()
    # Hide the upper triangle (diagonal included): the matrix is symmetric.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    plt.figure(figsize=(10, 10))
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                annot=True, cmap="YlGnBu", mask=mask)
    t = "Correlation matrix"
    save_photo(save_dir, t, window_tag)

    # #############################################################################
    # Plot Data Using PCA
    # #############################################################################
    PC = PCA(2).fit_transform(X)
    principalDf = pd.DataFrame(data=np.column_stack((PC[:, 0:2], y)),
                               columns=['principal component 1', 'principal component 2', 'label'])
    plt.figure(figsize=(13, 13))
    for target, color in zip(classes, colors):
        indicesToKeep = principalDf['label'] == target
        plt.scatter(principalDf.loc[indicesToKeep, 'principal component 1'],
                    principalDf.loc[indicesToKeep, 'principal component 2'],
                    c=color, s=40)
    plt.xlabel('Principal Component 1', fontsize=15)
    plt.ylabel('Principal Component 2', fontsize=15, labelpad=-10)
    plt.legend(classes_str)
    plt.grid()
    t = "Data Plotting using PCA"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, window_tag)

    # #############################################################################
    # SVM
    # #############################################################################
    # Only the 70% training part of each split is used: the models are scored
    # on the separate test CSV further down.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=1)
    params = {'C': [0.01, 0.1, 1, 10, 100, 1000],
              'kernel': ['rbf'],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    # No ``iid`` argument: it was removed from GridSearchCV and the other two
    # searches below never passed it.
    SVM = GridSearchCV(svm.SVC(), params, cv=5, n_jobs=-1)
    SVM.fit(X_train, y_train)

    # #############################################################################
    # Random Forest
    # #############################################################################
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=5)
    param_grid = {
        'max_features': [3, 5, 7],
        'n_estimators': [100, 200, 500, 1000, 2000],
    }
    RF = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1)
    RF.fit(X_train, y_train)

    feature_imp = pd.Series(RF.best_estimator_.feature_importances_, index=columns)
    plt.figure(figsize=(13, 13))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    # Add labels to your graph
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.legend()
    t = "Important Features"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, window_tag)

    # #############################################################################
    # KNN
    # #############################################################################
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=5)
    params = {"metric": ["manhattan", "euclidean", "chebyshev"],
              'weights': ['uniform', 'distance'],
              'n_neighbors': range(1, 40)}
    KNN = GridSearchCV(KNeighborsClassifier(), params, cv=5, n_jobs=-1)
    KNN.fit(X_train, y_train)

    # #############################################################################
    # Test Set
    # #############################################################################
    dataset = pd.read_csv("/home/det_tesi/sgarofalo/Window size analysis/test_dataset/test_dataset.csv", header=[0, 1])
    # Re-use the training scaler and the training label codes; fitting new ones
    # on the test set would shift the features and could swap the class codes
    # (e.g. when the test set holds a single class).
    X = scaler.transform(dataset.iloc[:, :-1])
    label_codes = {label: code for code, label in enumerate(classes_str)}
    y = np.array([label_codes[label] for label in dataset.iloc[:, -1].values])
    print()
    print("test_dataset shape: " + str(dataset.shape))
    SVMAccuracy = SVM.best_estimator_.score(X, y)
    print("SVM accuracy => %.2f%%" % (100*SVMAccuracy))
    RFAccuracy = RF.best_estimator_.score(X, y)
    print("RF accuracy => %.2f%%" % (100*RFAccuracy))
    KNNAccuracy = KNN.best_estimator_.score(X, y)
    print("KNN accuracy => %.2f%%" % (100*KNNAccuracy))

    hist_data = {"SVM": [SVMAccuracy],
                 "Random Forest": [RFAccuracy],
                 "KNN": [KNNAccuracy]}
    sns.barplot(data=pd.DataFrame(data=hist_data))
    t = "Classifiers Comparison"
    plt.title(t, fontsize=10)
    save_photo(save_dir, t, window_tag)

    data[seconds] = {"SVM": SVMAccuracy, "RF": RFAccuracy, "KNN": KNNAccuracy}

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import os
import matplotlib.pyplot as plt
from json import JSONDecoder, JSONDecodeError
import sys
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")


def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    """Lazily decode a string holding several JSON documents back to back.

    Starting at index ``pos``, the whitespace between documents is skipped and
    each JSON value is decoded with ``decoder.raw_decode`` and yielded in turn.
    Iteration ends once nothing but whitespace is left.

    Args:
        document: text containing zero or more concatenated JSON values.
        pos: index in ``document`` at which decoding starts.
        decoder: ``json.JSONDecoder`` used to decode each value.

    Yields:
        The decoded object for each JSON value, in document order.

    Raises:
        JSONDecodeError: propagated unchanged from ``raw_decode`` when the
            next non-whitespace text is not valid JSON.
    """
    non_blank = re.compile(r'[^\s]')
    found = non_blank.search(document, pos)
    while found:
        # raw_decode returns the value and the index just past it, which is
        # where the search for the next document resumes.
        obj, pos = decoder.raw_decode(document, found.start())
        yield obj
        found = non_blank.search(document, pos)
        

def save_photo(save_dir, t, window_size=None):
    """Save the current matplotlib figure as a PNG under ``save_dir/Plots`` and close it.

    The file is written to ``<save_dir>/Plots/<t>.png`` or, when
    ``window_size`` is given, to ``<save_dir>/Plots/<window_size>/<t>.png``.
    Missing directories are created.

    Args:
        save_dir: base output directory.
        t: plot title, used as the file name (without extension).
        window_size: optional sub-folder name (a string such as ``"3s"``).
    """
    # os.path.join picks the separator for the running platform, so no
    # per-OS branch is needed and the ``None`` default no longer ends up in a
    # string concatenation.
    target_dir = os.path.join(save_dir, 'Plots')
    if window_size is not None:
        target_dir = os.path.join(target_dir, window_size)
    os.makedirs(target_dir, exist_ok=True)
    plt.savefig(os.path.join(target_dir, t + '.png'), dpi=100)
    plt.close()
    
#Open Tshark and run command to turn pcap to json
def _run_tshark(command):
    """Run one tshark command line to completion and return its stdout as text."""
    import subprocess

    # stderr is inherited from this process (stderr=None), as before. Failures
    # to start or read from tshark propagate to the caller instead of being
    # printed and followed by a lookup of an unassigned result.
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=None, encoding='utf-8')
    return completed.stdout


def pcap_to_json(source_pcap, protocol_policy):
    """Dump a capture file with tshark (``-T ek``), decoding the STUN-negotiated ports as RTP.

    Two tshark passes are made over ``source_pcap``:

    1. only STUN packets are extracted; the UDP source and destination ports
       of every packet whose ``stun_stun_type`` contains ``0x00000101``
       (marked as "success" by the original author) are collected;
    2. the whole capture is dumped with ``protocol_policy`` applied to ``rtp``
       and one ``-d udp.port==<port>,rtp`` option per collected port.

    Args:
        source_pcap: path of the capture file handed to ``tshark -r``.
        protocol_policy: tshark option placed before ``rtp``
            (the caller passes ``--enable-protocol`` or ``--disable-protocol``).

    Returns:
        The text printed by the second tshark pass (stacked JSON documents).
    """
    # Retrieve all STUN packets
    stun_output = _run_tshark(['tshark', '-r', source_pcap, '-l', '-n', '-T', 'ek', '-Y (stun)'])

    # From the STUN packets, find which ports are used by RTP
    used_port = set()
    for obj in decode_stacked(stun_output):
        if 'index' in obj:
            continue
        layers = obj['layers']
        if 'stun' in layers and "0x00000101" in layers["stun"]["stun_stun_type"]:  # 0x0101 means success
            used_port.add(layers["udp"]["udp_udp_srcport"])
            used_port.add(layers["udp"]["udp_udp_dstport"])

    command = ['tshark', '-r', source_pcap, '-l', '-n', '-T', 'ek', protocol_policy, 'rtp']
    for port in used_port:
        command.append("-d udp.port==" + port + ",rtp")
    return _run_tshark(command)


#Read json created with tshark and put it in a list
def json_to_list(output):
    """Group the UDP packets of a stacked-JSON tshark dump by flow.

    A flow is identified by the tuple
    ``(source_addr, dest_addr, source_port, dest_port)``; addresses come from
    the ``ipv6`` layer when present, otherwise from the ``ip`` layer. Entries
    that carry an ``index`` key, packets without a ``udp`` layer and UDP
    packets with neither an ``ip`` nor an ``ipv6`` layer are skipped.

    Args:
        output: text made of concatenated JSON documents, each packet exposing
            its dissected protocols under ``obj['layers']``.

    Returns:
        A tuple ``(dict_flow_data, df_unique_flow, unique_flow)``:

        * ``dict_flow_data`` maps each flow tuple to a DataFrame with columns
          ``frame_num``, ``len_udp``, ``len_frame``, ``timestamps`` and
          ``label`` (``"RTP"`` when the packet has an ``rtp`` layer, else
          ``"Not RTP"``), one row per packet in dump order;
        * ``df_unique_flow`` is a DataFrame with one row per flow and columns
          ``source_addr``, ``dest_addr``, ``source_port``, ``dest_port``;
        * ``unique_flow`` is the set of flow tuples.
    """
    packet_columns = ['frame_num', 'len_udp', 'len_frame', 'timestamps', 'label']
    flow_columns = ['source_addr', 'dest_addr', 'source_port', 'dest_port']

    def parse_udp_packet(layers):
        """Return ``(flow_tuple, packet_row)`` or ``None`` when no IP layer is present."""
        if "ipv6" in layers:
            source_addr = layers['ipv6']['ipv6_ipv6_src']
            dest_addr = layers['ipv6']['ipv6_ipv6_dst']
        elif "ip" in layers:
            source_addr = layers['ip']['ip_ip_src']
            dest_addr = layers['ip']['ip_ip_dst']
        else:
            # Without an address the flow tuple cannot be built.
            return None
        udp = layers['udp']
        frame = layers['frame']
        flow = (source_addr, dest_addr,
                int(udp['udp_udp_srcport']), int(udp['udp_udp_dstport']))
        row = [int(frame['frame_frame_number']),
               int(udp['udp_udp_length']),
               int(frame['frame_frame_len']),
               float(frame['frame_frame_time_epoch']),
               "RTP" if 'rtp' in layers else "Not RTP"]
        return flow, row

    # flow tuple -> list of packet rows, filled in dump order
    dict_data = {}
    for obj in decode_stacked(output):
        # entries holding only index:date and type:pcap_file carry no packet
        if 'index' in obj:
            continue
        layers = obj['layers']
        if 'udp' not in layers:
            continue
        parsed = parse_udp_packet(layers)
        if parsed is None:
            continue
        flow, row = parsed
        dict_data.setdefault(flow, []).append(row)

    unique_flow = set(dict_data)
    dict_flow_data = {flow: pd.DataFrame(dict_data[flow], columns=packet_columns)
                      for flow in unique_flow}
    # Built in one go: appending row by row copies the whole frame each time
    # and relies on DataFrame.append, which newer pandas no longer provides.
    df_unique_flow = pd.DataFrame([list(flow) for flow in unique_flow], columns=flow_columns)

    return dict_flow_data, df_unique_flow, unique_flow

#    return df_rtp, l_rtp, l_non_rtp, l_stun, l_rtcp, l_turn, l_tcp, l_only_udp, unique_flow


#Create nicknames for every flow
def make_nicknames(dict_flow_data):
    """Label every flow as ``"video"`` or ``"audio"`` from its mean frame length.

    Args:
        dict_flow_data: mapping of flow key to a DataFrame that has a
            ``len_frame`` column.

    Returns:
        dict mapping each flow key to ``"video"`` when the mean of
        ``len_frame`` is strictly greater than 300, otherwise ``"audio"``.
    """
    threshold = 300

    def nickname_for(flow_df):
        return "video" if flow_df["len_frame"].mean() > threshold else "audio"

    return {key: nickname_for(flow_df) for key, flow_df in dict_flow_data.items()}



#Make dictionary flow : df, bytes_per_second, packets_per_second
def make_rtp_data(dict_flow_data):
    """Compute per-second rates and inter-packet gaps for every flow.

    Each flow DataFrame is sorted by ``timestamps`` first. It must provide the
    columns ``timestamps`` (interpreted as epoch seconds), ``rtp_timestamp``
    and ``len_frame``.

    Args:
        dict_flow_data: mapping of flow id to its packet DataFrame.

    Returns:
        A tuple of four dicts keyed by flow id:
        ``(packets_per_second, bytes_per_second, inter_packet_gap_s,
        inter_rtp_timestamp_gap)`` where

        * ``packets_per_second`` counts the non-null values of the first
          column in each one-second bin;
        * ``bytes_per_second`` sums ``len_frame`` in each one-second bin;
        * ``inter_packet_gap_s`` holds the differences between consecutive
          ``timestamps``;
        * ``inter_rtp_timestamp_gap`` holds the differences between
          consecutive ``rtp_timestamp`` values.
    """
    packets_per_second = {}
    bytes_per_second = {}
    inter_packet_gap_s = {}
    inter_rtp_timestamp_gap = {}

    for flow_id, flow_df in dict_flow_data.items():
        inner_df = flow_df.sort_values('timestamps')
        # The first row has no predecessor, so its NaN difference is dropped.
        inter_packet_gap_s[flow_id] = inner_df['timestamps'].diff().dropna()
        inter_rtp_timestamp_gap[flow_id] = inner_df['rtp_timestamp'].diff().dropna()

        # resample needs a datetime index
        time_index = pd.to_datetime(inner_df['timestamps'], unit='s')
        indexed = inner_df.set_index(time_index)
        # Lowercase 's' is the one-second alias already used by the rest of
        # the file ("<n>s"); the uppercase form is deprecated in recent pandas.
        packets_per_second[flow_id] = indexed.iloc[:, 0].resample('s').count()
        bytes_per_second[flow_id] = indexed['len_frame'].resample('s').sum()

    return packets_per_second, bytes_per_second, inter_packet_gap_s, inter_rtp_timestamp_gap



def calculate_packet_loss(dict_flow_data):
    """Sum the gaps in the sorted ``rtp_seq_num`` values of every flow.

    For each flow the sequence numbers are sorted and the difference between
    neighbours is taken; every difference other than 1 contributes
    ``difference - 1`` to the flow's total.

    Args:
        dict_flow_data: mapping of flow id to a DataFrame with an
            ``rtp_seq_num`` column.

    Returns:
        dict mapping each flow id to that total.
    """
    dict_flow_packet_loss = {}
    for flow_id, flow_df in dict_flow_data.items():
        ordered = flow_df['rtp_seq_num'].sort_values()
        # The first packet has no predecessor: treat its step as 1 (no gap).
        steps = ordered.diff().fillna(1)
        irregular = steps[steps != 1]
        dict_flow_packet_loss[flow_id] = (irregular - 1).sum()
    return dict_flow_packet_loss
#    


def inter_statistics(dict_flow_data):
    """Add packet-to-packet difference columns to every flow DataFrame, in place.

    Two columns are written on each DataFrame: ``interarrival`` (difference
    between consecutive ``timestamps``) and ``interlength_udp`` (difference
    between consecutive ``len_udp`` values). The first row of each is NaN.

    Args:
        dict_flow_data: mapping of flow id to a DataFrame with ``timestamps``
            and ``len_udp`` columns.

    Returns:
        The same ``dict_flow_data`` object, after modification.
    """
    derived = (("interarrival", "timestamps"), ("interlength_udp", "len_udp"))
    for flow_df in dict_flow_data.values():
        for new_column, source_column in derived:
            flow_df[new_column] = flow_df[source_column].diff()
    return dict_flow_data

def kbps(series):
    """Return the sum of ``series`` multiplied by 8 and divided by 1024.

    Used as an aggregation function on the ``len_udp`` column when the
    datasets are built.
    """
    total = series.sum()
    return total * 8 / 1024

def max_count(series):
    """Return the majority label of ``series``: ``"RTP"`` or ``"Not RTP"``.

    ``"RTP"`` is returned only when it occurs strictly more often than
    ``"Not RTP"``; ties and empty input give ``"Not RTP"``.
    """
    rtp_votes = (series == "RTP").sum()
    other_votes = (series == "Not RTP").sum()
    if rtp_votes > other_votes:
        return "RTP"
    return "Not RTP"


In [None]:
# Window-size sweep: for every window from 1 s to 10 s, rebuild the train and
# test feature CSVs from the capture files, train/score the classifiers on
# them and finally plot accuracy against window size.
data = {}
save_dir = "/home/det_tesi/sgarofalo/Window size analysis"
dataset_dirs = ["/home/det_tesi/sgarofalo/Window size analysis/train_dataset",
                "/home/det_tesi/sgarofalo/Window size analysis/test_dataset"]
dataset_files = ["train_dataset.csv", "test_dataset.csv"]
for seconds in range(1, 11):
    seconds_samples = str(seconds) + "s"
    print("Building datasets with seconds_samples = " + seconds_samples)
    for source_pcap, filename in zip(dataset_dirs, dataset_files):
        # Keep entries whose first dot-separated suffix starts with "pcap";
        # entries without any dot are skipped instead of raising IndexError.
        arr = []
        for x in os.listdir(source_pcap):
            parts = x.split(".")
            if len(parts) > 1 and parts[1].startswith(("pcapng", "pcap")):
                arr.append(x)
        merged_frames = []
        for file in arr:
            file_path = source_pcap + "/" + file
            protocol_policy = "--disable-protocol" if 'game' in file else '--enable-protocol'
            output = pcap_to_json(file_path, protocol_policy)
            dict_flow_data, df_unique_flow, unique_flow = json_to_list(output)
            dict_flow_data = inter_statistics(dict_flow_data)
            window_frames = []
            for flow_id in dict_flow_data:
                flow_df = dict_flow_data[flow_id]
                flow_df["timestamps"] = pd.to_datetime(flow_df["timestamps"], unit='s')
                flow_df.set_index('timestamps', inplace=True)
                # One row per window; the column order set here is the order
                # of the CSV columns (label last).
                train = flow_df.resample(seconds_samples).agg({'interarrival': ['std', 'mean'],
                                                                'len_udp': ['std', 'mean', 'count', kbps],
                                                                'interlength_udp': ['mean'],
                                                                "label": max_count})
                window_frames.append(train)
            # Concatenate once per file rather than growing a frame in the loop.
            df_train = pd.concat(window_frames) if window_frames else pd.DataFrame()
            # Windows with any undefined statistic are discarded.
            df_train_dropped = df_train.dropna()
            if not df_train_dropped.empty:
                merged_frames.append(df_train_dropped)
        df_merged = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()
        df_merged.to_csv(source_pcap + "/" + filename, sep=',', encoding='utf-8', index=False)
    classifiers(data, seconds, save_dir)
    print()
columns = ["SVM", "RF", "KNN"]
# One row per evaluated window size, in insertion order.
df = pd.DataFrame([data[i] for i in data], columns=columns)
df.index += 1
plt.figure(figsize=(20, 16))
plt.plot(df)
plt.xlabel("Seconds")
plt.ylabel("Accuracy")
plt.legend(columns, fontsize=16)
t = "Window size analysis"
plt.title(t, fontsize=16)
path = "/home/det_tesi/sgarofalo/Window size analysis/result"
# savefig does not create missing directories.
os.makedirs(path, exist_ok=True)
plt.savefig(path + "/" + t + '.png', dpi=100)

Building datasets with seconds_samples = 1s
/home/det_tesi/sgarofalo/Window size analysis/train_dataset/python_tm.pcap
/home/det_tesi/sgarofalo/Window size analysis/train_dataset/python_tm_2.pcap
/home/det_tesi/sgarofalo/Window size analysis/train_dataset/Simone_cattura_webex_app_ethernet_07_11_2_people_video.pcapng
/home/det_tesi/sgarofalo/Window size analysis/train_dataset/Simone_cattura_webex_app_ethernet_07_11_2_people_audio_video_doing_nothing.pcapng
/home/det_tesi/sgarofalo/Window size analysis/train_dataset/udp_nonRTP_traffic.pcapng
