In [119]:
import subprocess
import sys
import io
import numpy as np
import pandas as pd

#Интервал времени в секундах деления пакетов
DATE_INTERVAL = 10
#Список конечных параметров для сравнения. В коде почечены темиже числами для ориентирования
    'client_package_size_mean', #1
    'client_package_size_std', #2
    'server_package_size_mean', #3
    'server_package_size_std', #4
    'client_batch_sizes_mean', #5
    'client_batch_sizes_std', #6
    'server_batch_sizes_mean', #7
    'server_batch_sizes_std', #8
    'client_batch_counts_mean', #9
    'server_batch_counts_mean', #10
    'client_efficiency', #11
    'server_efficiency', #12
    'ratio_sizes', #13
    'ratio_application_size', #14
    'ratio_packages', #15
    'client_package_size_sum', #16
    'client_application_size_sum', #17
    'client_package_count', #18
    'client_batch_counts_sum', #19
    'server_package_size_sum', #20
    'server_application_size_sum', #21
    'server_package_count', #22
    'server_batch_counts_sum', #23
    'transport_protocol', #24
    'ip_protocol_version', #25
]

pcap_file_path = '../selpack.pcap'
#Читаем .pcap файл и экспортируем в cvs
pcapToCVSCom = 'tshark -r '+ pcap_file_path + ' -T fields \
-e frame.time_epoch -e ip.src -e ip.dst -e ip.proto -e frame.len -e ip.version \
-e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e ip.hdr_len -e tcp.hdr_len \
-E separator=, -E quote=d -E header=y -E occurrence=f'
proc = subprocess.Popen(pcapToCVSCom.split() + ['-Y', '(ip.proto == 17 || ip.proto == 6) && !icmp'],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = proc.communicate()
data = ''
if error:
    sys.exit(error.decode("utf-8")) #или "ISO-8859-1"
else:
    data = output.decode("utf-8")
    f = open("./sel.csv", "w")
    f.write(output.decode("utf-8"))
    f.close()

In [106]:
# Чтение csv файлы. В данной случае читается из созданного потока, для чтения файла использовать:
# traffic = pd.read_csv('/Users/theeska/Downloads/traffic.csv', encoding = "ISO-8859-1", low_memory=False)
traffic = pd.read_csv(io.StringIO(data), sep=',')

#Получение ip клиента
clientIP = pd.concat([traffic['ip.src'], traffic['ip.dst']]).value_counts().idxmax()

#Преобразование портов в один стобец
traffic['srcport'] = traffic['udp.srcport'].combine(traffic['tcp.srcport'], lambda x,y: x if np.isnan(y) else y).astype(int)
traffic['dstport'] = traffic['tcp.dstport'].combine(traffic['udp.dstport'], lambda x,y: x if np.isnan(y) else y).astype(int)
traffic = traffic.drop(['tcp.srcport', 'udp.srcport', 'tcp.dstport', 'udp.dstport'], axis=1)

#Получение конечных точек (<IP>:<port>) и указание направления трафика для пакета
traffic['isFromClient'] = traffic['ip.src'] == clientIP
traffic['src'] = traffic['ip.src'] + ":"  + traffic['srcport'].apply(str)
traffic['dest'] = traffic['ip.dst'] + ":" + traffic['dstport'].apply(str)
traffic = traffic.drop(['srcport', 'dstport', 'ip.src', 'ip.dst'], axis=1)

#Подстановка длины хэддера для UDP
traffic = traffic.rename(columns={"tcp.hdr_len": "transport_header"})
traffic.loc[traffic['ip.proto'] == 17, 'transport_header'] = int(8)

#рассчет прикладной нагрузки пакета
traffic['application_size'] = traffic['frame.len'] - 14 - traffic['ip.hdr_len'] - traffic['transport_header']

#Преобразование UNIX-времени к pandas.datetime64 группировка с интервалом DATE_INTERVAL секунд
traffic['frame.time_epoch'] = pd.to_datetime(traffic['frame.time_epoch'], unit='s')
grouped = traffic.groupby(pd.Grouper(freq=str(DATE_INTERVAL) + 'S', key='frame.time_epoch'))
intervalledPackeges = list()
for key, _ in grouped:
    intervalledPackeges.append((key, grouped.get_group(key)))
    
    
#Разделение пакетов в каждом интервале на потоки (пакеты между двумя конечными точками)
intervalledFlows = []
for timeGroup in intervalledPackeges:
    grouped = timeGroup[1].groupby(['src', 'dest'])
    finallMap = {}
    for key, dataframe in grouped:
        route = frozenset(key)
        if route in finallMap: 
            finallMap[route] = pd.concat([finallMap[route], dataframe])
        else:
            finallMap[route] = dataframe
    intervalledFlows.append((timeGroup[0], finallMap))

#Выпрямление стркутуры хранения, чтобы хранить все датасеты в массиве кортежей
#Кортеж (<интервал времени>, <конечные точки>, <пакеты>)
allFlowsList = []
for intervals in intervalledFlows:
    for flowName in intervals[1]:
        allFlowsList.append((intervals[0], flowName, intervals[1][flowName]))
        
#Очистка ненужных больших данных
#del traffic, intervalledPackeges, intervalledFlows, grouped, timeGroup, finallMap

In [116]:
flow = flow.sort_values(by='frame.time_epoch', ascending=True)
statistic_data = {}

#Пакетовые показатели клиента
client_flow = flow[flow['isFromClient'] == True]
statistic_data['client_package_size_mean'] = client_flow['frame.len'].mean()#1
statistic_data['client_package_size_std'] = client_flow['frame.len'].std()#2
statistic_data['client_package_size_sum'] = client_flow['frame.len'].sum()#16
statistic_data['client_application_size_sum'] = client_flow['application_size'].sum()#17
statistic_data['client_package_count'] = client_flow.shape[0]#18
statistic_data['client_efficiency'] = statistic_data['client_application_size_sum'] / statistic_data['client_package_size_sum']#11

#Пакетовые показатели сервера
server_flow = flow[flow['isFromClient'] == False]
statistic_data['server_package_size_mean'] = server_flow['frame.len'].mean()#3
statistic_data['server_package_size_std'] = server_flow['frame.len'].std()#4
statistic_data['server_package_size_sum'] = server_flow['frame.len'].sum()#20
statistic_data['server_application_size_sum'] = server_flow['application_size'].sum()#21
statistic_data['server_package_count'] = server_flow.shape[0]#22
statistic_data['server_efficiency'] = statistic_data['server_application_size_sum'] / statistic_data['server_package_size_sum']#12


#Пакетовые показатели отношения клиент к сервер
statistic_data['ratio_sizes'] = statistic_data['client_package_size_sum'] / statistic_data['server_package_size_sum']#13
statistic_data['ratio_application_size'] = statistic_data['client_application_size_sum'] / statistic_data['server_application_size_sum'] #14
statistic_data['ratio_packages'] = statistic_data['client_package_count'] / statistic_data['server_package_count']#15

statistic_data['transport_protocol'] = flow['ip.proto'].value_counts().idxmax()#24
statistic_data['ip_protocol_version'] = flow['ip.version'].value_counts().idxmax()#25

In [117]:
#Расчет партий(batch) пакетов 
#batch_conf = (<количество полезных(имеющих прикладную нагрузку) пакетов>, <суммарный размер пакетов в партии>)
isClientSender = flow['isFromClient'].iloc[0]
client_batches = []
server_batches = []
current_batch_size = 0
current_useful_package_count = 0
for index, row in flow.iterrows():

    #Нет полезной нагрузки
    if row['application_size'] == 0:
        continue
        
    #Направление нагрузки не изменилось
    if row['isFromClient'] == isClientSender:
        current_batch_size += row['frame.len']
        current_useful_package_count += 1
        continue
        
    #Направление нагрузки изменилось, поэтому записываем и подготоваливаем счетчики
    batch_conf = (current_useful_package_count, current_batch_size)
    client_batches.append(batch_conf) if isClientSender else server_batches.append(batch_conf)
    current_batch_size = row['frame.len']
    current_useful_package_count = 1
    isClientSender = row['isFromClient']
batch_conf = (current_useful_package_count, current_batch_size)
client_batches.append(batch_conf) if isClientSender else server_batches.append(batch_conf)

#Перевод в массивы numpy.array для статистических расчетов
client_batches_countes = np.array(list(map(lambda x: x[0], client_batches)))
client_batches_sizes = np.array(list(map(lambda x: x[1], client_batches)))
server_batches_countes = np.array(list(map(lambda x: x[0], server_batches)))
server_batches_sizes = np.array(list(map(lambda x: x[1], server_batches)))

#Партийные метрики клиента
statistic_data['client_batch_sizes_mean'] = client_batches_sizes.mean()#5
statistic_data['client_batch_sizes_std'] = client_batches_sizes.std()#6
statistic_data['client_batch_counts_mean'] = client_batches_countes.mean()#9
statistic_data['client_batch_counts_sum'] = len(client_batches_countes)#19

#Партийные метрики сервера
statistic_data['server_batch_sizes_mean'] = server_batches_sizes.mean()#7
statistic_data['server_batch_sizes_std'] = server_batches_sizes.std()#8
statistic_data['server_batch_counts_mean'] = server_batches_countes.mean()#10
statistic_data['server_batch_counts_sum'] = len(server_batches_countes)#23

In [186]:
df = pd.DataFrame(columns= ['route', 'timestamp'] + FEATURES)
for i, flow in enumerate(allFlowsList):
    df.loc[i] = getStatisticDataFromFlow(flow)
df 

  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  if sys.path[0] == '':


Unnamed: 0,route,timestamp,client_package_size_mean,client_package_size_std,server_package_size_mean,server_package_size_std,client_batch_sizes_mean,client_batch_sizes_std,server_batch_sizes_mean,server_batch_sizes_std,...,client_package_size_sum,client_application_size_sum,client_package_count,client_batch_counts_sum,server_package_size_sum,server_application_size_sum,server_package_count,server_batch_counts_sum,transport_protocol,ip_protocol_version
0,"(192.168.1.64:54759, 2.22.238.56:443)",2020-05-14 11:18:40,78.6,18.049931,63.6,15.646086,195.0,0.0,90.0,0.0,...,393,63.0,5,1,318,24.0,5,1,6,4
1,"(192.168.1.64:54770, 17.253.123.203:80)",2020-05-14 11:18:50,110.285714,111.966832,609.5,713.942225,364.0,0.0,3451.0,0.0,...,772,298.0,7,1,3657,3253.0,6,1,6,4
2,"(192.168.1.64:54719, 173.194.73.101:443)",2020-05-14 11:18:50,66.0,,60.0,,,,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4
3,"(173.194.73.101:443, 192.168.1.64:54721)",2020-05-14 11:18:50,66.0,,60.0,,,,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4
4,"(192.168.1.64:54771, 178.18.141.21:443)",2020-05-14 11:18:50,285.0,481.936977,529.785714,627.894428,1405.0,1464.657184,2342.333333,1408.180939,...,4845,3891.0,17,3,7417,6649.0,14,3,6,4
5,"(178.237.20.123:443, 192.168.1.64:54569)",2020-05-14 11:18:50,54.0,,66.0,,0.0,0.0,,,...,54,0.0,1,1,66,0.0,1,0,6,4
6,"(192.168.1.1:53, 192.168.1.64:50475)",2020-05-14 11:18:50,88.0,,226.0,,88.0,0.0,226.0,0.0,...,88,46.0,1,1,226,184.0,1,1,17,4
7,"(192.168.1.64:54615, 192.168.1.1:53)",2020-05-14 11:18:50,80.0,,386.0,,80.0,0.0,386.0,0.0,...,80,38.0,1,1,386,344.0,1,1,17,4
8,"(192.168.1.64:50541, 196.55.247.168:9505)",2020-05-14 11:18:50,216.857143,190.009148,152.333333,82.218408,418.0,37.478883,228.5,199.626526,...,1518,1056.0,7,3,914,518.0,6,4,6,4
9,"(64.233.165.147:443, 192.168.1.64:54664)",2020-05-14 11:18:50,66.0,,60.0,,,,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4


Unnamed: 0,route,timestamp,client_package_size_mean,client_package_size_std,server_package_size_mean,server_package_size_std,client_batch_sizes_mean,client_batch_sizes_std,server_batch_sizes_mean,server_batch_sizes_std,...,client_package_size_sum,client_application_size_sum,client_package_count,client_batch_counts_sum,server_package_size_sum,server_application_size_sum,server_package_count,server_batch_counts_sum,transport_protocol,ip_protocol_version
0,"(192.168.1.64:54759, 2.22.238.56:443)",2020-05-14 11:18:40,78.6,18.049931,63.6,15.646086,195.0,0.0,90.0,0.0,...,393,63.0,5,1,318,24.0,5,1,6,4
1,"(192.168.1.64:54770, 17.253.123.203:80)",2020-05-14 11:18:50,110.285714,111.966832,609.5,713.942225,364.0,0.0,3451.0,0.0,...,772,298.0,7,1,3657,3253.0,6,1,6,4
2,"(192.168.1.64:54719, 173.194.73.101:443)",2020-05-14 11:18:50,66.0,0.0,60.0,0.0,0.0,0.0,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4
3,"(173.194.73.101:443, 192.168.1.64:54721)",2020-05-14 11:18:50,66.0,0.0,60.0,0.0,0.0,0.0,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4
4,"(192.168.1.64:54771, 178.18.141.21:443)",2020-05-14 11:18:50,285.0,481.936977,529.785714,627.894428,1405.0,1464.657184,2342.333333,1408.180939,...,4845,3891.0,17,3,7417,6649.0,14,3,6,4
5,"(178.237.20.123:443, 192.168.1.64:54569)",2020-05-14 11:18:50,54.0,0.0,66.0,0.0,0.0,0.0,0.0,0.0,...,54,0.0,1,1,66,0.0,1,0,6,4
6,"(192.168.1.1:53, 192.168.1.64:50475)",2020-05-14 11:18:50,88.0,0.0,226.0,0.0,88.0,0.0,226.0,0.0,...,88,46.0,1,1,226,184.0,1,1,17,4
7,"(192.168.1.64:54615, 192.168.1.1:53)",2020-05-14 11:18:50,80.0,0.0,386.0,0.0,80.0,0.0,386.0,0.0,...,80,38.0,1,1,386,344.0,1,1,17,4
8,"(192.168.1.64:50541, 196.55.247.168:9505)",2020-05-14 11:18:50,216.857143,190.009148,152.333333,82.218408,418.0,37.478883,228.5,199.626526,...,1518,1056.0,7,3,914,518.0,6,4,6,4
9,"(64.233.165.147:443, 192.168.1.64:54664)",2020-05-14 11:18:50,66.0,0.0,60.0,0.0,0.0,0.0,60.0,0.0,...,66,0.0,1,0,60,6.0,1,1,6,4


In [180]:
def getStatisticDataFromFlow(flow_data):
    flow = flow_data[2].sort_values(by='frame.time_epoch', ascending=True)
    statistic_data = {}

    #Пакетовые показатели клиента
    client_flow = flow[flow['isFromClient'] == True]
    statistic_data['client_package_size_mean'] = client_flow['frame.len'].mean()#1
    statistic_data['client_package_size_std'] = client_flow['frame.len'].std()#2
    statistic_data['client_package_size_sum'] = client_flow['frame.len'].sum()#16
    statistic_data['client_application_size_sum'] = client_flow['application_size'].sum()#17
    statistic_data['client_package_count'] = client_flow.shape[0]#18
    statistic_data['client_efficiency'] = statistic_data['client_application_size_sum'] / statistic_data['client_package_size_sum']#11

    #Пакетовые показатели сервера
    server_flow = flow[flow['isFromClient'] == False]
    statistic_data['server_package_size_mean'] = server_flow['frame.len'].mean()#3
    statistic_data['server_package_size_std'] = server_flow['frame.len'].std()#4
    statistic_data['server_package_size_sum'] = server_flow['frame.len'].sum()#20
    statistic_data['server_application_size_sum'] = server_flow['application_size'].sum()#21
    statistic_data['server_package_count'] = server_flow.shape[0]#22
    statistic_data['server_efficiency'] = statistic_data['server_application_size_sum'] / statistic_data['server_package_size_sum']#12


    #Пакетовые показатели отношения клиент к сервер
    statistic_data['ratio_sizes'] = statistic_data['client_package_size_sum'] / statistic_data['server_package_size_sum']#13
    statistic_data['ratio_application_size'] = statistic_data['client_application_size_sum'] / statistic_data['server_application_size_sum'] #14
    statistic_data['ratio_packages'] = statistic_data['client_package_count'] / statistic_data['server_package_count']#15

    statistic_data['transport_protocol'] = flow['ip.proto'].value_counts().idxmax()#24
    statistic_data['ip_protocol_version'] = flow['ip.version'].value_counts().idxmax()#25
    
    
    #Расчет партий(batch) пакетов 
    #batch_conf = (<количество полезных(имеющих прикладную нагрузку) пакетов>, <суммарный размер пакетов в партии>)
    isClientSender = flow['isFromClient'].iloc[0]
    client_batches = []
    server_batches = []
    current_batch_size = 0
    current_useful_package_count = 0
    for index, row in flow.iterrows():

        #Нет полезной нагрузки
        if row['application_size'] == 0:
            continue

        #Направление нагрузки не изменилось
        if row['isFromClient'] == isClientSender:
            current_batch_size += row['frame.len']
            current_useful_package_count += 1
            continue

        #Направление нагрузки изменилось, поэтому записываем и подготоваливаем счетчики
        batch_conf = (current_useful_package_count, current_batch_size)
        client_batches.append(batch_conf) if isClientSender else server_batches.append(batch_conf)
        current_batch_size = row['frame.len']
        current_useful_package_count = 1
        isClientSender = row['isFromClient']
    batch_conf = (current_useful_package_count, current_batch_size)
    client_batches.append(batch_conf) if isClientSender else server_batches.append(batch_conf)

    #Перевод в массивы numpy.array для статистических расчетов
    client_batches_countes = np.array(list(map(lambda x: x[0], client_batches)))
    client_batches_sizes = np.array(list(map(lambda x: x[1], client_batches)))
    server_batches_countes = np.array(list(map(lambda x: x[0], server_batches)))
    server_batches_sizes = np.array(list(map(lambda x: x[1], server_batches)))

    #Партийные метрики клиента
    statistic_data['client_batch_sizes_mean'] = client_batches_sizes.mean()#5
    statistic_data['client_batch_sizes_std'] = client_batches_sizes.std()#6
    statistic_data['client_batch_counts_mean'] = client_batches_countes.mean()#9
    statistic_data['client_batch_counts_sum'] = len(client_batches_countes)#19

    #Партийные метрики сервера
    statistic_data['server_batch_sizes_mean'] = server_batches_sizes.mean()#7
    statistic_data['server_batch_sizes_std'] = server_batches_sizes.std()#8
    statistic_data['server_batch_counts_mean'] = server_batches_countes.mean()#10
    statistic_data['server_batch_counts_sum'] = len(server_batches_countes)#23
    
    #Создает массив со всеми параметрами потока ля конечного датасета
    df_row = [flow_data[1], flow_data[0]]
    for name in FEATURES:
        df_row.append(statistic_data[name])
    return df_row

In [192]:
stats = getStatisticDataFromFlow(allFlowsList[0])

In [193]:
stats = None
if stats:
    print(stats)