In [353]:
import subprocess
import sys
import io
import numpy as np
import pandas as pd

DATE_INTERVAL = 10  # width of one time bucket in seconds (used to build the grouping frequency)

# Read the .pcap file with tshark and export the selected fields as CSV.
# Adjacent string literals are concatenated, so this is still one command-line string.
pcapToCVSCom = ('tshark -r /Users/theeska/Downloads/dimapac.pcap -T fields '
                '-e frame.time_epoch -e ip.src -e ip.dst -e ip.proto -e frame.len -e ip.version '
                '-e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e ip.hdr_len -e tcp.hdr_len '
                '-E separator=, -E quote=d -E header=y -E occurrence=f')
# The display filter keeps only packets with ip.proto 6 or 17 (TCP / UDP).
proc = subprocess.Popen(pcapToCVSCom.split() + ['-Y', 'ip.proto==6 or ip.proto==17'],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = proc.communicate()

# Decide success by the exit status: text on stderr alone does not mean the export failed.
if proc.returncode != 0:
    # errors="replace" so that a badly encoded message cannot hide the real failure
    sys.exit(error.decode("utf-8", errors="replace"))
if error:
    # Non-fatal diagnostics: show them, but keep the exported data.
    print(error.decode("utf-8", errors="replace"), file=sys.stderr)

# The context manager closes the file even if decoding or writing fails.
with open("/Users/theeska/Downloads/sel.csv", "w", encoding="utf-8") as f:
    f.write(output.decode("utf-8"))

In [355]:
# Read the CSV. It is parsed directly from the tshark bytes that the export cell leaves in
# `output`; to read from a file instead, replace with:
# traffic = pd.read_csv('/Users/theeska/Downloads/traffic.csv', encoding = "ISO-8859-1", low_memory=False)
traffic = pd.read_csv(io.StringIO(output.decode("utf-8")), sep=',')

# Client IP: the address that occurs most often as either source or destination.
clientIP = pd.concat([traffic['ip.src'], traffic['ip.dst']]).value_counts().idxmax()

# Merge the TCP and UDP port columns into a single source / destination port per packet
# (the TCP value is used when present, otherwise the UDP value).
srcport = traffic['tcp.srcport'].fillna(traffic['udp.srcport'])
dstport = traffic['tcp.dstport'].fillna(traffic['udp.dstport'])

# Rows with no port at all (NaN in both protocol columns) cannot be turned into an
# "IP:port" endpoint and would make astype(int) raise, so they are dropped.
# NOTE(review): presumably these are non-first IP fragments - confirm against the capture.
hasPorts = srcport.notna() & dstport.notna()
traffic = traffic[hasPorts].copy()  # copy so the column assignments below act on an independent frame

# Endpoints ("IP:port") and traffic direction of every packet.
traffic['isFromClient'] = traffic['ip.src'] == clientIP
traffic['src'] = traffic['ip.src'] + ":" + srcport[hasPorts].astype(int).astype(str)
traffic['dest'] = traffic['ip.dst'] + ":" + dstport[hasPorts].astype(int).astype(str)
traffic = traffic.drop(['tcp.srcport', 'udp.srcport', 'tcp.dstport', 'udp.dstport',
                        'ip.src', 'ip.dst'], axis=1)

# Transport header length: tshark reports it for TCP only, so rows with ip.proto 17 (UDP)
# get the fixed value 8.
traffic = traffic.rename(columns={"tcp.hdr_len": "transport_header"})
traffic.loc[traffic['ip.proto'] == 17, 'transport_header'] = 8

# Convert UNIX time to pandas datetime64 and bucket the packets into DATE_INTERVAL-second intervals.
traffic['frame.time_epoch'] = pd.to_datetime(traffic['frame.time_epoch'], unit='s')
# Lower-case 's' is the seconds alias; the upper-case 'S' spelling is deprecated in recent pandas.
grouped = traffic.groupby(pd.Grouper(freq=str(DATE_INTERVAL) + 's', key='frame.time_epoch'))
# Take the frame yielded by the iteration instead of a second get_group() lookup per key,
# and skip buckets that hold no packets so later stages never receive an empty frame.
intervalledPackeges = list()
for key, packets in grouped:
    if not packets.empty:
        intervalledPackeges.append((key, packets))
    
    
# Split the packets of every interval into flows: all packets exchanged between the same
# two endpoints, regardless of direction.
# Result: list of (<interval start>, {frozenset of endpoints: packets}).
intervalledFlows = []
for timeGroup in intervalledPackeges:
    intervalStart, intervalPackets = timeGroup
    grouped = intervalPackets.groupby(['src', 'dest'])
    finallMap = {}
    for key, dataframe in grouped:
        # (src, dest) and (dest, src) collapse to the same unordered key.
        route = frozenset(key)
        alreadySeen = finallMap.get(route)
        finallMap[route] = dataframe if alreadySeen is None else pd.concat([alreadySeen, dataframe])
    intervalledFlows.append((intervalStart, finallMap))

# Flatten the nested structure so that every flow is one entry of a single list.
# Each entry is a tuple (<interval start>, <endpoints>, <packets>).
allFlowsList = [
    (intervalStart, flowName, flowPackets)
    for intervalStart, flowMap in intervalledFlows
    for flowName, flowPackets in flowMap.items()
]

# del traffic, intervalledPackeges, intervalledFlows, grouped, timeGroup, finallMap

In [None]:
# Application payload per packet: frame length minus link, IP and transport headers.
# NOTE(review): the constant 14 is presumably the Ethernet II header length - confirm.
# `flow` must already hold the packets of one flow (it is not assigned in this cell).
headers_size = 14 + flow['ip.hdr_len'] + flow['transport_header']
flow['application_size'] = flow['frame.len'] - headers_size

# Most frequent transport protocol number and IP version within the flow.
protocol_counts = flow['ip.proto'].value_counts()
transport_protocol = protocol_counts.idxmax()
version_counts = flow['ip.version'].value_counts()
ip_protocol_version = version_counts.idxmax()

In [None]:
def _ratio(numerator, denominator):
    """Divide with NumPy float semantics.

    Returns inf (or nan for 0/0) when the denominator is zero instead of raising
    ZeroDivisionError, and does so without emitting a RuntimeWarning.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.float64(numerator) / np.float64(denominator)


flow = flow.sort_values(by='frame.time_epoch', ascending=True)
from_client = flow['isFromClient'].astype(bool)

# Packet metrics of the client (the trailing #N are the metric numbers used by the author)
client_flow = flow[from_client]
client_package_size_mean = client_flow['frame.len'].mean()#1
client_package_size_std = client_flow['frame.len'].std()#2
client_package_size_sum = client_flow['frame.len'].sum()#16
client_size_sum = client_package_size_sum  # original name of metric #16, kept for compatibility
client_application_size_sum = client_flow['application_size'].sum()#17
client_package_count = client_flow.shape[0]#18
client_efficiency = _ratio(client_application_size_sum, client_package_size_sum)#11

# Packet metrics of the server
server_flow = flow[~from_client]
server_package_size_mean = server_flow['frame.len'].mean()#3
server_package_size_std = server_flow['frame.len'].std()#4
server_package_size_sum = server_flow['frame.len'].sum()#20
server_application_size_sum = server_flow['application_size'].sum()#21
server_package_count = server_flow.shape[0]#22
server_efficiency = _ratio(server_application_size_sum, server_package_size_sum)#12


# Client-to-server ratios. A flow with no packets in one direction yields inf / nan here.
ratio_sizes = _ratio(client_package_size_sum, server_package_size_sum)#13
ratio_application_size = _ratio(client_application_size_sum, server_application_size_sum)#14
# Reuse the counts computed above instead of filtering the frame two more times.
ratio_packages = _ratio(client_package_count, server_package_count)#15

In [None]:
def _stat_or_nan(values, stat):
    """Apply `stat` to the array, or return nan for an empty array (avoids NumPy's empty-slice warning)."""
    return stat(values) if values.size else np.nan


# Batch computation. A batch is a run of consecutive payload-carrying packets sent in one direction.
# batch_conf = (<number of payload-carrying packets>, <summed frame length of the batch>)
client_batches = []
server_batches = []

# Packets without application payload neither extend nor split a batch.
payload_packets = flow[flow['application_size'] != 0]

# Direction of the batch being accumulated. It stays None until the first payload packet, so a
# leading payload-free packet cannot create an empty (0, 0) batch.
isClientSender = None
current_batch_size = 0
current_useful_package_count = 0
for is_from_client, frame_len in zip(payload_packets['isFromClient'], payload_packets['frame.len']):
    if isClientSender is not None and is_from_client != isClientSender:
        # Direction changed: store the finished batch and reset the counters.
        batch_conf = (current_useful_package_count, current_batch_size)
        (client_batches if isClientSender else server_batches).append(batch_conf)
        current_batch_size = 0
        current_useful_package_count = 0
    isClientSender = is_from_client
    current_batch_size += frame_len
    current_useful_package_count += 1

# Store the last open batch; nothing is stored when the flow had no payload packets at all.
if current_useful_package_count:
    batch_conf = (current_useful_package_count, current_batch_size)
    (client_batches if isClientSender else server_batches).append(batch_conf)

# Convert to numpy arrays for the statistics below.
client_batches_countes = np.array([count for count, _ in client_batches])
client_batches_sizes = np.array([size for _, size in client_batches])
server_batches_countes = np.array([count for count, _ in server_batches])
server_batches_sizes = np.array([size for _, size in server_batches])

# Batch metrics of the client (the trailing #N are the metric numbers used by the author).
# np.std uses ddof=0, whereas pandas Series.std() defaults to ddof=1.
client_batch_sizes_mean = _stat_or_nan(client_batches_sizes, np.mean)#5
client_batch_sizes_std = _stat_or_nan(client_batches_sizes, np.std)#6
client_batch_counts_mean = _stat_or_nan(client_batches_countes, np.mean)#9
client_batch_counts_sum = len(client_batches_countes)#19  (this is the number of client batches)

# Batch metrics of the server
server_batch_sizes_mean = _stat_or_nan(server_batches_sizes, np.mean)#7
server_batch_sizes_std = _stat_or_nan(server_batches_sizes, np.std)#8
server_batch_counts_mean = _stat_or_nan(server_batches_countes, np.mean)#10
server_batch_counts_sum = len(server_batches_countes)#23  (this is the number of server batches)

In [359]:
# Sanity check: total packets and flows over all intervals, and the average packets per flow.
# Every element of intervalledFlows is a tuple (<interval start>, {<endpoints>: <packets>}).
a = 0  # total number of packets
b = 0  # total number of flows
for intervalStart, flowMap in intervalledFlows:
    for flowPackets in flowMap.values():
        b += 1
        a += flowPackets.shape[0]
# Guard the ratio so that an empty capture prints nan instead of raising ZeroDivisionError.
print("packs =", a, " flows =", b, "  otnoshenie=", a/b if b else float('nan'))

packs = 243938  flows = 6128   otnoshenie= 39.807114882506525


In [2]:
# Inspect the first flattened entry: a tuple (<interval start>, <endpoints>, <packets>).
allFlowsList[0]

NameError: name 'allFlowsList' is not defined

In [361]:
# Packets of one flow in the first interval. Each element of intervalledFlows is a tuple
# (<interval start>, {<endpoints>: <packets>}), so the flow map is element [1].
intervalledFlows[0][1][frozenset({'192.168.1.64:54759', '2.22.238.56:443'})]

Unnamed: 0,frame.time_epoch,ip.proto,frame.len,ip.version,ip.hdr_len,isFromClient,src,dest
2,2020-05-14 11:18:49.548606157,6,66,4,20,True,192.168.1.64:54759,2.22.238.56:443
3,2020-05-14 11:18:49.548671007,6,66,4,20,True,192.168.1.64:54759,2.22.238.56:443
4,2020-05-14 11:18:49.548846960,6,105,4,20,True,192.168.1.64:54759,2.22.238.56:443
5,2020-05-14 11:18:49.552702188,6,90,4,20,True,192.168.1.64:54759,2.22.238.56:443
6,2020-05-14 11:18:49.558002949,6,66,4,20,True,192.168.1.64:54759,2.22.238.56:443
0,2020-05-14 11:18:49.548538923,6,90,4,20,False,2.22.238.56:443,192.168.1.64:54759
1,2020-05-14 11:18:49.548543930,6,66,4,20,False,2.22.238.56:443,192.168.1.64:54759
7,2020-05-14 11:18:49.570630074,6,54,4,20,False,2.22.238.56:443,192.168.1.64:54759
8,2020-05-14 11:18:49.575381041,6,54,4,20,False,2.22.238.56:443,192.168.1.64:54759
9,2020-05-14 11:18:49.580176115,6,54,4,20,False,2.22.238.56:443,192.168.1.64:54759


In [1]:
# List the variables currently defined in the interactive namespace (IPython `whos` magic).
whos

Interactive namespace is empty.
