In [15]:
import subprocess
import sys
import io
import numpy as np
import pandas as pd

# Length, in seconds, of the time interval used to split the packets.
DATE_INTERVAL = 30

# Final parameters used for comparison. The trailing numbers are the same
# markers used in the code, to make each feature easy to locate.
FEATURES = [
    'client_package_size_mean',     # 1
    'client_package_size_std',      # 2
    'server_package_size_mean',     # 3
    'server_package_size_std',      # 4
    'client_batch_sizes_mean',      # 5
    'client_batch_sizes_std',       # 6
    'server_batch_sizes_mean',      # 7
    'server_batch_sizes_std',       # 8
    'client_batch_counts_mean',     # 9
    'server_batch_counts_mean',     # 10
    'client_efficiency',            # 11
    'server_efficiency',            # 12
    'ratio_sizes',                  # 13
    'ratio_application_size',       # 14
    'ratio_packages',               # 15
    'client_package_size_sum',      # 16
    'client_application_size_sum',  # 17
    'client_package_count',         # 18
    'client_batch_counts_sum',      # 19
    'server_package_size_sum',      # 20
    'server_application_size_sum',  # 21
    'server_package_count',         # 22
    'server_batch_counts_sum',      # 23
    'transport_protocol',           # 24
    'ip_protocol_version',          # 25
]

# Capture file that tshark reads.
pcap_file_path = '../my1601.pcapng'

# Packet fields exported by tshark; they become the CSV columns, in this order.
TSHARK_FIELDS = [
    'frame.time_epoch', 'ip.src', 'ip.dst', 'ip.proto', 'frame.len', 'ip.version',
    'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport', 'ip.hdr_len', 'tcp.hdr_len',
]

# Read the capture and export it as CSV. The command is built as an argument
# list (not a string that is split on whitespace) so that a capture path
# containing spaces is passed to tshark as a single argument.
pcapToCVSCom = ['tshark', '-r', pcap_file_path, '-T', 'fields']
for field in TSHARK_FIELDS:
    pcapToCVSCom += ['-e', field]
pcapToCVSCom += [
    '-E', 'separator=,', '-E', 'quote=d', '-E', 'header=y', '-E', 'occurrence=f',
    # Display filter: keep IP protocol 17 or 6 packets and exclude ICMP.
    '-Y', '(ip.proto == 17 || ip.proto == 6) && !icmp',
]

# tshark does not need stdin here, so it is detached instead of piped.
proc = subprocess.run(pcapToCVSCom, stdin=subprocess.DEVNULL,
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = proc.stdout, proc.stderr

# Decide success by the exit status: text on stderr alone may be just a
# warning, and must not discard an otherwise valid export.
if proc.returncode != 0:
    sys.exit(error.decode("utf-8", errors="replace"))  # or "ISO-8859-1"
if error:
    print(error.decode("utf-8", errors="replace"), file=sys.stderr)

# Keep the CSV text in memory for the next cell and save a copy on disk.
data = output.decode("utf-8")
with open("./sel.csv", "w", encoding="utf-8") as f:
    f.write(data)

In [16]:
# Header sizes, in bytes, used when computing the application payload.
# NOTE(review): 14 is presumably the untagged Ethernet II header length -
# confirm that the capture uses this link-layer framing.
ETHERNET_HEADER_LEN = 14
UDP_HEADER_LEN = 8
# Value of ip.proto that identifies UDP packets.
UDP_PROTO = 17

# Read the CSV produced by tshark from the in-memory text. To load a
# previously exported file instead, use e.g.:
# traffic = pd.read_csv('./sel.csv', low_memory=False)
traffic = pd.read_csv(io.StringIO(data), sep=',')
if traffic.empty:
    raise ValueError('The tshark export contains no packets to analyse')

# The client is taken to be the IP address that appears most often
# (as source or destination) in the capture.
clientIP = pd.concat([traffic['ip.src'], traffic['ip.dst']]).value_counts().idxmax()

# Merge the TCP/UDP port columns into one source and one destination port.
# The column is chosen by the packet's IP protocol, so both ports always come
# from the same transport header (the same rule used for the header length).
# NOTE(review): rows where the selected port is missing make astype(int)
# raise - confirm whether such rows can occur and should be dropped.
is_udp = traffic['ip.proto'] == UDP_PROTO
src_port = traffic['udp.srcport'].where(is_udp, traffic['tcp.srcport']).astype(int)
dst_port = traffic['udp.dstport'].where(is_udp, traffic['tcp.dstport']).astype(int)

# Build the endpoints (<IP>:<port>) and mark the traffic direction of each packet.
traffic['isFromClient'] = traffic['ip.src'] == clientIP
traffic['src'] = traffic['ip.src'] + ":" + src_port.astype(str)
traffic['dest'] = traffic['ip.dst'] + ":" + dst_port.astype(str)
traffic = traffic.drop(columns=['ip.src', 'ip.dst',
                                'tcp.srcport', 'udp.srcport', 'tcp.dstport', 'udp.dstport'])

# tshark only exports the TCP header length; fill in the fixed value for UDP.
traffic = traffic.rename(columns={"tcp.hdr_len": "transport_header"})
traffic.loc[is_udp, 'transport_header'] = UDP_HEADER_LEN

# Application payload of the packet: frame length minus the link, IP and
# transport headers.
traffic['application_size'] = (traffic['frame.len'] - ETHERNET_HEADER_LEN
                               - traffic['ip.hdr_len'] - traffic['transport_header'])

# Convert the UNIX timestamps to pandas datetime64 so the packets can be
# grouped into DATE_INTERVAL-second windows.
traffic['frame.time_epoch'] = pd.to_datetime(traffic['frame.time_epoch'], unit='s')

In [29]:
# Split the packets into DATE_INTERVAL-second windows several times, each
# time shifting the window boundaries by a fraction of the interval, and
# collect every (window start, packets) pair.
intervalledPackeges = list()
intarval_offser_count = 4
# Shift between consecutive passes, in seconds.
base_delta = DATE_INTERVAL / intarval_offser_count
window = pd.Timedelta(seconds=DATE_INTERVAL)
for i in range(intarval_offser_count):
    # `offset` replaces the `base` argument, which newer pandas releases no
    # longer accept; a Timedelta frequency avoids version-specific aliases.
    grouped = traffic.groupby(pd.Grouper(freq=window, key='frame.time_epoch',
                                         offset=pd.Timedelta(seconds=base_delta * i)))
    for key, group in grouped:
        # Time bins without packets are yielded as empty frames; skip them.
        if group.empty:
            continue
        print(key)
        intervalledPackeges.append((key, group))
    print('end\n')


2020-04-27 21:08:30
2020-04-27 21:09:00
2020-04-27 21:09:30
2020-04-27 21:10:00
end

2020-04-27 21:08:07.500000
2020-04-27 21:08:37.500000
2020-04-27 21:09:07.500000
2020-04-27 21:09:37.500000
2020-04-27 21:10:07.500000
end

2020-04-27 21:08:15
2020-04-27 21:08:45
2020-04-27 21:09:15
2020-04-27 21:09:45
end

2020-04-27 21:08:22.500000
2020-04-27 21:08:52.500000
2020-04-27 21:09:22.500000
2020-04-27 21:09:52.500000
end



In [13]:
# Preview the first five rows of the prepared packet table.
traffic.head(n=5)

Unnamed: 0,frame.time_epoch,ip.proto,frame.len,ip.version,ip.hdr_len,transport_header,isFromClient,src,dest,application_size
0,2020-04-27 21:08:35.658354998,6,203,4,20,32.0,True,192.168.1.53:64521,173.194.221.132:443,137.0
1,2020-04-27 21:08:35.661669016,17,79,4,20,8.0,True,192.168.1.53:58772,192.168.1.1:53,37.0
2,2020-04-27 21:08:35.679302931,17,95,4,20,8.0,False,192.168.1.1:53,192.168.1.53:58772,53.0
3,2020-04-27 21:08:35.680814028,6,78,4,20,44.0,True,192.168.1.53:64536,64.233.165.84:443,0.0
4,2020-04-27 21:08:35.690493107,6,66,4,20,32.0,False,173.194.221.132:443,192.168.1.53:64521,0.0


In [28]:
# Scratch check: print the indices 0 through 3, one per line.
for i in (0, 1, 2, 3):
    print(i)

0
1
2
3
