-
Notifications
You must be signed in to change notification settings - Fork 4
/
tcpsessions_from_pcap.py
614 lines (578 loc) · 32.4 KB
/
tcpsessions_from_pcap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
import socket
import struct
import subprocess
import timeit
import glob
import json
import os
import argparse
import logging
import shutil
import tarfile
from tcpsession.tcpsession import TCPSessions, NetworkTuple
from datetime import datetime
from hashlib import md5, sha256
# Record layout and timestamp format used for every log record written by this script.
LOG_FORMAT_STRING = '%(asctime)s %(levelname)-8s %(filename)s %(lineno)d %(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# MD5 digest of zero bytes of input; substituted as the tcpflow digest when tcpflow
# produced no file for one direction of a session.
EMPTY_FILE_MD5_SUM = "d41d8cd98f00b204e9800998ecf8427e"
# Names of the sub-directories that the functions below create under the output directory.
TMP_PCAP_EXTR_DIR = "pcap_extraction"
TMP_TCPSESSION_EXTR_DIR = "tcpsession"
TMP_TCPFLOW_EXTR_DIR = "tcpflow"
TMP_JS_EXTR_DIR = "js_extraction"
TMP_WS_EXTR_DIR = "wireshark"
# Everything is logged to "<this script's base name>.log"; filemode "w" truncates that file on each run.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT_STRING, datefmt=LOG_DATEFMT,
                    filename=os.path.splitext(os.path.basename(__file__))[0] + ".log", filemode="w")
logger = logging.getLogger(__name__)
def str_to_inet(ip: str) -> bytes:
    """
    Pack a textual IP address into its binary (network byte order) form.

    The text is parsed as IPv4 first; when that is rejected it is parsed as IPv6 instead.
    :param ip: textual address, e.g. "123.45.67.89"
    :return: packed address bytes, e.g. b'{-CY' for "123.45.67.89"
    """
    try:
        packed = socket.inet_pton(socket.AF_INET, ip)
    except OSError:
        # Not acceptable as IPv4; let the IPv6 parser decide (its error propagates)
        packed = socket.inet_pton(socket.AF_INET6, ip)
    return packed
def inet_to_str(inet) -> str:
    """Render a packed network address as printable text.

    Args:
        inet: packed address, as accepted by socket.inet_ntop
    Returns:
        str: the IPv4 text form when the value is accepted as IPv4, otherwise its IPv6 text form
    """
    # IPv4 gets the first attempt; a ValueError from it hands the value over to IPv6
    try:
        text = socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        text = socket.inet_ntop(socket.AF_INET6, inet)
    return text
def extract_data_with_tcpsessions(pcap, out_dir) ->TCPSessions:
    """Run the TCPSessions extractor over a pcap and dump every session it found.

    :param pcap: input pcap to extract sessions from
    :param out_dir: directory handed to TCPSessions.dump_all_sessions() for its output
    :return: the TCPSessions object built for this pcap, so callers can inspect the sessions further
    """
    began_at = datetime.now()
    extractor = TCPSessions(pcap)
    extractor.process_pcap()
    extractor.dump_all_sessions(out_dir)
    # The measured span covers construction, processing and dumping
    elapsed = datetime.now() - began_at
    logger.info("Total time taken to process pcap {} by TCPSessions is {}".format(pcap, elapsed))
    return extractor
def extract_data_with_tcpflow(pcap, out_dir):
    """Extracts the TCP sessions using the external tcpflow command.

    :param pcap: pcap to extract sessions from
    :param out_dir: output directory handed to tcpflow's -o option
    :return: exit status of the tcpflow command; 0 means success
    """
    import shlex  # local import: nothing else in the block needs it

    # The command is run through the shell, so quote both paths; otherwise a name containing
    # spaces or shell metacharacters would be split or interpreted by the shell.
    cmd = "tcpflow -r {} -o {}".format(shlex.quote(str(pcap)), shlex.quote(str(out_dir)))
    logger.info("tcpflow command going to be used is {}".format(cmd))
    start_time = datetime.now()
    output = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    logger.info("Total time taken to process the pcap {} by tcpflow is {}".format(pcap, datetime.now() - start_time))
    # Compare by value: `is not 0` tests object identity and only works by accident of int caching
    if output.returncode != 0:
        logger.info("tcpflow command failed with return code: {}".format(output.returncode))
    return output.returncode
def inet_to_tcpflow_repr(net_tuple: NetworkTuple):
    """Build the zero-padded "sip.sport-dip.dport" name used to locate tcpflow's output file for a flow.

    Each of the four dot-separated address parts is left-padded with zeros to 3 characters and each
    port to 5. Only dotted (IPv4-style) address text is handled: the address is split on ".".
    :param net_tuple: network tuple supplying get_str_sip(), get_str_dip(), sp and dp
    :return: the formatted name, e.g. "010.000.000.001.01234-010.000.000.002.00080"
    """
    src = net_tuple.get_str_sip().split(".")
    dst = net_tuple.get_str_dip().split(".")
    template = "{:0>3}.{:0>3}.{:0>3}.{:0>3}.{:0>5}-{:0>3}.{:0>3}.{:0>3}.{:0>3}.{:0>5}"
    fields = (
        src[0], src[1], src[2], src[3], net_tuple.sp,
        dst[0], dst[1], dst[2], dst[3], net_tuple.dp,
    )
    return template.format(*fields)
def file_hash(file_path, hash_algo="sha256"):
    """Return the hex digest of a file's contents, read in 64 KiB chunks.

    :param file_path: path of the file to hash
    :param hash_algo: "sha256" selects SHA-256; any other value selects MD5
    :return: hexadecimal digest string
    """
    chunk_size = 65536
    hasher = sha256() if hash_algo == "sha256" else md5()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() returns the empty bytes object (EOF)
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def _tcpflow_side_matches(side, tcpflow_file, tcpsession_json_obj, pcap, tcpsessions, net_tuple, session_index):
    """Compare one direction ("src" or "dst") of a session between tcpflow and TCPSessions by MD5.

    :param side: "src" or "dst"; selects the JSON key "combined_<side>_payload_md5sum" and the log wording
    :param tcpflow_file: path where tcpflow is expected to have written this direction's data
    :param tcpsession_json_obj: parsed JSON that TCPSessions dumped for the session
    :param pcap: pcap name, used only in log messages
    :param tcpsessions: TCPSessions object, consulted for the stream id when the digests differ
    :param net_tuple: network tuple of the session
    :param session_index: zero-based index of the session within its network tuple
    :return: True when both digests are equal, False otherwise
    """
    if os.path.exists(tcpflow_file):
        tcpflow_md5sum = file_hash(tcpflow_file, "md5")
    else:
        # A missing tcpflow file is treated as "no payload in this direction"
        logger.info("tcpflow file {} file doesn't exist.".format(side))
        tcpflow_md5sum = EMPTY_FILE_MD5_SUM
    tcpsession_md5sum = tcpsession_json_obj["combined_{}_payload_md5sum".format(side)]
    session_label = str(net_tuple) + '-' + str(session_index)
    if tcpflow_md5sum == tcpsession_md5sum:
        logger.info("{} md5sum for {} is same for tcpsession and tcpflow".format(side.upper(), session_label))
        return True
    stream_id = tcpsessions.network_tuple_stream_id[net_tuple][session_index]
    logger.info("In the pcap {} {} md5sum for {} is different for tcpsession and tcpflow for stream"
                " id: {}".format(pcap, side.upper(), session_label, stream_id - 1))
    logger.info("tcpsession {0} md5: {1},"
                " tcpflow {0} md5: {2}".format(side, tcpsession_md5sum, tcpflow_md5sum))
    return False


def verify_data_with_tcpflow(pcap, tcpsession_out_dir, tcpflow_out_dir):
    """Extract the sessions of a pcap with TCPSessions and cross-check every session against tcpflow.

    For each session the MD5 of tcpflow's per-direction output file is compared with the
    "combined_src_payload_md5sum" / "combined_dst_payload_md5sum" values in the JSON that
    TCPSessions dumped. Results are only logged; verification is abandoned when tcpflow fails.
    :param pcap: pcap to process
    :param tcpsession_out_dir: directory that receives the TCPSessions JSON dumps
    :param tcpflow_out_dir: directory that receives tcpflow's output files
    :return: None
    """
    tcpsessions = extract_data_with_tcpsessions(pcap, tcpsession_out_dir)
    sessions = tcpsessions.sessions
    logger.info("Going to verify the results against tcpflow")
    # Compare by value: `is not 0` tests object identity and only works by accident of int caching
    if extract_data_with_tcpflow(pcap, tcpflow_out_dir) != 0:
        logger.info("Couldn't verify against tcpflow because command execution failed")
        return
    diff_src_session_data_count = 0
    diff_dst_session_data_count = 0
    for net_tuple in sessions.keys():
        rev_net_tuple = NetworkTuple(net_tuple.dip, net_tuple.sip, net_tuple.dp, net_tuple.sp, net_tuple.proto)
        logger.info("reverse network tuple: {}".format(rev_net_tuple))
        for session_id in sessions[net_tuple].sessions.keys():
            # Session ids are used as 1-based here; files and lookups use the 0-based index
            session_index = session_id - 1
            _net_tuple = sessions[net_tuple].get_session_network_tuple(session_index)
            tcpsession_json = os.path.join(tcpsession_out_dir,
                                           repr(_net_tuple) + '-' + str(session_index) + ".json")
            logger.info("tcpsession json output file: {}".format(tcpsession_json))
            with open(tcpsession_json) as tcpsession_json_fp:
                tcpsession_json_obj = json.load(tcpsession_json_fp)
            # The first session of a tuple uses the bare flow name; later ones get a "c<index>" suffix
            suffix = "" if session_id == 1 else "c" + str(session_index)
            tcpflow_src_file = os.path.join(tcpflow_out_dir, inet_to_tcpflow_repr(net_tuple) + suffix)
            tcpflow_dst_file = os.path.join(tcpflow_out_dir, inet_to_tcpflow_repr(rev_net_tuple) + suffix)
            logger.info("tcpflow src output file: {}".format(tcpflow_src_file))
            logger.info("tcpflow dst output file: {}".format(tcpflow_dst_file))
            if not _tcpflow_side_matches("src", tcpflow_src_file, tcpsession_json_obj, pcap, tcpsessions,
                                         net_tuple, session_index):
                diff_src_session_data_count += 1
            if not _tcpflow_side_matches("dst", tcpflow_dst_file, tcpsession_json_obj, pcap, tcpsessions,
                                         net_tuple, session_index):
                diff_dst_session_data_count += 1
    logger.info("Number of sessions whose src/dst data was different from TCP flow"
                " is src count: {}, dst count: {}".format(diff_src_session_data_count, diff_dst_session_data_count))
def run_command(cmd: str):
    """Executes a command line through the shell and reports its exit status.

    The command's stdout and stderr are captured and discarded.
    :param cmd: complete command line; it is interpreted by the shell, so pipes, loops and redirections work
    :return: return the 0 on success else the error code; same as subprocess.run() return code.
    """
    output = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    # Compare by value: `is not 0` tests object identity and only works by accident of int caching
    if output.returncode != 0:
        logger.info("command failed with return code {}".format(output.returncode))
    return output.returncode
def create_tar(_input: str, output_tar_name: str):
    """Creates a gzip-compressed tar of the given input.

    For a directory, every entry directly inside it is added (sub-directories recursively) under
    its own name, so the archive holds the directory's contents rather than the directory itself.
    For a single file, the file is added under the path it was given as.
    :param _input: input file or directory
    :param output_tar_name: name of output tar
    :return: None
    :raises FileNotFoundError: if _input is neither an existing file nor an existing directory
    """
    if os.path.isdir(_input):
        # Resolve each entry against _input instead of the current working directory, while
        # keeping the bare entry name as the name stored in the archive.
        members = [(os.path.join(_input, name), name) for name in os.listdir(_input)]
    elif os.path.isfile(_input):
        members = [(_input, None)]  # arcname=None keeps tarfile's default member name
    else:
        # Fail before an empty archive is created
        raise FileNotFoundError("input to archive is neither a file nor a directory: {}".format(_input))
    with tarfile.open(output_tar_name, "w:gz") as tar_fp:
        for source_path, archive_name in members:
            tar_fp.add(source_path, arcname=archive_name)
def verify_data_with_wireshark(pcap, output_dir, performance_mode=False):
    """Verifies the correctness of sessions extracted from a pcap with the TCPSessions class against the Wireshark.
    Beware, takes long time to execute because it spawns one process for each stream.
    :param pcap: pcap to extract sessions from
    :param output_dir: directory under which the TMP_TCPSESSION_EXTR_DIR and TMP_WS_EXTR_DIR sub-directories are
     recreated (any existing content of those two sub-directories is deleted first)
    :param performance_mode: when True no digest is accumulated or compared; the tshark result of every stream is
     dumped to JSON unconditionally
    :return: None
    """
    tcpsession_out_dir = os.path.join(output_dir, TMP_TCPSESSION_EXTR_DIR)
    ws_out_dir = os.path.join(output_dir, TMP_WS_EXTR_DIR)
    if performance_mode:
        logger.info("going to extract sessions with tshark")
    else:
        logger.info("Going to verify results against Wireshark/tshark")
    # Start from empty output directories on every call
    if os.path.exists(tcpsession_out_dir):
        shutil.rmtree(tcpsession_out_dir)
    os.makedirs(tcpsession_out_dir)
    if os.path.exists(ws_out_dir):
        shutil.rmtree(ws_out_dir)
    os.makedirs(ws_out_dir)
    net_tuples = extract_data_with_tcpsessions(pcap, tcpsession_out_dir).sessions.keys()
    # Timing below covers only the tshark part, not the TCPSessions extraction above
    start_time = datetime.now()
    logger.info("Extraction with TCPSession is done. Going to verify the results against Wireshark")
    for net_tuple in net_tuples:
        # Display filter matching both directions of this network tuple
        tuple_filter = '(ip.src == {} && tcp.srcport == {} && ip.dst == {} && tcp.dstport == {})'
        client_filter = tuple_filter.format(net_tuple.get_str_sip(), net_tuple.sp, net_tuple.get_str_dip(),
                                            net_tuple.dp)
        server_filter = tuple_filter.format(net_tuple.get_str_dip(), net_tuple.dp, net_tuple.get_str_sip(),
                                            net_tuple.sp)
        # Lists the distinct tcp.stream ids of the packets matching the tuple, numerically sorted
        # NOTE(review): pcap is interpolated unquoted into a shell command line - paths with spaces or shell
        # metacharacters will not survive; confirm whether inputs are trusted.
        extract_streams_cmd = 'tshark -r {} -Y "{} || {}" -T fields -e tcp.stream | sort -n -u'.format(
            pcap, client_filter, server_filter)
        logger.info("Command to extract session ids: {}".format(extract_streams_cmd))
        ws_out_file = "{}/{}".format(ws_out_dir,net_tuple)

        def tshark_session_output_verification(cmd: str, sip: str, dip: str, sport: int, dport: int, out_file: str,
                                               stream: int, count: int):
            """Parses the output of tshark command used to extract a specific session id and compares it against the
            TCPSessions's output. If there is an error it dumps the tshark extracted data in JSON file whose schema is
            defined in data/output_schema.json. Verification is done by calculation the MD5 of the data extracted from
            both the techniques, extracted data is in order it was sent from client and server, and comparing the MD5s.
            :param cmd: tshark command to extract the data a session id data
            :param sip: source IP
            :param dip: destination IP
            :param sport: source port
            :param dport: destination port
            :param out_file: output file to write the tshark/Wireshark results if extracted session data doesn't match
            :param stream: stream id for current network tuple
            :param count: count to differentiate the name of output files if there multiple sessions of a network tuple
            :return: None
            """
            with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                  shell=True) as proc:
                # NOTE(review): Popen.returncode stays None until the process has been polled or waited on, so
                # directly after Popen() this guard looks like it can never fire - confirm intent.
                if proc.returncode is not None:
                    logger.info("tshark command failed with return code {}".format(proc.returncode))
                    return
                pcap_data = False  # becomes True once the "Node 1:" header line has been seen
                tcp_payload_hex = list()  # (sender ip, hex payload text) pairs in output order
                session_digester = md5()
                while True:
                    line = proc.stdout.readline()
                    if not line:
                        break
                    if not pcap_data:
                        # Skip everything up to and including the "Node 1:" header line
                        if line[:7] == b"Node 1:":
                            pcap_data = True
                    else:
                        # A line starting with "===" ends the payload section
                        if line[:3] == b'===':
                            break
                        # Lines starting with a tab (0x09) are attributed to dip, all others to sip; the
                        # leading tab and the trailing newline byte are dropped.
                        # presumably tshark indents the second node's data with a tab - verify against tshark docs
                        if line[0] == 0x09:
                            line = line[1:-1]
                            pkt_src = dip
                        else:
                            pkt_src = sip
                            line = line[:-1]
                        tcp_payload_hex.append((pkt_src, line.decode("utf-8", "backslashreplace")))
                        if not performance_mode:
                            session_digester.update(line)
                # In performance mode nothing was fed to the digester, so this is the digest of empty input
                hex_session_digest = session_digester.hexdigest()

                def dump_output(out_file, count):
                    # Writes the tshark view of the session to "<out_file>-<count>.json"
                    out_file = "{}-{}.json".format(out_file, count)
                    logger.info("Dumping the Wireshark result in file: {}".format(out_file))
                    with open(out_file, "w") as json_fp:
                        output_dict = dict()
                        output_dict["sip"] = sip
                        output_dict["dip"] = dip
                        output_dict["sport"] = sport
                        output_dict["dport"] = dport
                        output_dict["proto"] = 6
                        output_dict["tcp_payload_hex"] = tcp_payload_hex
                        output_dict["tcp_ordered_hex_payload_md5sum"] = hex_session_digest
                        json.dump(output_dict, json_fp, indent=1)
                if performance_mode:
                    dump_output(out_file, count)
                else:
                    # Compare against the JSON that TCPSessions dumped for the same tuple and count
                    tcpsession_json = os.path.join(tcpsession_out_dir,
                                                   repr(net_tuple) + '-' + str(count) + ".json")
                    with open(tcpsession_json) as tcpsession_json_fp:
                        tcpsession_json_obj = json.load(tcpsession_json_fp)
                    if tcpsession_json_obj["tcp_ordered_hex_payload_md5sum"] == hex_session_digest:
                        logger.info("For pcap {} correct checksum for network tuple: "
                                    "{}, count: {}, and stream: {}".format(pcap, net_tuple, count, stream))
                    else:
                        logger.info("For pcap {} wrong checksum for network tuple: "
                                    "{}, count: {}, and stream: {}".format(pcap, net_tuple, count, stream))
                        # Keep the tshark data only for mismatches so they can be inspected
                        dump_output(out_file, count)

        def extract_session_ids():
            """Extracts the distinct session id in a pcap using tshark command and call the
            tshark_session_output_verification() function for each session id for verification.
            :return: None
            """
            with subprocess.Popen(extract_streams_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                  shell=True) as proc:
                streams = set()
                while True:
                    line = proc.stdout.readline()
                    if not line:
                        break
                    # NOTE(review): stderr is merged into stdout above, so any non-numeric line (e.g. a tshark
                    # warning) or a blank line would make int() raise ValueError here.
                    streams.add(int(line))
                streams = list(streams)
                streams = sorted(streams)
                count = 0
                while True:
                    if len(streams) == 0:
                        break
                    # NOTE(review): pop() takes from the end of the ascending list, so the highest stream id is
                    # paired with count 0 - confirm this matches the numbering of the TCPSessions JSON files.
                    stream = streams.pop()
                    extract_session_data_cmd = 'tshark -r {} -q -z "follow,tcp,raw,{}"'.format(
                        pcap, stream)
                    logger.info("Command to extract session# {} data: {}".format(stream, extract_session_data_cmd))
                    tshark_session_output_verification(extract_session_data_cmd, net_tuple.get_str_sip(),
                                                       net_tuple.get_str_dip(), net_tuple.sp, net_tuple.dp,
                                                       ws_out_file, stream, count)
                    count += 1
        extract_session_ids()

        def performance_analysis():
            """Performance analysis of different ways of extraction sessions in pcap"""
            # NOTE(review): this helper is not called anywhere in the code shown here. It also passes locals()
            # as the timeit globals; run_command and extract_session_ids are only named inside the statement
            # strings, so they appear not to be part of locals() and the timed statements would raise NameError.
            num_op = 5
            session_data_cmd = 'for stream in `{}`; do tshark -r {} -q -Y "{} || {}" -z "follow,tcp,raw,$stream" >' \
                               ' {}-$stream.data;done'.format(extract_streams_cmd, pcap, client_filter, server_filter,
                                                              ws_out_file)
            stmt = "{}({})".format("run_command", "session_data_cmd")
            tot = timeit.timeit(stmt=stmt,globals=locals(),number=num_op)
            print("total time take for first way: {}, per command: {}".format(tot, tot/num_op))
            session_data_cmd = 'for stream in `{}`; do tshark -r {} -q -z "follow,tcp,raw,$stream" >' \
                               ' {}-$stream.data;done'.format(extract_streams_cmd, pcap,
                                                              ws_out_file + "type-3")
            stmt = "{}({})".format("run_command", "session_data_cmd")
            tot = timeit.timeit(stmt=stmt,globals=locals(),number=num_op)
            print("total time take for second way: {}, per command: {}".format(tot, tot/num_op))
            print(session_data_cmd)
            stmt = "{}()".format("extract_session_ids")
            tot = timeit.timeit(stmt=stmt,globals=locals(),number=num_op)
            print("total time take for current used way: {}, per command: {}".format(tot, tot/num_op))
    logger.info("Total time taken for Wireshark verification is: {}".format(datetime.now() - start_time))
def extract_js_with_bash_cmd():
    """Count, with grep, the files under the current directory that look like HTTP and contain "<script".

    Runs a shell pipeline that lists files containing an HTTP request line, keeps those that also
    contain "<script", and counts them with wc -l. The raw command output and the elapsed time are
    written to the log.
    :return: None
    """
    start_time = datetime.now()
    cmd = 'for var in `grep -r -l -E "(GET|HEAD|POST|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH) .+ HTTP/" .`; do' \
          ' grep -l "<script" $var ;done | wc -l'
    logger.info("Bash JS extraction command: " + cmd)
    output = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    # Compare by value: `is not 0` tests object identity and only works by accident of int caching
    if output.returncode != 0:
        logger.info("command failed with return code {}".format(output.returncode))
    logger.info(output.stdout)
    logger.info("total time take with grep : {}".format(datetime.now() - start_time))
def extract_js_with_python(input_dir, output_dir):
    """Extract the bodies of inline <script> elements from the session JSON files under a directory.

    Every ".json" file below input_dir is loaded and its "combined_src_payload" and
    "combined_dst_payload" values are searched. For a session with at least one match, all matches
    are written, one per line, to "<json base name>.js" in output_dir.
    :param input_dir: directory tree holding the session JSON files
    :param output_dir: existing directory that receives the .js files
    :return: number of session files in which at least one script was found
    """
    import re
    start_time = datetime.now()
    # A payload counts as HTTP when it carries an HTML content type header
    http_pattern = re.compile(r'Content-Type: text/html')
    # Text following "<script>" up to the closing tag, or to the end of the payload if it is never closed
    pattern = re.compile(r"(?<=<script>).*?(?=</script>|$)", re.DOTALL)
    file_count = 0
    non_http_file = 0
    without_script = 0
    with_script = 0
    for path, _dir, files in os.walk(input_dir):
        for _file in files:
            _file = os.path.join(path, _file)
            if os.path.splitext(_file)[1] != ".json":
                continue
            with open(_file) as fp:
                dict_object = json.load(fp)
            http_file = False
            scripts = []
            for value in (dict_object["combined_src_payload"], dict_object["combined_dst_payload"]):
                if http_pattern.search(value):
                    http_file = True
                scripts.extend(pattern.findall(value))
            if scripts:
                js_file_name = os.path.splitext(os.path.basename(_file))[0] + ".js"
                # `with` guarantees the output is flushed and closed; the handle used to be left open
                with open(os.path.join(output_dir, js_file_name), "w") as js_fp:
                    js_fp.writelines(script + "\n" for script in scripts)
            file_count += 1
            if not http_file:
                non_http_file += 1
            if scripts:
                with_script += 1
            else:
                without_script += 1
    logger.info("total file: {}, non http file: {}, without script: {}, "
                "with script: {}".format(file_count, non_http_file, without_script, with_script))
    logger.info("time taken by python in extracting JS is : {}".format(datetime.now() - start_time))
    return with_script
def extract_tcpsessions_from_pcaps(_input, output_dir, tar_output_prefix=None, extract_js=False, _create_tar=False,
                                   recursive=False):
    """Extract the TCP sessions of one pcap, or of every ".pcap" file in a directory.

    For each pcap the sessions are dumped as JSON and checked against tcpflow. Optionally the inline
    JS found in the sessions is extracted, and per-pcap plus final tar archives are created. A pcap
    whose per-pcap archive already exists is skipped. The working directory is changed while the
    function runs and is always restored before it returns or raises.
    :param _input: a pcap file, or a directory containing pcaps
    :param output_dir: directory under which all output sub-directories and archives are created
    :param tar_output_prefix: prefix for the names of the final "-JS.tar.gz" / "-session-JSON.tar.gz" archives
    :param extract_js: also extract inline JS from the dumped sessions
    :param _create_tar: create the per-pcap archives and the final archives
    :param recursive: when _input is a directory, also pick up pcaps in its sub-directories
    :return: None
    """
    pwd = os.getcwd()
    pcap_list = list()
    start = datetime.now()
    # Resolve before any chdir so a relative output_dir keeps pointing at the same place
    output_dir = os.path.abspath(output_dir)
    try:
        if not os.path.isdir(_input):
            input_dir = os.path.dirname(os.path.abspath(_input))
            pcap_list = [os.path.basename(_input)]
        else:
            input_dir = os.path.abspath(_input)
            os.chdir(input_dir)
            if recursive:
                for path, dirs, files in os.walk("."):
                    for _file in files:
                        if os.path.splitext(_file)[1] == ".pcap":
                            pcap_list.append(os.path.join(path, _file))
            else:
                for _file in os.listdir(input_dir):
                    if os.path.isfile(_file) and os.path.splitext(_file)[1] == ".pcap":
                        pcap_list.append(_file)
        js_file_count = 0
        js_tars = list()
        json_tars = list()
        for pcap in pcap_list:
            # pcap paths are relative to input_dir
            os.chdir(input_dir)
            # Drop only the leading "./" that os.walk(".") adds. str.strip("./") removes every leading and
            # trailing "." and "/" character and therefore mangled names such as ".hidden" or "run.".
            dir_name = os.path.dirname(pcap)
            if dir_name == ".":
                dir_name = ""
            elif dir_name.startswith("./"):
                dir_name = dir_name[2:]
            _file_without_ext = os.path.splitext(os.path.basename(pcap))[0]
            json_tar_output_name = dir_name.replace("/", "_") + "_" + _file_without_ext + "-session-JSON.tar"
            tcpsession_out_dir = os.path.join(output_dir, TMP_TCPSESSION_EXTR_DIR, dir_name, _file_without_ext)
            json_tar_output_file_path = os.path.join(output_dir, TMP_TCPSESSION_EXTR_DIR, dir_name,
                                                     json_tar_output_name)
            json_tars.append(json_tar_output_file_path)
            if os.path.exists(json_tar_output_file_path):
                logger.info("JSON tar {} already exists, skipping pcap {}".format(json_tar_output_file_path, pcap))
            else:
                if os.path.exists(tcpsession_out_dir):
                    shutil.rmtree(tcpsession_out_dir)
                os.makedirs(tcpsession_out_dir)
                tcpflow_out_dir = os.path.join(output_dir, TMP_TCPFLOW_EXTR_DIR, dir_name)
                if os.path.exists(tcpflow_out_dir):
                    shutil.rmtree(tcpflow_out_dir)
                os.makedirs(tcpflow_out_dir)
                logger.info("Going to work on pcap: {}".format(pcap))
                verify_data_with_tcpflow(pcap, tcpsession_out_dir, tcpflow_out_dir)
                logger.info("Done with pcap: {}".format(pcap))
                if _create_tar:
                    # Archive from inside the directory so members are stored by bare file name
                    os.chdir(tcpsession_out_dir)
                    create_tar(os.curdir, json_tar_output_file_path)
                    os.chdir(input_dir)
            if extract_js:
                # No "_" separator here, unlike the JSON archive name; kept as-is because the
                # skip-if-exists check below depends on the established names.
                js_tar_output_name = dir_name.replace("/", "_") + _file_without_ext + "-JS.tar"
                js_output_dir = os.path.join(output_dir, TMP_JS_EXTR_DIR, dir_name, _file_without_ext)
                js_tar_output_file_path = os.path.join(output_dir, TMP_JS_EXTR_DIR, dir_name, js_tar_output_name)
                js_tars.append(js_tar_output_file_path)
                if os.path.exists(js_tar_output_file_path):
                    logger.info("JS tar {} already exits, skipping pcap {}".format(js_tar_output_file_path, pcap))
                else:
                    logger.info("Going to extract JS files")
                    if not os.path.exists(js_output_dir):
                        os.makedirs(js_output_dir)
                    else:
                        shutil.rmtree(js_output_dir)
                        os.mkdir(js_output_dir)
                    js_file_count += extract_js_with_python(tcpsession_out_dir, js_output_dir)
                    logger.info("Total script files found so far {}".format(js_file_count))
                    if _create_tar:
                        os.chdir(js_output_dir)
                        create_tar(os.curdir, js_tar_output_file_path)
                        os.chdir(input_dir)
        logger.info("JSON of sessions are stored in {}".format(os.path.join(output_dir, TMP_TCPSESSION_EXTR_DIR)))
        if extract_js:
            logger.info("Extracted JS from the sessions are store in {}".format(os.path.join(output_dir,
                                                                                             TMP_JS_EXTR_DIR)))
        os.chdir(output_dir)
        if _create_tar:
            # Bundle the per-pcap archives into one final archive per kind
            if extract_js:
                final_JS_tar_output_file = os.path.join(output_dir, "{}-JS.tar.gz".format(tar_output_prefix))
                logger.info("Going to create the final JS tar {}".format(final_JS_tar_output_file))
                logger.info("JS tar going to be added to final tar.gz: {}".format(js_tars))
                with tarfile.open(final_JS_tar_output_file, "w:gz") as final_tar_output_fp:
                    for js_tar in js_tars:
                        final_tar_output_fp.add(js_tar, arcname=os.path.basename(js_tar))
                logger.info("Final JS tar created at {}!".format(final_JS_tar_output_file))
                logger.info("Total file with js script found is: {}".format(js_file_count))
            final_json_tar_output_file = os.path.join(output_dir, "{}-session-JSON.tar.gz".format(tar_output_prefix))
            logger.info("Going to create the final tar of session's JSON data {}".format(final_json_tar_output_file))
            logger.info("JSON tar going to be added to the final tar.gz: {}".format(json_tars))
            with tarfile.open(final_json_tar_output_file, "w:gz") as final_tar_output_fp:
                for json_tar in json_tars:
                    final_tar_output_fp.add(json_tar, arcname=os.path.basename(json_tar))
            logger.info("Final tar of session JSON data created at {}!".format(final_json_tar_output_file))
    finally:
        # Restore the caller's working directory even when a pcap fails part-way through
        os.chdir(pwd)
    logger.info("Total time taken in {} is {}".format(extract_tcpsessions_from_pcaps.__name__, datetime.now() - start))
if __name__ == "__main__":
    # Command line entry point: parse the options, settle input/output locations, then run exactly one of
    # Wireshark verification, performance comparison, or the regular extraction.
    parser = argparse.ArgumentParser(description="Extract JS from pcaps stored in directory using TCPSessions library, "
                                                 "and many other stuff. If a directory of pcaps is provided then a tar"
                                                 " of all the extracted JS could also be created.")
    parser.add_argument("-i", "--input-dir", action="store", type=str,
                        help="Directory to pick input pcaps from, picks only in the directory not recursively.")
    parser.add_argument("-o", "--output-dir", action="store", type=str,
                        help="Directory where all the output will be stored")
    parser.add_argument("-p", "--pcap", action="store", type=str, help="Input pcap.")
    # The prefix is a string value. Declared as store_true it was parsed as a flag, so the
    # final archives ended up being named "True-...".
    parser.add_argument("-t", "--tar-output-prefix", action="store", type=str,
                        help="Prefix for the name of output tar.")
    parser.add_argument("-n", "--no-js-extraction", action="store_true",
                        help="Don't extract JS from from the extracted TCP sessions. "
                             "Final tar output will be of JSON of individual sessions if input"
                             " is directory, else just JSON of individual sessions in input pcap")
    parser.add_argument("-r", "--recursive", action="store_true",
                        help="Input given is directory and try to find pcaps recursively in it")
    parser.add_argument("-c", "--create-tar", action="store_true", help="Create a tar of the final output")
    parser.add_argument("-w", "--wireshark-verification", action="store_true",
                        help="Verify the output of a session extracted with TCPSession against the tshark (Wireshark)."
                             " This is a standalone option to re-verify the results of TCPSession which failed against "
                             "tcpflow command in normal usecases. "
                             "Provide the pcap as an input along with this switch to see the results.")
    parser.add_argument("-k", "--performance-comparison", action="store", type=int,
                        help="does performance comparision between TCPsession and tcpflow or Wireshark")
    args = parser.parse_args()
    extract_js = not args.no_js_extraction
    # Honour -r regardless of how the output directory was resolved; it used to be
    # picked up only when a valid -o directory was supplied.
    recursive = args.recursive

    # Input: an input directory wins over a single pcap when both are given
    if args.input_dir:
        _input = args.input_dir
        if not os.path.isdir(_input):
            logger.info("provide a valid input directory to work on")
            raise SystemExit(0)
    elif args.pcap:
        _input = args.pcap
    else:
        logger.info("See the help for valid arguments")
        raise SystemExit(0)

    # Output: fall back to the current directory when none, or an invalid one, is given
    if args.output_dir:
        output_dir = args.output_dir
        if not os.path.isdir(output_dir):
            logger.info("Provided output path is not a directory, current directory will be used for output")
            output_dir = os.path.abspath(os.curdir)
    else:
        logger.info("Output directory is not provided. Extracted output will be stored in current directory")
        output_dir = os.path.abspath(os.curdir)

    if args.create_tar:
        _create_tar = args.create_tar
        if args.tar_output_prefix:
            tar_output_prefix = args.tar_output_prefix
        elif args.input_dir:
            tar_output_prefix = "extracted-session-data-"
            logger.info("No tar output file name was given.")
            logger.info("Name of the final tar would start with [your-pcap-file-name]-{}".format(tar_output_prefix))
        else:
            tar_output_prefix = "extracted-"
    else:
        _create_tar = False
        tar_output_prefix = None

    if args.wireshark_verification and args.performance_comparison:
        logger.error("Wireshark verification option is not valid with performance comparision")
        raise SystemExit(0)
    if args.input_dir or args.recursive or args.create_tar or args.no_js_extraction or args.tar_output_prefix:
        if args.wireshark_verification:
            logger.info("Ignoring arguments irrelevant to wireshark verification")
        elif args.performance_comparison:
            logger.info("Ignoring arguments irrelevant to performance comparision")

    if args.wireshark_verification and args.pcap and output_dir:
        verify_data_with_wireshark(_input, output_dir)
        raise SystemExit(0)

    if args.performance_comparison:
        iter_count = args.performance_comparison
        logger.info("Performance comparision using the pcap {} for {} iterations".format(_input, iter_count))
        # Time callables instead of statements assembled from text: pasting the paths into
        # source code broke on quotes or backslashes in a path.
        func_name = "verify_data_with_wireshark"
        ws_total_time = timeit.timeit(lambda: verify_data_with_wireshark(_input, output_dir, True),
                                      number=iter_count)
        logger.info("Total time taken by {} for {} iterations is {}, per iteration"
                    " time is {}".format(func_name, iter_count, ws_total_time, ws_total_time/iter_count))
        func_name = "verify_data_with_tcpflow"
        perf_tcpsession_dir = os.path.join(output_dir, "tcpsession")
        perf_tcpflow_dir = os.path.join(output_dir, "tcpflow")
        ts_total_time = timeit.timeit(lambda: verify_data_with_tcpflow(_input, perf_tcpsession_dir, perf_tcpflow_dir),
                                      number=iter_count)
        logger.info("Total time taken by {} for {} iterations is {}, per iteration"
                    " time is {}".format(func_name, iter_count, ts_total_time, ts_total_time/iter_count))
        logger.info("Time taken by {} is {} times of {}.".format("verify_data_with_wireshark",
                                                                 ws_total_time/ts_total_time, func_name))
        raise SystemExit(0)

    extract_tcpsessions_from_pcaps(_input, output_dir, tar_output_prefix, extract_js, _create_tar, recursive)