Skip to content

Commit

Permalink
Improved compatibility of dsv parsers with Python3 log2timeline#1952 (log2timeline#1966)
Browse files Browse the repository at this point in the history

* Improved compatibility of dsv parsers with Python3 log2timeline#1952
  • Loading branch information
Onager committed Jun 23, 2018
1 parent 528edf0 commit 2d012b7
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 19 deletions.
29 changes: 18 additions & 11 deletions plaso/parsers/dsv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

import abc
import csv
import sys

from dfvfs.helpers import text_file

from plaso.lib import errors
from plaso.lib import line_reader_file
Expand Down Expand Up @@ -87,28 +88,23 @@ def _ConvertRowToUnicode(self, parser_mediator, row):

return row

def _CreateDictReader(self, parser_mediator, line_reader):
def _CreateDictReader(self, line_reader):
"""Returns a reader that processes each row and yields dictionaries.
csv.DictReader does this job well for single-character delimiters; parsers
that need multi-character delimiters need to override this method.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfvfs.
line_reader (iter): yields lines from a file-like object.
Returns:
iter: a reader of dictionaries, as returned by csv.DictReader().
"""
if not self._encoding:
self._encoding = parser_mediator.codepage

delimiter = self.DELIMITER
quotechar = self.QUOTE_CHAR
magic_test_string = self._MAGIC_TEST_STRING
# Python 3 csv module requires arguments to constructor to be of type str.
if sys.version_info[0] >= 3:
if py2to3.PY_3:
delimiter = delimiter.decode(self._encoding)
quotechar = quotechar.decode(self._encoding)
magic_test_string = magic_test_string.decode(self._encoding)
Expand Down Expand Up @@ -147,13 +143,23 @@ def ParseFileObject(self, parser_mediator, file_object, **unused_kwargs):
'[{0:s}] Unable to parse DSV file: {1:s} size of file exceeds '
'maximum supported size').format(self.NAME, display_name))

line_reader = line_reader_file.BinaryLineReader(file_object)
# TODO: Replace this with detection of the file encoding via byte-order
# marks. Also see: https://github.com/log2timeline/plaso/issues/1971
if not self._encoding:
self._encoding = parser_mediator.codepage

# The Python 2 csv module reads bytes and the Python 3 csv module reads
# Unicode strings.
if py2to3.PY_3:
line_reader = text_file.TextFile(file_object, encoding=self._encoding)
else:
line_reader = line_reader_file.BinaryLineReader(file_object)

# If we specifically define a number of lines we should skip, do that here.
for _ in range(0, self.NUMBER_OF_HEADER_LINES):
line_reader.readline()

reader = self._CreateDictReader(parser_mediator, line_reader)
reader = self._CreateDictReader(line_reader)

row_offset = line_reader.tell()
try:
Expand Down Expand Up @@ -182,13 +188,14 @@ def ParseFileObject(self, parser_mediator, file_object, **unused_kwargs):
'[{0:s}] Unable to parse DSV file: {1:s}. Signature '
'mismatch.').format(self.NAME, display_name))

row = self._ConvertRowToUnicode(parser_mediator, row)

if not self.VerifyRow(parser_mediator, row):
display_name = parser_mediator.GetDisplayName()
raise errors.UnableToParseFile((
'[{0:s}] Unable to parse DSV file: {1:s}. Verification '
'failed.').format(self.NAME, display_name))

row = self._ConvertRowToUnicode(parser_mediator, row)
self.ParseRow(parser_mediator, row_offset, row)
row_offset = line_reader.tell()

Expand Down
4 changes: 2 additions & 2 deletions plaso/parsers/mactime.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class MactimeEventData(events.EventData):
filename (str): name of the file.
inode (int): "inode" of the file. Note that inode is an overloaded term
in the context of mactime and used for MFT entry index values as well.
md5_hash (str): MD5 hash of the file content.
md5 (str): MD5 hash of the file content, formatted as a hexadecimal string.
mode_as_string (str): protection mode.
offset (int): number of the corresponding line.
size (int): size of the file content.
Expand Down Expand Up @@ -154,7 +154,7 @@ def VerifyRow(self, unused_parser_mediator, row):
# MD5|name|inode|mode_as_string|UID|GID|size|atime|mtime|ctime|crtime
# 0|/lost+found|11|d/drwx------|0|0|12288|1337961350|1337961350|1337961350|0

if row['md5'] != b'0' and not self._MD5_RE.match(row['md5']):
if row['md5'] != '0' and not self._MD5_RE.match(row['md5']):
return False

# Check if the following columns contain a base 10 integer value if set.
Expand Down
5 changes: 4 additions & 1 deletion plaso/parsers/mcafeeav.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from __future__ import unicode_literals

import codecs

from plaso.containers import events
from plaso.containers import time_events
from plaso.lib import errors
Expand Down Expand Up @@ -137,7 +139,8 @@ def VerifyRow(self, parser_mediator, row):
# This file can have a UTF-8 byte-order-marker at the beginning of
# the first row.
# TODO: Find out all the code pages this can have. Asked McAfee 10/31.
if row['date'][0:3] == b'\xef\xbb\xbf':
row_bytes = codecs.encode(row['date'], parser_mediator.codepage)
if row_bytes.startswith(b'\xef\xbb\xbf'):
row['date'] = row['date'][3:]
self._encoding = 'utf-8'

Expand Down
4 changes: 1 addition & 3 deletions plaso/parsers/trendmicroav.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,10 @@ def __init__(self, *args, **kwargs):
kwargs.setdefault('encoding', 'cp1252')
super(TrendMicroBaseParser, self).__init__(*args, **kwargs)

def _CreateDictReader(self, parser_mediator, line_reader):
def _CreateDictReader(self, line_reader):
"""Iterates over the log lines and provide a reader for the values.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfvfs.
line_reader (iter): yields each line in the log file.
Yields:
Expand Down
4 changes: 2 additions & 2 deletions tests/parsers/dsv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


class TestDSVParser(dsv_parser.DSVParser):
"""Delimiter seperated values (DSV) parser parser for testing.
"""Delimiter separated values (DSV) parser for testing.
Attributes:
row_offsets (list[int]): offsets of the rows extracted by the DSV parser.
Expand Down Expand Up @@ -56,7 +56,7 @@ def VerifyRow(self, unused_parser_mediator, unused_row):


class DSVParserTest(test_lib.ParserTestCase):
"""Tests the delimiter seperated values (DSV) parser."""
"""Tests the delimiter separated values (DSV) parser."""

# pylint: disable=protected-access

Expand Down

0 comments on commit 2d012b7

Please sign in to comment.