Improved python3 compatibility of lib tests log2timeline#1932 (log2timeline#1934)

* Improved python3 compatibility of lib tests log2timeline#1927
Onager committed Jun 11, 2018
1 parent 5fe5562 commit e8f0df9
Showing 16 changed files with 201 additions and 121 deletions.
2 changes: 1 addition & 1 deletion plaso/engine/filter_file.py
@@ -66,7 +66,7 @@ def BuildFindSpecs(self, environment_variables=None):
path_attributes[attribute_name] = attribute_value

find_specs = []
with open(self._path, 'rb') as file_object:
with open(self._path, 'r') as file_object:
for line in file_object:
line = line.strip()
if line.startswith('#'):
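Note: the hunk above switches open() from binary to text mode so that iterating the filter file yields str on Python 3 and the startswith('#') comment check keeps working. A minimal standalone sketch of the difference (the file name is hypothetical):

with open('filter_file.txt', 'rb') as file_object:
  for line in file_object:
    # On Python 3 each line is bytes here, so a str prefix test such as
    # line.strip().startswith('#') raises TypeError.
    pass

with open('filter_file.txt', 'r') as file_object:
  for line in file_object:
    # In text mode each line is already str on both Python 2 and 3.
    if line.strip().startswith('#'):
      continue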
158 changes: 102 additions & 56 deletions plaso/lib/binary.py
@@ -3,7 +3,8 @@

from __future__ import unicode_literals

import binascii
import codecs
import itertools
import logging
import os

@@ -14,11 +15,11 @@ def ByteArrayCopyToString(byte_array, codepage='utf-8'):
"""Copies a UTF-8 encoded byte array into a Unicode string.
Args:
byte_array: A byte array containing an UTF-8 encoded string.
codepage: The codepage of the byte stream.
byte_array (bytes): byte stream containing an UTF-8 encoded string.
codepage (Optional[str]): codepage of the byte stream.
Returns:
A Unicode string.
str: Unicode string.
"""
byte_stream = b''.join(map(chr, byte_array))
return ByteStreamCopyToString(byte_stream, codepage=codepage)
@@ -28,44 +29,44 @@ def ByteStreamCopyToString(byte_stream, codepage='utf-8'):
"""Copies a UTF-8 encoded byte stream into a Unicode string.
Args:
byte_stream: A byte stream containing an UTF-8 encoded string.
codepage: The codepage of the byte stream.
byte_stream (bytes): byte stream containing an UTF-8 encoded string.
codepage (Optional[str]): codepage of the byte stream.
Returns:
A Unicode string.
str: Unicode string.
"""
try:
string = byte_stream.decode(codepage)
string = codecs.decode(byte_stream, codepage)
except UnicodeDecodeError:
logging.warning(
'Unable to decode {0:s} formatted byte stream.'.format(codepage))
string = byte_stream.decode(codepage, errors='ignore')
string = codecs.decode(byte_stream, codepage, errors='ignore')

string, _, _ = string.partition('\x00')
return string
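Note: the decode calls above follow a try-strict, fall-back-to-ignore pattern and truncate at the first NUL character. A minimal standalone sketch of that pattern (the sample value is illustrative, not from the commit):

import codecs
import logging

byte_stream = b'caf\xc3\xa9\x00trailing data'
try:
  string = codecs.decode(byte_stream, 'utf-8')
except UnicodeDecodeError:
  logging.warning('Unable to decode utf-8 formatted byte stream.')
  string = codecs.decode(byte_stream, 'utf-8', errors='ignore')

string, _, _ = string.partition('\x00')
print(string)  # café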


def ByteStreamCopyToUTF16Stream(byte_stream, byte_stream_size=None):
def ByteStreamCopyToUTF16String(byte_stream, byte_stream_size=None):
"""Reads an UTF-16 formatted stream from a byte stream.
The UTF-16 formatted stream should be terminated by an end-of-string
character (\x00\x00). Otherwise the function reads up to the byte stream size.
Args:
byte_stream: The byte stream that contains the UTF-16 formatted stream.
byte_stream_size: Optional byte stream size or None if the entire
byte stream should be read.
byte_stream (bytes): byte stream that contains the UTF-16 formatted
stream.
byte_stream_size (Optional[int]): byte stream size or None if the entire
byte stream should be read.
Returns:
String containing the UTF-16 formatted stream.
bytes: UTF-16 formatted stream.
"""
byte_stream_index = 0
if not byte_stream_size:
byte_stream_size = len(byte_stream)

while byte_stream_index + 1 < byte_stream_size:
if (byte_stream[byte_stream_index] == b'\x00' and
byte_stream[byte_stream_index + 1] == b'\x00'):
if _StreamContainsUTF16NullTerminator(byte_stream, byte_stream_index):
break

byte_stream_index += 2
@@ -80,14 +81,14 @@ def ReadUTF16Stream(file_object, offset=None, byte_size=0):
an end-of-string character (\x00\x00) or up to the byte size.
Args:
file_object: A file-like object to read the data from.
offset: An offset into the file object data, if -1 or not set
the current location into the file object data is used.
byte_size: Maximum number of bytes to read or 0 if the function
should keep reading up to the end of file.
file_object (file): file-like object to read the data from.
offset (Optional[int]): offset into the file object data, if -1 or not set
the current location into the file object data is used.
byte_size (Optional[int]): maximum number of bytes to read or 0 if the
function should keep reading up to the end of file.
Returns:
An Unicode string.
str: Unicode string.
"""
if offset is not None:
file_object.seek(offset, os.SEEK_SET)
@@ -116,18 +117,19 @@ def UTF16StreamCopyToString(byte_stream, byte_stream_size=None):
character (\x00\x00). Otherwise the function reads up to the byte stream size.
Args:
byte_stream: The UTF-16 formatted byte stream.
byte_stream_size: The byte stream size or None if the entire byte stream
should be used.
byte_stream (bytes): UTF-16 formatted byte stream.
byte_stream_size (Optional[int]): byte stream size or None if the entire
byte stream should be used.
Returns:
An Unicode string.
str: Unicode string.
"""
utf16_stream = ByteStreamCopyToUTF16Stream(
utf16_stream = ByteStreamCopyToUTF16String(
byte_stream, byte_stream_size=byte_stream_size)

try:
return utf16_stream.decode('utf-16-le')
string = codecs.decode(utf16_stream, 'utf-16-le')
return string
except (UnicodeDecodeError, UnicodeEncodeError) as exception:
logging.error('Unable to decode string: {0:s} with error: {1!s}'.format(
HexifyBuffer(utf16_stream), exception))
@@ -142,12 +144,12 @@ def ArrayOfUTF16StreamCopyToString(byte_stream, byte_stream_size=None):
character (\x00\x00). Otherwise the function reads up to the byte stream size.
Args:
byte_stream: The UTF-16 formatted byte stream.
byte_stream_size: The byte stream size or None if the entire byte stream
should be used.
byte_stream (str): UTF-16 formatted byte stream.
byte_stream_size (Optional[int]): byte stream size or None if the entire
byte stream should be used.
Returns:
An array of Unicode strings.
list[str]: Unicode strings.
"""
array_of_strings = []
utf16_stream_start = 0
@@ -156,15 +158,13 @@ def ArrayOfUTF16StreamCopyToString(byte_stream, byte_stream_size=None):
byte_stream_size = len(byte_stream)

while byte_stream_index + 1 < byte_stream_size:
if (byte_stream[byte_stream_index] == b'\x00' and
byte_stream[byte_stream_index + 1] == b'\x00'):

if _StreamContainsUTF16NullTerminator(byte_stream, byte_stream_index):
if byte_stream_index - utf16_stream_start <= 2:
break

array_of_strings.append(
byte_stream[utf16_stream_start:byte_stream_index].decode(
'utf-16-le'))
utf16_stream = byte_stream[utf16_stream_start:byte_stream_index]
string = codecs.decode(utf16_stream, 'utf-16-le')
array_of_strings.append(string)
utf16_stream_start = byte_stream_index + 2

byte_stream_index += 2
@@ -180,12 +180,13 @@ def ArrayOfUTF16StreamCopyToStringTable(byte_stream, byte_stream_size=None):
character (\x00\x00). Otherwise the function reads up to the byte stream size.
Args:
byte_stream: The UTF-16 formatted byte stream.
byte_stream_size: The byte stream size or None if the entire byte stream
should be used.
byte_stream (bytes): The UTF-16 formatted byte stream.
byte_stream_size (int): The byte stream size or None if the entire byte
stream should be used.
Returns:
A dict of Unicode strings with the byte offset as their key.
dict[int, str]: Unicode strings with their offset in the byte stream as
their key.
"""
string_table = {}
utf16_stream_start = 0
@@ -194,14 +195,13 @@ def ArrayOfUTF16StreamCopyToStringTable(byte_stream, byte_stream_size=None):
byte_stream_size = len(byte_stream)

while byte_stream_index + 1 < byte_stream_size:
if (byte_stream[byte_stream_index] == b'\x00' and
byte_stream[byte_stream_index + 1] == b'\x00'):

if _StreamContainsUTF16NullTerminator(byte_stream, byte_stream_index):
if byte_stream_index - utf16_stream_start <= 2:
break

string = byte_stream[utf16_stream_start:byte_stream_index].decode(
'utf-16-le')
utf16_stream = byte_stream[utf16_stream_start:byte_stream_index]
string = codecs.decode(utf16_stream, 'utf-16-le')
string_table[utf16_stream_start] = string
utf16_stream_start = byte_stream_index + 2

@@ -211,31 +211,77 @@


def ReadUTF16(string_buffer):
"""Returns a decoded UTF-16 string from a string buffer."""
"""Returns a decoded UTF-16 string from a string buffer.
Args:
string_buffer(bytes): byte string.
Returns:
str: Unicode string.
"""
if isinstance(string_buffer, (list, tuple)):
use_buffer = ''.join(string_buffer)
else:
use_buffer = string_buffer

if not isinstance(use_buffer, py2to3.STRING_TYPES):
if not isinstance(use_buffer, py2to3.BYTES_TYPE):
return ''

try:
return use_buffer.decode('utf-16').replace('\x00', '')
return codecs.decode(use_buffer, 'utf-16').replace('\x00', '')
except SyntaxError as exception:
logging.error('Unable to decode string: {0:s} with error: {1!s}.'.format(
HexifyBuffer(string_buffer), exception))
except (UnicodeDecodeError, UnicodeEncodeError) as exception:
logging.error('Unable to decode string: {0:s} with error: {1!s}'.format(
HexifyBuffer(string_buffer), exception))

return use_buffer.decode('utf-16', errors='ignore').replace('\x00', '')
return codecs.decode(
use_buffer, 'utf-16', errors='ignore').replace('\x00', '')


def HexifyBuffer(byte_sequence):
"""Returns an hexadecimal representation of a byte sequence.
Args:
byte_sequence (bytes): byte sequence.
Returns:
str: hexadecimal representation of the byte stream.
"""
hex_bytes = codecs.encode(byte_sequence, 'hex')
output_string = codecs.decode(hex_bytes, 'utf-8')
string_iterators = [iter(output_string)] * 2

# pylint: disable=no-member
if py2to3.PY_2:
iterators = itertools.izip_longest(*string_iterators)
else:
iterators = itertools.zip_longest(*string_iterators)
groups = list(iterators)
output_string = ''.join(
['\\x{0:s}{1:s}'.format(group[0], group[1]) for group in groups])
return output_string
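Note: the rewritten HexifyBuffer hex-encodes the buffer and then pairs the hex digits using the shared-iterator idiom ([iter(s)] * 2 with zip_longest); the previous binascii-based implementation, removed by this commit, appears further down in the diff. A minimal sketch of the grouping, assuming Python 3 (the committed code falls back to itertools.izip_longest on Python 2):

import codecs
import itertools

byte_sequence = b'\x00ab'
hex_digits = codecs.decode(codecs.encode(byte_sequence, 'hex'), 'utf-8')  # '006162'
# Passing the same iterator twice consumes the hex digits two at a time.
pairs = itertools.zip_longest(*([iter(hex_digits)] * 2))
print(''.join('\\x{0:s}{1:s}'.format(*pair) for pair in pairs))  # \x00\x61\x62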


def HexifyBuffer(string_buffer):
"""Return a string with the hex representation of a string buffer."""
chars = []
for char in string_buffer:
chars.append(binascii.hexlify(char))

return '\\x{0:s}'.format('\\x'.join(chars))


def _StreamContainsUTF16NullTerminator(byte_stream, offset):
"""Checks if the given byte stream has a UTF-16 null character at the offset.
This is a little complicated because of the necessity of supporting Python 2
and 3.
Args:
byte_stream (bytes): byte string.
offset (int): byte stream offset to check.
Returns:
bool: whether there's a UTF-16 null terminator in the stream at the given
offset.
"""
byte_1 = byte_stream[offset]
byte_2 = byte_stream[offset + 1]
if py2to3.PY_2 and byte_1 == b'\x00' and byte_2 == b'\x00':
return True
if py2to3.PY_3 and byte_1 == 0 and byte_2 == 0:
return True
return False
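Note: _StreamContainsUTF16NullTerminator exists because indexing a bytes object yields a one-character string on Python 2 but an int on Python 3, hence the two branches in the helper above. A short illustration (the sample value is made up):

byte_stream = b'A\x00\x00\x00'
# Python 3: indexing bytes yields an int, so the terminator check compares against 0.
print(byte_stream[1], byte_stream[1] == 0)  # 0 True
# Python 2: the same index yields b'\x00', a one-character string, so the
# check compares against b'\x00' instead.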
4 changes: 2 additions & 2 deletions plaso/lib/lexer.py
@@ -473,7 +473,7 @@ def _CombineBinaryExpressions(self, operator):
self.stack[i-1] = None
self.stack[i+1] = None

self.stack = filter(None, self.stack)
self.stack = list(filter(None, self.stack))

def _CombineParenthesis(self):
"""Combine parenthesis."""
@@ -483,7 +483,7 @@ def _CombineParenthesis(self):
self.stack[i] = None
self.stack[i+2] = None

self.stack = filter(None, self.stack)
self.stack = list(filter(None, self.stack))

def Reduce(self):
"""Reduce the token stack into an AST."""
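Note: both lexer hunks wrap filter() in list() because on Python 3 filter() returns a lazy iterator, while the surrounding code indexes and re-scans self.stack. A short illustration (the stack contents are made up):

stack = ['token', None, 'other', None]
filtered = filter(None, stack)
# On Python 3 this is a one-shot iterator: len(filtered) raises TypeError and
# filtered[0] is not supported.
stack = list(filter(None, stack))
print(len(stack), stack[0])  # 2 token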
23 changes: 10 additions & 13 deletions plaso/lib/loggers.py
@@ -6,18 +6,22 @@
import gzip
import logging

from plaso.lib import py2to3


class CompressedFileHandler(logging.FileHandler):
"""Compressed file handler for logging."""

def __init__(self, filename, mode='a', encoding=None):
def __init__(self, filename, mode='a', encoding='utf-8'):
"""Initializes a compressed file logging handler.
Args:
filename (str): name of the log file.
mode (Optional[str]): file access mode.
encoding (Optional[str]): encoding of the log lines.
"""
if 't' not in mode and encoding and py2to3.PY_3:
mode = '{0:s}t'.format(mode)
super(CompressedFileHandler, self).__init__(
filename, mode=mode, encoding=encoding, delay=True)

@@ -28,19 +32,12 @@ def _open(self):
file: file-like object of the resulting stream.
"""
# The gzip module supports directly setting encoding as of Python 3.3.
return gzip.open(self.baseFilename, self.mode)

def emit(self, record):
"""Emits a record.
Args:
record (logging.LogRecord): log record.
"""
if self.encoding:
record = record.encode(self.encoding)

super(CompressedFileHandler, self).emit(record)
# pylint: disable=unexpected-keyword-arg
if py2to3.PY_3:
return gzip.open(
self.baseFilename, mode=self.mode, encoding=self.encoding)

return gzip.open(self.baseFilename, self.mode)

def ConfigureLogging(
debug_output=False, filename=None, mode='w', quiet_mode=False):
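Note: on Python 3 the handler appends 't' to the file mode and passes the encoding through to gzip.open (supported since Python 3.3), so log records are written as text inside the compressed stream. A small usage sketch, assuming Python 3 and a hypothetical log file name:

import gzip

with gzip.open('plaso.log.gz', mode='at', encoding='utf-8') as log_file:
  log_file.write('compressed log line\n')

with gzip.open('plaso.log.gz', mode='rt', encoding='utf-8') as log_file:
  print(log_file.read())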
