Skip to content

Commit

Permalink
Merge a80cc45 into bc32373
Browse files Browse the repository at this point in the history
  • Loading branch information
puremourning committed Apr 19, 2016
2 parents bc32373 + a80cc45 commit bbf4aaa
Show file tree
Hide file tree
Showing 44 changed files with 1,873 additions and 382 deletions.
13 changes: 7 additions & 6 deletions ycmd/completers/all/identifier_completer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from collections import defaultdict
from ycmd.completers.general_completer import GeneralCompleter
from ycmd import identifier_utils
from ycmd.utils import ToCppStringCompatible
from ycmd.utils import ToCppStringCompatible, SplitLines
from ycmd import responses

SYNTAX_FILENAME = 'YCM_PLACEHOLDER_FOR_SYNTAX'
Expand Down Expand Up @@ -190,15 +190,15 @@ def PreviousIdentifierOnLine( line, column ):
return nearest_ident

line_num = request_data[ 'line_num' ] - 1
column_num = request_data[ 'column_num' ] - 1
column_num = request_data[ 'column_codepoint' ] - 1
filepath = request_data[ 'filepath' ]
try:
filetype = request_data[ 'filetypes' ][ 0 ]
except KeyError:
filetype = None

contents_per_line = (
request_data[ 'file_data' ][ filepath ][ 'contents' ].split( '\n' ) )
SplitLines( request_data[ 'file_data' ][ filepath ][ 'contents' ] ) )

ident = PreviousIdentifierOnLine( contents_per_line[ line_num ], column_num )
if ident:
Expand All @@ -225,9 +225,10 @@ def _GetCursorIdentifier( request_data ):
filetype = request_data[ 'filetypes' ][ 0 ]
except KeyError:
filetype = None
return identifier_utils.IdentifierAtIndex( request_data[ 'line_value' ],
request_data[ 'column_num' ] - 1,
filetype )
return identifier_utils.IdentifierAtIndex(
request_data[ 'line_value' ],
request_data[ 'column_codepoint' ] - 1,
filetype )


def _IdentifiersFromBuffer( text,
Expand Down
65 changes: 60 additions & 5 deletions ycmd/completers/completer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,56 @@ class Completer( with_metaclass( abc.ABCMeta, object ) ):
Completer. The following are functions that the Vim part of YCM will be
calling on your Completer:
*Important note about unicode and byte offsets*
Useful background: http://utf8everywhere.org
Internally, all Python strings are unicode string objects, unless otherwise
converted to 'bytes' using ToBytes. In particular, the line_value and
file_data.contents entries in the request_data are unicode strings.
However, offsets in the API (such as column_num and start_column) are *byte*
offsets into a utf-8 encoded version of the contents of the line or buffer.
Therefore it is *never* safe to perform 'character' arithmetic
(such as '-1' to get the previous 'character') using these byte offsets, and
they cannot *ever* be used to index into line_value or buffer contents
unicode strings.
It is therefore important to ensure that you use the right type of offsets
for the right type of calculation:
- use codepoint offsets and a unicode string for 'character' calculations
- use byte offsets and utf-8 encoded bytes for all other manipulations
ycmd provides the following ways of accessing the source data and offsets:
For working with utf-8 encoded bytes:
- request_data[ 'line_bytes' ] - the line as utf-8 encoded bytes.
- request_data[ 'start_column' ] and request_data[ 'column_num' ].
For working with 'character' manipulations (unicode strings and codepoint
offsets):
- request_data[ 'line_value' ] - the line as a unicode string.
- request_data[ 'start_codepoint' ] and request_data[ 'column_codepoint' ].
For converting between the two:
- utils.ToBytes
- utils.ByteOffsetToCodepointOffset
- utils.ToUnicode
- utils.CodepointOffsetToByteOffset
Note: The above use of codepoints for 'character' manipulations is not
strictly correct. There are unicode 'characters' which consume multiple
codepoints. However, it is currently considered viable to use a single
codepoint = a single character until such a time as we improve support for
unicode identifiers. The purpose of the above rule is to prevent crashes and
random encoding exceptions, not to fully support unicode identifiers.
*END: Important note about unicode and byte offsets*
ShouldUseNow() is called with the start column of where a potential completion
string should start and the current line (string) the cursor is on. For
instance, if the user's input is 'foo.bar' and the cursor is on the 'r' in
'bar', start_column will be the 1-based index of 'b' in the line. Your
'bar', start_column will be the 1-based byte index of 'b' in the line. Your
implementation of ShouldUseNow() should return True if your semantic completer
should be used and False otherwise.
Expand Down Expand Up @@ -146,16 +192,19 @@ def ShouldUseNowInner( self, request_data ):
if not self.prepared_triggers:
return False
current_line = request_data[ 'line_value' ]
start_column = request_data[ 'start_column' ] - 1
column_num = request_data[ 'column_num' ] - 1
start_codepoint = request_data[ 'start_codepoint' ] - 1
column_codepoint = request_data[ 'column_codepoint' ] - 1
filetype = self._CurrentFiletype( request_data[ 'filetypes' ] )

return self.prepared_triggers.MatchesForFiletype(
current_line, start_column, column_num, filetype )
current_line, start_codepoint, column_codepoint, filetype )


def QueryLengthAboveMinThreshold( self, request_data ):
query_length = request_data[ 'column_num' ] - request_data[ 'start_column' ]
# Note: calculation in 'characters' not bytes.
query_length = ( request_data[ 'column_codepoint' ] -
request_data[ 'start_codepoint' ] )

return query_length >= self.min_num_chars


Expand Down Expand Up @@ -322,6 +371,9 @@ def ServerIsReady( self ):


class CompletionsCache( object ):
"""Completions for a particular request. Importantly, columns are byte
offsets, not unicode codepoints."""

def __init__( self ):
self._access_lock = threading.Lock()
self.Invalidate()
Expand All @@ -335,6 +387,7 @@ def Invalidate( self ):
self._completions = None


# start_column is a byte offset.
def Update( self, line_num, start_column, completion_type, completions ):
with self._access_lock:
self._line_num = line_num
Expand All @@ -343,6 +396,7 @@ def Update( self, line_num, start_column, completion_type, completions ):
self._completions = completions


# start_column is a byte offset.
def GetCompletionsIfCacheValid( self, line_num, start_column,
completion_type ):
with self._access_lock:
Expand All @@ -352,6 +406,7 @@ def GetCompletionsIfCacheValid( self, line_num, start_column,
return self._completions


# start_column is a byte offset.
def _CacheValidNoLock( self, line_num, start_column, completion_type ):
return ( line_num == self._line_num and
start_column == self._start_column and
Expand Down
149 changes: 121 additions & 28 deletions ycmd/completers/completer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@
# We don't want ycm_core inside Vim.
import os
import re
import copy
from collections import defaultdict
from ycmd.utils import ToCppStringCompatible, ToUnicode
from ycmd.utils import ToCppStringCompatible, ToUnicode, ReadFile


class PreparedTriggers( object ):
Expand All @@ -46,23 +47,29 @@ def __init__( self, user_trigger_map = None, filetype_set = None ):
self._filetype_to_prepared_triggers = final_triggers


def MatchingTriggerForFiletype( self, current_line, start_column, column_num,
def MatchingTriggerForFiletype( self,
current_line,
start_codepoint,
column_codepoint,
filetype ):
try:
triggers = self._filetype_to_prepared_triggers[ filetype ]
except KeyError:
return None
return _MatchingSemanticTrigger( current_line,
start_column,
column_num,
start_codepoint,
column_codepoint,
triggers )


def MatchesForFiletype( self, current_line, start_column, column_num,
def MatchesForFiletype( self,
current_line,
start_codepoint,
column_codepoint,
filetype ):
return self.MatchingTriggerForFiletype( current_line,
start_column,
column_num,
start_codepoint,
column_codepoint,
filetype ) is not None


Expand Down Expand Up @@ -92,44 +99,53 @@ def UpdateDict( first, second ):
return final_dict


def _RegexTriggerMatches( trigger, line_value, start_column, column_num ):
# start_codepoint and column_codepoint are codepoint offsets in the unicode
# string line_value.
def _RegexTriggerMatches( trigger,
line_value,
start_codepoint,
column_codepoint ):
for match in trigger.finditer( line_value ):
# By definition of 'start_column', we know that the character just before
# 'start_column' is not an identifier character but all characters
# between 'start_column' and 'column_num' are. This means that if our
# trigger ends with an identifier character, its tail must match between
# 'start_column' and 'column_num', 'start_column' excluded. But if it
# doesn't, its tail must match exactly at 'start_column'. Both cases are
# mutually exclusive hence the following condition.
if start_column <= match.end() and match.end() <= column_num:
# By definition of 'start_codepoint', we know that the character just before
# 'start_codepoint' is not an identifier character but all characters
# between 'start_codepoint' and 'column_codepoint' are. This means that if
# our trigger ends with an identifier character, its tail must match between
# 'start_codepoint' and 'column_codepoint', 'start_codepoint' excluded. But
# if it doesn't, its tail must match exactly at 'start_codepoint'. Both
# cases are mutually exclusive hence the following condition.
if start_codepoint <= match.end() and match.end() <= column_codepoint:
return True
return False


# start_column and column_num are 0-based
def _MatchingSemanticTrigger( line_value, start_column, column_num,
# start_codepoint and column_codepoint are 0-based and are codepoint offsets
# into the unicode string line_value.
def _MatchingSemanticTrigger( line_value, start_codepoint, column_codepoint,
trigger_list ):
if start_column < 0 or column_num < 0:
if start_codepoint < 0 or column_codepoint < 0:
return None

line_length = len( line_value )
if not line_length or start_column > line_length:
if not line_length or start_codepoint > line_length:
return None

# Ignore characters after user's caret column
line_value = line_value[ :column_num ]
line_value = line_value[ : column_codepoint ]

for trigger in trigger_list:
if _RegexTriggerMatches( trigger, line_value, start_column, column_num ):
if _RegexTriggerMatches( trigger,
line_value,
start_codepoint,
column_codepoint ):
return trigger
return None


def _MatchesSemanticTrigger( line_value, start_column, column_num,
def _MatchesSemanticTrigger( line_value, start_codepoint, column_codepoint,
trigger_list ):
return _MatchingSemanticTrigger( line_value,
start_column,
column_num,
start_codepoint,
column_codepoint,
trigger_list ) is not None


Expand All @@ -155,9 +171,73 @@ def FiletypeCompleterExistsForFiletype( filetype ):

def FilterAndSortCandidatesWrap( candidates, sort_property, query ):
from ycm_core import FilterAndSortCandidates
return FilterAndSortCandidates( candidates,
ToCppStringCompatible( sort_property ),
ToCppStringCompatible( query ) )

# The c++ interface we use only understands the (*native*) 'str' type (i.e.
# not the 'str' type from python-future). If we pass it a 'unicode' or
# 'bytes' instance then various things blow up, such as converting to
# std::string. Therefore all strings passed into the c++ API must pass through
# ToCppStringCompatible (or more strictly all strings which the C++ code
# needs to use and convert. In this case, just the insertion text property)

# FIXME: This is actually quite inefficient in an area which is used
# constantly and the key performance critical part of the system. There is
# code in the C++ layer (see PythonSupport.cpp:GetUtf8String) which attempts
# to work around this limitation. Unfortunately it has issues which cause the
# above problems, and we work around it by converting here in the python
# layer until we can come up with a better solution in the C++ layer.

# Note: we must deep copy candidates because we do not want to clobber the
# data that is passed in. It is actually used directly by the cache, so if
# we change the data pointed to by the elements of candidates, then this will
# be reflected in a subsequent response from the cache. This is particularly
# important for those candidates which are *not* returned after the filter, as
# they are not converted back to unicode.
cpp_compatible_candidates = _ConvertCandidatesToCppCompatible(
copy.deepcopy( candidates ),
sort_property )

# However, the rest of the python layer expects all the candidate properties
# to be some form of unicode string - a python-future str() instance.
# So we need to convert the insertion text property back to a unicode string
# before returning it.
filtered_candidates = FilterAndSortCandidates(
cpp_compatible_candidates,
ToCppStringCompatible( sort_property ),
ToCppStringCompatible( query ) )

return _ConvertCandidatesToPythonCompatible( filtered_candidates,
sort_property )


def _ConvertCandidatesToCppCompatible( candidates, sort_property ):
  """Translate each candidate's logical insertion-text field into the native
  'str' representation required by the C++ layer. See _ConvertCandidates for
  the details of how |candidates| and |sort_property| are interpreted."""
  return _ConvertCandidates( candidates,
                             sort_property,
                             ToCppStringCompatible )


def _ConvertCandidatesToPythonCompatible( candidates, sort_property ):
  """Translate each candidate's logical insertion-text field back into a
  unicode string, as expected by the rest of the python layer. See
  _ConvertCandidates for the details of how |candidates| and |sort_property|
  are interpreted."""
  return _ConvertCandidates( candidates, sort_property, ToUnicode )


def _ConvertCandidates( candidates, sort_property, converter ):
"""Apply the conversion function |converter| to the logical insertion text
field within the candidates in the candidate list |candidates|. The
|sort_property| is required to determine the format of |candidates|.
The conversion function should take a single argument (the string) and return
the converted string. It should be one of ycmd.utils.ToUnicode or
ycmd.utils.ToCppStringCompatible.
Typically this method is not called directly, rather it is used via
_ConvertCandidatesToCppCompatible and _ConvertCandidatesToPythonCompatible."""

if sort_property:
for candidate in candidates:
candidate[ sort_property ] = converter( candidate[ sort_property ] )
return candidates

return [ converter( c ) for c in candidates ]


TRIGGER_REGEX_PREFIX = 're!'

Expand Down Expand Up @@ -216,3 +296,16 @@ def GetIncludeStatementValue( line, check_closing = True ):
if close_char_pos != -1:
include_value = line[ match.end() : close_char_pos ]
return include_value, quoted_include


def GetFileContents( request_data, filename ):
  """Return the contents of the absolute path |filename| as a unicode string.

  The in-editor contents from |request_data| take precedence, since the
  buffer may be open and modified/dirty in the user's editor. Only when the
  file is not present in the request's file_data do we fall back to reading
  it from disk (assuming a UTF-8 encoding)."""
  file_data = request_data[ 'file_data' ]
  if filename not in file_data:
    return ToUnicode( ReadFile( filename ) )
  return ToUnicode( file_data[ filename ][ 'contents' ] )
Loading

0 comments on commit bbf4aaa

Please sign in to comment.