Skip to content

Commit

Permalink
Merge a80cc45 into bc32373
Browse files Browse the repository at this point in the history
  • Loading branch information
puremourning committed Apr 19, 2016
2 parents bc32373 + a80cc45 commit bbf4aaa
Show file tree
Hide file tree
Showing 44 changed files with 1,873 additions and 382 deletions.
13 changes: 7 additions & 6 deletions ycmd/completers/all/identifier_completer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from collections import defaultdict
from ycmd.completers.general_completer import GeneralCompleter
from ycmd import identifier_utils
from ycmd.utils import ToCppStringCompatible
from ycmd.utils import ToCppStringCompatible, SplitLines
from ycmd import responses

SYNTAX_FILENAME = 'YCM_PLACEHOLDER_FOR_SYNTAX'
Expand Down Expand Up @@ -190,15 +190,15 @@ def PreviousIdentifierOnLine( line, column ):
return nearest_ident

line_num = request_data[ 'line_num' ] - 1
column_num = request_data[ 'column_num' ] - 1
column_num = request_data[ 'column_codepoint' ] - 1
filepath = request_data[ 'filepath' ]
try:
filetype = request_data[ 'filetypes' ][ 0 ]
except KeyError:
filetype = None

contents_per_line = (
request_data[ 'file_data' ][ filepath ][ 'contents' ].split( '\n' ) )
SplitLines( request_data[ 'file_data' ][ filepath ][ 'contents' ] ) )

ident = PreviousIdentifierOnLine( contents_per_line[ line_num ], column_num )
if ident:
Expand All @@ -225,9 +225,10 @@ def _GetCursorIdentifier( request_data ):
filetype = request_data[ 'filetypes' ][ 0 ]
except KeyError:
filetype = None
return identifier_utils.IdentifierAtIndex( request_data[ 'line_value' ],
request_data[ 'column_num' ] - 1,
filetype )
return identifier_utils.IdentifierAtIndex(
request_data[ 'line_value' ],
request_data[ 'column_codepoint' ] - 1,
filetype )


def _IdentifiersFromBuffer( text,
Expand Down
65 changes: 60 additions & 5 deletions ycmd/completers/completer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,56 @@ class Completer( with_metaclass( abc.ABCMeta, object ) ):
Completer. The following are functions that the Vim part of YCM will be
calling on your Completer:
*Important note about unicode and byte offsets*
Useful background: http://utf8everywhere.org
Internally, all Python strings are unicode string objects, unless otherwise
converted to 'bytes' using ToBytes. In particular, the line_value and
file_data.contents entries in the request_data are unicode strings.
However, offsets in the API (such as column_num and start_column) are *byte*
offsets into a utf-8 encoded version of the contents of the line or buffer.
Therefore it is *never* safe to perform 'character' arithmetic
(such as '-1' to get the previous 'character') using these byte offsets, and
they cannot *ever* be used to index into line_value or buffer contents
unicode strings.
It is therefore important to ensure that you use the right type of offsets
for the right type of calculation:
- use codepoint offsets and a unicode string for 'character' calculations
- use byte offsets and utf-8 encoded bytes for all other manipulations
ycmd provides the following ways of accessing the source data and offsets:
For working with utf-8 encoded bytes:
- request_data[ 'line_bytes' ] - the line as utf-8 encoded bytes.
- request_data[ 'start_column' ] and request_data[ 'column_num' ].
For working with 'character' manipulations (unicode strings and codepoint
offsets):
- request_data[ 'line_value' ] - the line as a unicode string.
- request_data[ 'start_codepoint' ] and request_data[ 'column_codepoint' ].
For converting between the two:
- utils.ToBytes
- utils.ByteOffsetToCodepointOffset
- utils.ToUnicode
- utils.CodepointOffsetToByteOffset
Note: The above use of codepoints for 'character' manipulations is not
strictly correct. There are unicode 'characters' which consume multiple
codepoints. However, it is currently considered viable to use a single
codepoint = a single character until such a time as we improve support for
unicode identifiers. The purpose of the above rule is to prevent crashes and
random encoding exceptions, not to fully support unicode identifiers.
*END: Important note about unicode and byte offsets*
ShouldUseNow() is called with the start column of where a potential completion
string should start and the current line (string) the cursor is on. For
instance, if the user's input is 'foo.bar' and the cursor is on the 'r' in
'bar', start_column will be the 1-based index of 'b' in the line. Your
'bar', start_column will be the 1-based byte index of 'b' in the line. Your
implementation of ShouldUseNow() should return True if your semantic completer
should be used and False otherwise.
Expand Down Expand Up @@ -146,16 +192,19 @@ def ShouldUseNowInner( self, request_data ):
if not self.prepared_triggers:
return False
current_line = request_data[ 'line_value' ]
start_column = request_data[ 'start_column' ] - 1
column_num = request_data[ 'column_num' ] - 1
start_codepoint = request_data[ 'start_codepoint' ] - 1
column_codepoint = request_data[ 'column_codepoint' ] - 1
filetype = self._CurrentFiletype( request_data[ 'filetypes' ] )

return self.prepared_triggers.MatchesForFiletype(
current_line, start_column, column_num, filetype )
current_line, start_codepoint, column_codepoint, filetype )


def QueryLengthAboveMinThreshold( self, request_data ):
query_length = request_data[ 'column_num' ] - request_data[ 'start_column' ]
# Note: calculation in 'characters' not bytes.
query_length = ( request_data[ 'column_codepoint' ] -
request_data[ 'start_codepoint' ] )

return query_length >= self.min_num_chars


Expand Down Expand Up @@ -322,6 +371,9 @@ def ServerIsReady( self ):


class CompletionsCache( object ):
"""Completions for a particular request. Importantly, columns are byte
offsets, not unicode codepoints."""

def __init__( self ):
self._access_lock = threading.Lock()
self.Invalidate()
Expand All @@ -335,6 +387,7 @@ def Invalidate( self ):
self._completions = None


# start_column is a byte offset.
def Update( self, line_num, start_column, completion_type, completions ):
with self._access_lock:
self._line_num = line_num
Expand All @@ -343,6 +396,7 @@ def Update( self, line_num, start_column, completion_type, completions ):
self._completions = completions


# start_column is a byte offset.
def GetCompletionsIfCacheValid( self, line_num, start_column,
completion_type ):
with self._access_lock:
Expand All @@ -352,6 +406,7 @@ def GetCompletionsIfCacheValid( self, line_num, start_column,
return self._completions


# start_column is a byte offset.
def _CacheValidNoLock( self, line_num, start_column, completion_type ):
return ( line_num == self._line_num and
start_column == self._start_column and
Expand Down
149 changes: 121 additions & 28 deletions ycmd/completers/completer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@
# We don't want ycm_core inside Vim.
import os
import re
import copy
from collections import defaultdict
from ycmd.utils import ToCppStringCompatible, ToUnicode
from ycmd.utils import ToCppStringCompatible, ToUnicode, ReadFile


class PreparedTriggers( object ):
Expand All @@ -46,23 +47,29 @@ def __init__( self, user_trigger_map = None, filetype_set = None ):
self._filetype_to_prepared_triggers = final_triggers


def MatchingTriggerForFiletype( self, current_line, start_column, column_num,
def MatchingTriggerForFiletype( self,
current_line,
start_codepoint,
column_codepoint,
filetype ):
try:
triggers = self._filetype_to_prepared_triggers[ filetype ]
except KeyError:
return None
return _MatchingSemanticTrigger( current_line,
start_column,
column_num,
start_codepoint,
column_codepoint,
triggers )


def MatchesForFiletype( self, current_line, start_column, column_num,
def MatchesForFiletype( self,
current_line,
start_codepoint,
column_codepoint,
filetype ):
return self.MatchingTriggerForFiletype( current_line,
start_column,
column_num,
start_codepoint,
column_codepoint,
filetype ) is not None


Expand Down Expand Up @@ -92,44 +99,53 @@ def UpdateDict( first, second ):
return final_dict


def _RegexTriggerMatches( trigger, line_value, start_column, column_num ):
# start_codepoint and column_codepoint are codepoint offsets in the unicode
# string line_value.
def _RegexTriggerMatches( trigger,
line_value,
start_codepoint,
column_codepoint ):
for match in trigger.finditer( line_value ):
# By definition of 'start_column', we know that the character just before
# 'start_column' is not an identifier character but all characters
# between 'start_column' and 'column_num' are. This means that if our
# trigger ends with an identifier character, its tail must match between
# 'start_column' and 'column_num', 'start_column' excluded. But if it
# doesn't, its tail must match exactly at 'start_column'. Both cases are
# mutually exclusive hence the following condition.
if start_column <= match.end() and match.end() <= column_num:
# By definition of 'start_codepoint', we know that the character just before
# 'start_codepoint' is not an identifier character but all characters
# between 'start_codepoint' and 'column_codepoint' are. This means that if
# our trigger ends with an identifier character, its tail must match between
# 'start_codepoint' and 'column_codepoint', 'start_codepoint' excluded. But
# if it doesn't, its tail must match exactly at 'start_codepoint'. Both
# cases are mutually exclusive hence the following condition.
if start_codepoint <= match.end() and match.end() <= column_codepoint:
return True
return False


# start_column and column_num are 0-based
def _MatchingSemanticTrigger( line_value, start_column, column_num,
# start_codepoint and column_codepoint are 0-based and are codepoint offsets
# into the unicode string line_value.
def _MatchingSemanticTrigger( line_value, start_codepoint, column_codepoint,
trigger_list ):
if start_column < 0 or column_num < 0:
if start_codepoint < 0 or column_codepoint < 0:
return None

line_length = len( line_value )
if not line_length or start_column > line_length:
if not line_length or start_codepoint > line_length:
return None

# Ignore characters after user's caret column
line_value = line_value[ :column_num ]
line_value = line_value[ : column_codepoint ]

for trigger in trigger_list:
if _RegexTriggerMatches( trigger, line_value, start_column, column_num ):
if _RegexTriggerMatches( trigger,
line_value,
start_codepoint,
column_codepoint ):
return trigger
return None


def _MatchesSemanticTrigger( line_value, start_column, column_num,
def _MatchesSemanticTrigger( line_value, start_codepoint, column_codepoint,
trigger_list ):
return _MatchingSemanticTrigger( line_value,
start_column,
column_num,
start_codepoint,
column_codepoint,
trigger_list ) is not None


Expand All @@ -155,9 +171,73 @@ def FiletypeCompleterExistsForFiletype( filetype ):

def FilterAndSortCandidatesWrap( candidates, sort_property, query ):
from ycm_core import FilterAndSortCandidates
return FilterAndSortCandidates( candidates,
ToCppStringCompatible( sort_property ),
ToCppStringCompatible( query ) )

# The c++ interface we use only understands the (*native*) 'str' type (i.e.
# not the 'str' type from python-future). If we pass it a 'unicode' or
# 'bytes' instance then various things blow up, such as converting to
# std::string. Therefore all strings passed into the c++ API must pass through
# ToCppStringCompatible (or more strictly all strings which the C++ code
# needs to use and convert. In this case, just the insertion text property)

# FIXME: This is actually quite inefficient in an area which is used
# constantly and the key performance critical part of the system. There is
# code in the C++ layer (see PythonSupport.cpp:GetUtf8String) which attempts
# to work around this limitation. Unfortunately it has issues which cause the
# above problems, and we work around it by converting here in the python
# layer until we can come up with a better solution in the C++ layer.

# Note: we must deep copy candidates because we do not want to clobber the
# data that is passed in. It is actually used directly by the cache, so if
# we change the data pointed to by the elements of candidates, then this will
# be reflected in a subsequent response from the cache. This is particularly
# important for those candidates which are *not* returned after the filter, as
# they are not converted back to unicode.
cpp_compatible_candidates = _ConvertCandidatesToCppCompatible(
copy.deepcopy( candidates ),
sort_property )

# However, the rest of the python layer expects all the candidate properties
# to be some form of unicode string - a python-future str() instance.
# So we need to convert the insertion text property back to a unicode string
# before returning it.
filtered_candidates = FilterAndSortCandidates(
cpp_compatible_candidates,
ToCppStringCompatible( sort_property ),
ToCppStringCompatible( query ) )

return _ConvertCandidatesToPythonCompatible( filtered_candidates,
sort_property )


def _ConvertCandidatesToCppCompatible( candidates, sort_property ):
  """Translate each candidate's logical insertion-text field into the native
  'str' representation required by the C++ layer. See _ConvertCandidates for
  the details of how |candidates| and |sort_property| are interpreted."""
  return _ConvertCandidates( candidates,
                             sort_property,
                             ToCppStringCompatible )


def _ConvertCandidatesToPythonCompatible( candidates, sort_property ):
  """Translate each candidate's logical insertion-text field back into a
  unicode string, as expected by the rest of the python layer. See
  _ConvertCandidates for the details of how |candidates| and |sort_property|
  are interpreted."""
  return _ConvertCandidates( candidates, sort_property, ToUnicode )


def _ConvertCandidates( candidates, sort_property, converter ):
"""Apply the conversion function |converter| to the logical insertion text
field within the candidates in the candidate list |candidates|. The
|sort_property| is required to determine the format of |candidates|.
The conversion function should take a single argument (the string) and return
the converted string. It should be one of ycmd.utils.ToUnicode or
ycmd.utils.ToCppStringCompatible.
Typically this method is not called directly, rather it is used via
_ConvertCandidatesToCppCompatible and _ConvertCandidatesToPythonCompatible."""

if sort_property:
for candidate in candidates:
candidate[ sort_property ] = converter( candidate[ sort_property ] )
return candidates

return [ converter( c ) for c in candidates ]


TRIGGER_REGEX_PREFIX = 're!'

Expand Down Expand Up @@ -216,3 +296,16 @@ def GetIncludeStatementValue( line, check_closing = True ):
if close_char_pos != -1:
include_value = line[ match.end() : close_char_pos ]
return include_value, quoted_include


def GetFileContents( request_data, filename ):
  """Return the contents of the absolute path |filename| as a unicode string.

  The in-editor contents from |request_data| take precedence, since the
  buffer may be open and modified/dirty in the user's editor. Only when the
  file is not present in the request's file_data do we fall back to reading
  it from disk (assuming a UTF-8 encoding)."""
  file_data = request_data[ 'file_data' ]
  if filename not in file_data:
    return ToUnicode( ReadFile( filename ) )
  return ToUnicode( file_data[ filename ][ 'contents' ] )
Loading

0 comments on commit bbf4aaa

Please sign in to comment.