Skip to content

Commit

Permalink
Implemented linguistic complexity but may not be working quite right.…
Browse files Browse the repository at this point in the history
… Need to benchmark against original code...
  • Loading branch information
alexholehouse committed Jun 28, 2015
1 parent f6172f3 commit 7343a6e
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 2 deletions.
9 changes: 9 additions & 0 deletions localcider/backend/sequence.py
Expand Up @@ -580,6 +580,15 @@ def get_linear_WF_complexity(self, alphabetSize=20, userAlphabet={}, windowSize=
"""
return self.ComplexityObject.get_WF_complexity(self.seq, alphabetSize, userAlphabet, windowSize, stepSize)



def get_linear_LC_complexity(self, alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1, wordSize=3):
    """
    Sliding-window Linguistic Complexity (LC) profile of this sequence.

    Thin wrapper: hands self.seq and every argument, unchanged and in
    order, to self.ComplexityObject.get_LC_complexity and returns
    whatever that call returns.
    """
    # all of the actual work lives in the complexity backend object
    complexity_backend = self.ComplexityObject
    lc_profile = complexity_backend.get_LC_complexity(
        self.seq,
        alphabetSize,
        userAlphabet,
        windowSize,
        stepSize,
        wordSize,
    )
    return lc_profile



Expand Down
53 changes: 53 additions & 0 deletions localcider/backend/sequenceComplexity.py
Expand Up @@ -388,6 +388,51 @@ def CWF(self, sequence, alphabet, windowSize, stepSize):

return CWF_array

###########################################################
# calculate linguistic complexity
###########################################################
def LC(self, sequence, alphabet, windowSize, stepSize, wordSize):
    """
    Calculate a sliding-window linguistic complexity (LC) profile.

    For each window the score is

        (number of distinct words of length wordSize in the window) /
        (maximum number of distinct words the window could contain)

    where the maximum is the smaller of len(alphabet)**wordSize (every
    word the alphabet can form) and windowSize - wordSize + 1 (the number
    of word positions in a window).

    sequence   | Sliceable sequence of residues; each slice is joined into
                 a string with ''.join, so a string or a list of
                 one-character strings both work.
    alphabet   | Collection of symbols; only its length is used here.
    windowSize | Number of residues in each window.
    stepSize   | Offset between the starts of consecutive windows (>= 1).
    wordSize   | Length of the words (n-grams) being counted (>= 1 and
                 <= windowSize).

    Returns a list of floats, one per window, in sequence order. The list
    is empty when the sequence is shorter than windowSize.

    Raises ValueError if stepSize or wordSize is smaller than 1, or if
    wordSize is larger than windowSize.
    """

    # a non-positive step would never advance the window (infinite loop)
    # and a non-positive word size has no meaning
    if stepSize < 1:
        raise ValueError("stepSize must be >= 1 (got %s)" % (stepSize,))
    if wordSize < 1:
        raise ValueError("wordSize must be >= 1 (got %s)" % (wordSize,))

    # no complete word fits into a window shorter than the word itself
    if wordSize > windowSize:
        raise ValueError("wordSize (%s) cannot be larger than windowSize (%s)" % (wordSize, windowSize))

    # a window of W residues holds W - wordSize + 1 overlapping words
    words_per_window = windowSize - wordSize + 1

    # the largest achievable vocabulary is identical for every window,
    # so work it out once
    vmax = min(len(alphabet) ** wordSize, words_per_window)

    LC_array = []

    # only windows which fit entirely inside the sequence are scored
    for start in range(0, len(sequence) - windowSize + 1, stepSize):

        # the set of distinct words found in this window
        ngrams = set(
            ''.join(sequence[position:position + wordSize])
            for position in range(start, start + words_per_window)
        )

        # float() keeps this a true division under Python 2 as well
        LC_array.append(float(len(ngrams)) / vmax)

    return LC_array



def get_WF_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSize=10,stepSize=1):

Expand All @@ -397,5 +442,13 @@ def get_WF_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSi
return self.CWF(reduced_sequence, alphabet, windowSize, stepSize)


def get_LC_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSize=10,stepSize=1,wordSize=3):
    """
    Linguistic complexity profile of a sequence after alphabet reduction.

    The sequence is first passed through self.reduce_alphabet together
    with alphabetSize and userAlphabet; the (sequence, alphabet) pair it
    returns is then handed to self.LC along with windowSize, stepSize and
    wordSize. The result of self.LC is returned as-is.
    """

    # step 1 - map the sequence onto the requested alphabet
    collapsed_sequence, working_alphabet = self.reduce_alphabet(
        sequence, alphabetSize, userAlphabet
    )

    # step 2 - sliding window LC calculation over the mapped sequence
    lc_profile = self.LC(
        collapsed_sequence, working_alphabet, windowSize, stepSize, wordSize
    )
    return lc_profile




15 changes: 13 additions & 2 deletions localcider/sequenceParameters.py
Expand Up @@ -561,7 +561,7 @@ def get_reduced_alphabet_sequence(self, alphabetSize=20, userAlphabet={}):



def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1):
def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1, wordSize=3):

"""
Returns the linear sequence complexity as defined by complexityType. Optionally,
Expand All @@ -577,6 +577,8 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
WF - Wooton-Federhen complexity [1]
LC - Linguistic complexity
(Default = 'WF')
alphabetSize | Defines the size of the alphabet being used, where pre-defined
Expand All @@ -595,6 +597,8 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
stepSize | Size of steps taken as we define a new sliding window. Default is
1 and should probably always be used...
wordSize | Word (n-gram) length used for linguistic complexity (LC); ignored for WF. Default is 3.
OUTPUT:
--------------------------------------------------------------------------------
Returns a vector of values corresponding to the sliding window complexity of the
Expand Down Expand Up @@ -628,7 +632,7 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
"""

# set the allowed types of complexity here
allowed_types = ('WF')
allowed_types = ('WF', 'LC')

# provide case insensitivity
try:
Expand All @@ -641,7 +645,14 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
raise SequenceComplexityException("Complexity type %s is not a valid type - must be one of %s"%(complexityType, allowed_types))

if complexityType == "WF":
if not wordSize == 3:
print "WARNING: Ignoring wordSize argument for Wooton-Federhen complexity"

return self.SeqObj.get_linear_WF_complexity(alphabetSize, userAlphabet, windowSize, stepSize)

if complexityType == "LC":
return self.SeqObj.get_linear_LC_complexity(alphabetSize, userAlphabet, windowSize, stepSize, wordSize)



# ============================================ #
Expand Down

0 comments on commit 7343a6e

Please sign in to comment.