Implemented linguistic complexity but may not be working quite right.…

… Need to benchmark against original code...
Pappulab · Jun 28, 2015 · 7343a6e · 7343a6e
1 parent f6172f3
commit 7343a6e
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 2 deletions.
diff --git a/localcider/backend/sequence.py b/localcider/backend/sequence.py
@@ -580,6 +580,15 @@ def get_linear_WF_complexity(self, alphabetSize=20, userAlphabet={}, windowSize=
 
         """
         return self.ComplexityObject.get_WF_complexity(self.seq, alphabetSize, userAlphabet, windowSize, stepSize)
+
+
+
+    def get_linear_LC_complexity(self, alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1, wordSize=3):
+        """
+        Sliding-window linguistic complexity (LC) profile of this sequence
+        """
+        calculator = self.ComplexityObject
+        return calculator.get_LC_complexity(self.seq, alphabetSize, userAlphabet, windowSize, stepSize, wordSize)
 
 
 

diff --git a/localcider/backend/sequenceComplexity.py b/localcider/backend/sequenceComplexity.py
@@ -388,6 +388,51 @@ def CWF(self, sequence, alphabet, windowSize, stepSize):
 
 	return CWF_array
 
+        ###########################################################
+        # calculate linguistic complexity
+        ###########################################################
+    def LC(self, sequence, alphabet, windowSize, stepSize, wordSize):
+        """
+        Returns the linguistic complexity (LC) profile of a sequence using
+        a sliding window approach.
+
+        For each window, LC is the number of distinct words of length
+        wordSize found in the window divided by the largest number of
+        distinct words the window could hold: the smaller of
+        len(alphabet)**wordSize and the (windowSize - wordSize + 1) word
+        positions in the window.
+
+        sequence   | Sliceable sequence of single-character symbols
+        alphabet   | Collection of symbols; only its length is used
+        windowSize | Number of positions in each window
+        stepSize   | Offset between the starts of consecutive windows
+        wordSize   | Length of the words (n-grams) being counted
+
+        Returns a list with one float per window. Windows which cannot
+        hold a single word score 0.0.
+        """
+
+        # a window of W positions holds W-k+1 overlapping words of size k
+        wordsPerWindow = windowSize - wordSize + 1
+
+        # vmax is the same for every window so calculate it once
+        vmax = min(len(alphabet)**wordSize, wordsPerWindow)
+
+        LC_array = []
+
+        for start in range(0, len(sequence) - windowSize + 1, stepSize):
+
+            # distinct words in this window (the set handles duplicates)
+            ngrams = set()
+            for i in range(start, start + wordsPerWindow):
+                ngrams.add(''.join(sequence[i:i + wordSize]))
+
+            # guard against windows shorter than wordSize (vmax <= 0)
+            LC_array.append(float(len(ngrams))/vmax if vmax > 0 else 0.0)
+
+        return LC_array
+
+
 
     def get_WF_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSize=10,stepSize=1):
 
@@ -397,5 +442,13 @@ def get_WF_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSi
         return self.CWF(reduced_sequence, alphabet, windowSize, stepSize)
 
 
+    def get_LC_complexity(self, sequence, alphabetSize=20, userAlphabet={}, windowSize=10,stepSize=1,wordSize=3):
+        """
+        Reduces the sequence to the requested alphabet and returns its LC profile
+        """
+        seq, symbols = self.reduce_alphabet(sequence, alphabetSize, userAlphabet)
+        return self.LC(seq, symbols, windowSize, stepSize, wordSize)
+
+
 
 
diff --git a/localcider/sequenceParameters.py b/localcider/sequenceParameters.py
@@ -561,7 +561,7 @@ def get_reduced_alphabet_sequence(self, alphabetSize=20, userAlphabet={}):
 
 
 
-    def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1):
+    def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabet={}, windowSize=10, stepSize=1, wordSize=3):
 
         """
         Returns the linear sequence complexity as defined by complexityType. Optionally,
@@ -577,6 +577,8 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
 
                          WF - Wooton-Federhen complexity [1]
 
+                         LC - Linguistic complexity
+
                          (Default = 'WF')
 
         alphabetSize   | Defines the size of the alphabet being used, where pre-defined 
@@ -595,6 +597,8 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
         stepSize       | Size of steps taken as we define a new sliding window. Default is
                          1 and should probably always be used...
 
+        wordSize       | Word (n-gram) length for LC complexity; ignored for WF. Default is 3
+
         OUTPUT:
         --------------------------------------------------------------------------------
         Returns a vector of values corresponding to the sliding window complexity of the
@@ -628,7 +632,7 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
         """
 
         # set the allowed types of complexity here
-        allowed_types = ('WF')
+        allowed_types = ('WF', 'LC')
 
         # provide case insensitivity 
         try:
@@ -641,7 +645,14 @@ def get_linearComplexity(self, complexityType="WF", alphabetSize=20, userAlphabe
             raise SequenceComplexityException("Complexity type %s is not a valid type - must be one of %s"%(complexityType, allowed_types))
 
         if complexityType == "WF":
+            if not wordSize == 3:
+                print "WARNING: Ignoring wordSize argument for Wooton-Federhen complexity"
+
             return self.SeqObj.get_linear_WF_complexity(alphabetSize, userAlphabet, windowSize, stepSize)
+
+        if complexityType == "LC":
+            return self.SeqObj.get_linear_LC_complexity(alphabetSize, userAlphabet, windowSize, stepSize, wordSize)
+
 
 
     # ============================================ #