Partial fix for #591

Corpora with both spelling and transcription now load. An all-spelling corpus is still problematic.
PhonologicalCorpusTools · Nov 26, 2016 · 88ecded · 88ecded
1 parent 344d202
commit 88ecded
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 8 deletions.
diff --git a/corpustools/corpus/classes/lexicon.py b/corpustools/corpus/classes/lexicon.py
@@ -979,6 +979,13 @@ def __init__(self, update=False, **kwargs):
             self.descriptors.append('Frequency')
             self.Frequency = 0
 
+        if self._transcription_name is None:
+            for d in self.descriptors:
+                if isinstance(getattr(self,d,None), Transcription):
+                    self._transcription_name = d
+                    break
+
+
     def initDefaults(self):
         for attribute, default_value in Word.word_attributes.items():
             if isinstance(default_value, list):
@@ -1002,8 +1009,11 @@ def frequency(self):
 
     @property
     def transcription(self):
-        #return self._transcription
-        return getattr(self, self._transcription_name, self._transcription)
+        try:
+            value = getattr(self, self._transcription_name, self._transcription)
+        except (TypeError, AttributeError):
+            value = self.Transcription
+        return value
 
     @transcription.setter
     def transcription(self, value):

diff --git a/corpustools/corpus/classes/spontaneous.py b/corpustools/corpus/classes/spontaneous.py
@@ -1,7 +1,7 @@
 
 from collections import OrderedDict
 
-from .lexicon import Transcription, Corpus, Attribute
+from .lexicon import Transcription, Corpus, Attribute, Word
 
 import os
 import wave
@@ -359,7 +359,7 @@ def find_wordtype(self, wordtype):
         return list(x for x in self if x.wordtype == wordtype)
 
 
-class WordToken(object):
+class WordToken():
     """
     WordToken objects are individual productions of Words
 
@@ -434,7 +434,6 @@ def __init__(self,**kwargs):
         self._transcription = None
         self._freq_names = ['abs_freq', 'freq_per_mil', 'sfreq', 'lowercase_freq', 'log10_freq']
 
-
         for key, value in kwargs.items():
             if not all([letter.isupper() for letter in key]):
                 key = key.capitalize()

diff --git a/corpustools/corpus/io/helper.py b/corpustools/corpus/io/helper.py
@@ -454,7 +454,7 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, c
 
     discourse = Discourse(discourse_kwargs)
 
-    if not 'frequency' in [a.name.lower() for a in discourse.lexicon.attributes]:
+    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
         # running text will not have a frequency attribute supplied by the user
         # textgrids are also unlikely to have this attribute
         discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
@@ -479,6 +479,10 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, c
                 try:
                     #word_kwargs[at.output_name] = (at.attribute, annotations[at][n][0])
                     word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
+                    if at.attribute.att_type == 'tier':
+                        #word_kwargs['_transcription_name'] = at.attribute.name
+                        print('found a tier')
+                        print(at.attribute.name)
                 except IndexError:
                     #word_kwargs[at.output_name] = (at.attribute, None)
                     word_kwargs[at.attribute.name] = (at.attribute, None)

diff --git a/corpustools/corpus/io/textgrid.py b/corpustools/corpus/io/textgrid.py
@@ -20,6 +20,9 @@ class PCTTextGrid(TextGrid):
     def __init__(self):
         super().__init__()
 
+    def name_filter(self,name):
+        return name.capitalize() if not all([x.isupper() for x in name]) else name
+
     def read(self, f):
         """
         Read the tiers contained in the Praat-formatted TextGrid file
@@ -35,7 +38,7 @@ def read(self, f):
             source.readline()
             if source.readline().rstrip().split()[2] == '"IntervalTier"':
                 inam = source.readline().rstrip().split(' = ')[1].strip('"')
-                inam = inam.lower()
+                inam = self.name_filter(inam)
                 imin = round(float(source.readline().rstrip().split()[2]), 5)
                 imax = round(float(source.readline().rstrip().split()[2]), 5)
                 itie = IntervalTier(inam)
@@ -49,7 +52,7 @@ def read(self, f):
                 self.append(itie)
             else: # pointTier
                 inam = source.readline().rstrip().split(' = ')[1].strip('"')
-                inam = inam.lower()
+                inam = self.name_filter(inam)
                 imin = round(float(source.readline().rstrip().split()[2]), 5)
                 imax = round(float(source.readline().rstrip().split()[2]), 5)
                 itie = PointTier(inam)