Interlinear / textgrid word identification

Check BOTH spelling AND transcription when deciding whether the same word already exists in the corpus. Relevant for homophones, homographs, etc.; see #787.
PhonologicalCorpusTools · Nov 16, 2021 · 77e172d · 77e172d
1 parent dc8fe6b
commit 77e172d
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 12 deletions.
diff --git a/corpustools/corpus/classes/lexicon.py b/corpustools/corpus/classes/lexicon.py
@@ -3229,7 +3229,7 @@ def add_word(self, word, allow_duplicates=False):
 
         """
         word._corpus = self
-        tokens = word.wordtokens[:]  # What is this doing?
+        tokens = word.wordtokens[:]  # only becomes relevant when same spelling but different transcription?
 
         #If the word doesn't exist, add it
         try:
@@ -3245,6 +3245,8 @@ def add_word(self, word, allow_duplicates=False):
                     try:
                         check = self.find(key)
                     except KeyError:
+                        if word.frequency == 0:
+                            word.frequency += 1
                         self.wordlist[key] = word
                         break
             else:

diff --git a/corpustools/corpus/io/csv.py b/corpustools/corpus/io/csv.py
@@ -230,10 +230,10 @@ def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None, feature
             if not line:  # blank or just a newline
                 continue
 
-            d = {}
+            d = {}      # d is the dictionary to be fed as the argument of Word()
             for k, v in zip(headers, line.split(delimiter)):
                 v = v.strip()
-                if k.attribute.att_type == 'tier':
+                if k.attribute.att_type == 'tier':      # if dealing with a transcription column
                     trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus)  # trans is a list of BaseAnnotation
                     if not trans_check and len(trans) > 1:
                         trans_check = True

diff --git a/corpustools/corpus/io/helper.py b/corpustools/corpus/io/helper.py
@@ -613,7 +613,7 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
                 elif item.end is not None:
                     end = item.end
                     curr_word.append(item)
-                    curr_word = Transcription(curr_word)
+                    curr_word = Transcription(curr_word)    # here, combine segments as the transcription of a word
                     annotations[at].append((curr_word, begin, end))
                     curr_word = list()
         else:
@@ -666,7 +666,7 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
         add_frequency = False
 
     ind = 0
-    limit = max([len(list(v)) for v in annotations.values()])
+    limit = max([len(list(v)) for v in annotations.values()])  # limit = number of wordtokens
     for n in range(limit):
         if stop_check is not None and stop_check():
             return
@@ -684,12 +684,29 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
                     #annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                 except IndexError:
                     word_kwargs[at.attribute.name] = (at.attribute, None)
+
+        if add_frequency:
+            word_kwargs['_freq_name'] = 'Frequency'
         word = Word(**word_kwargs)
-        try:
-            word = discourse.lexicon.find(word.spelling)
-            if add_frequency:
-                word.frequency += 1
-        except KeyError:
+
+        existing_words = discourse.lexicon.find_all(word.spelling)
+        # existing_words => list of words already in the corpus that have the same spelling as 'word'
+
+        if len(existing_words) > 0:
+            for existing_word in existing_words:
+                if existing_word.Transcription == word.Transcription:
+                    # same spelling AND same transcription => same word. so freq += 1
+                    word = existing_word
+                    if add_frequency:
+                        word.frequency += 1
+                    need_to_add = False
+                    break
+                else:
+                    need_to_add = True
+
+            # same spelling BUT different transcription => homographs, so add this as a separate entry
+            discourse.lexicon.add_word(word, allow_duplicates=True) if need_to_add else None
+        else:
             discourse.lexicon.add_word(word)
 
         word_token_kwargs = dict()

diff --git a/corpustools/gui/iogui.py b/corpustools/gui/iogui.py
@@ -885,7 +885,7 @@ def generateKwargs(self):
                                          'but in the Parsing Preview window you did not select an orthography.')
                     return
 
-        if type_ != 'ilg' and type_ != 'multiple':
+        if type_ != 'ilg' and type_ != 'multiple' and type_ != 'textgrid':
             variant_tokens = [x.is_token_base for x in kwargs['annotation_types']]
             if any(variant_tokens):
                 QMessageBox.critical(self,

diff --git a/corpustools/gui/main.py b/corpustools/gui/main.py
@@ -499,16 +499,18 @@ def generateInventoryModel(self):
     def compatibility_check(self, corpus):
         update_corpus, update_inventory, update_words = False, False, False
         for attribute in Corpus.corpus_attributes:
+            # the loaded corpus should be updated unless it has all required attributes as in the Corpus class
             if not hasattr(corpus, attribute):
                 update_corpus = True
                 break
         for attribute in Inventory.inventory_attributes:
+            # the loaded inventory should be updated unless it has all required attributes as in the Inventory class
             if not hasattr(corpus.inventory, attribute):
                 update_inventory = True
                 break
         word = corpus.random_word()
 
-        for attribute in Word.word_attributes:
+        for attribute in Word.word_attributes:      # randomly pick a word and check all word_attributes
             if not hasattr(word, attribute):
                 update_words = True
                 break