Skip to content

Commit

Permalink
Interlinear / textgrid word identification
Browse files Browse the repository at this point in the history
Check BOTH spelling AND transcription when deciding
whether the same word already exists in the corpus. Relevant for homophones, homographs, etc.
see #787
  • Loading branch information
stannam committed Nov 16, 2021
1 parent dc8fe6b commit 77e172d
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 12 deletions.
4 changes: 3 additions & 1 deletion corpustools/corpus/classes/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -3229,7 +3229,7 @@ def add_word(self, word, allow_duplicates=False):
"""
word._corpus = self
tokens = word.wordtokens[:] # What is this doing?
tokens = word.wordtokens[:] # only becomes relevant when same spelling but different transcription?

#If the word doesn't exist, add it
try:
Expand All @@ -3245,6 +3245,8 @@ def add_word(self, word, allow_duplicates=False):
try:
check = self.find(key)
except KeyError:
if word.frequency == 0:
word.frequency += 1
self.wordlist[key] = word
break
else:
Expand Down
4 changes: 2 additions & 2 deletions corpustools/corpus/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,10 @@ def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None, feature
if not line: # blank or just a newline
continue

d = {}
d = {} # d is the dictionary to be fed as the argument of Word()
for k, v in zip(headers, line.split(delimiter)):
v = v.strip()
if k.attribute.att_type == 'tier':
if k.attribute.att_type == 'tier': # if dealing with a transcription column
trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus) # trans is a list of BaseAnnotation
if not trans_check and len(trans) > 1:
trans_check = True
Expand Down
31 changes: 24 additions & 7 deletions corpustools/corpus/io/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
elif item.end is not None:
end = item.end
curr_word.append(item)
curr_word = Transcription(curr_word)
curr_word = Transcription(curr_word) # here, combine segments as the transcription of a word
annotations[at].append((curr_word, begin, end))
curr_word = list()
else:
Expand Down Expand Up @@ -666,7 +666,7 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
add_frequency = False

ind = 0
limit = max([len(list(v)) for v in annotations.values()])
limit = max([len(list(v)) for v in annotations.values()]) # limit = number of wordtokens
for n in range(limit):
if stop_check is not None and stop_check():
return
Expand All @@ -684,12 +684,29 @@ def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, s
#annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
except IndexError:
word_kwargs[at.attribute.name] = (at.attribute, None)

if add_frequency:
word_kwargs['_freq_name'] = 'Frequency'
word = Word(**word_kwargs)
try:
word = discourse.lexicon.find(word.spelling)
if add_frequency:
word.frequency += 1
except KeyError:

existing_words = discourse.lexicon.find_all(word.spelling)
# existing_words => list of words already in the corpus that have the same spelling as 'word'

if len(existing_words) > 0:
for existing_word in existing_words:
if existing_word.Transcription == word.Transcription:
# same spelling AND same transcription => same word. so freq += 1
word = existing_word
if add_frequency:
word.frequency += 1
need_to_add = False
break
else:
need_to_add = True

# same spelling BUT different transcription => homographs, so add this as a separate entry
discourse.lexicon.add_word(word, allow_duplicates=True) if need_to_add else None
else:
discourse.lexicon.add_word(word)

word_token_kwargs = dict()
Expand Down
2 changes: 1 addition & 1 deletion corpustools/gui/iogui.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,7 +885,7 @@ def generateKwargs(self):
'but in the Parsing Preview window you did not select an orthography.')
return

if type_ != 'ilg' and type_ != 'multiple':
if type_ != 'ilg' and type_ != 'multiple' and type_ != 'textgrid':
variant_tokens = [x.is_token_base for x in kwargs['annotation_types']]
if any(variant_tokens):
QMessageBox.critical(self,
Expand Down
4 changes: 3 additions & 1 deletion corpustools/gui/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,16 +499,18 @@ def generateInventoryModel(self):
def compatibility_check(self, corpus):
update_corpus, update_inventory, update_words = False, False, False
for attribute in Corpus.corpus_attributes:
# the loaded corpus should be updated unless it has all required attributes as in the Corpus class
if not hasattr(corpus, attribute):
update_corpus = True
break
for attribute in Inventory.inventory_attributes:
# the loaded inventory should be updated unless it has all required attributes as in the Inventory class
if not hasattr(corpus.inventory, attribute):
update_inventory = True
break
word = corpus.random_word()

for attribute in Word.word_attributes:
for attribute in Word.word_attributes: # randomly pick a word and check all word_attributes
if not hasattr(word, attribute):
update_words = True
break
Expand Down

0 comments on commit 77e172d

Please sign in to comment.