In [1]:
import pandas as pd
import re

In [2]:
data = pd.read_excel('Ruuli_dictionary.xlsx')

# NOUNS

# 1. Check Noun Class Correspondence

In [3]:
# Collect every noun-class label that occurs in the dictionary's NC column.
# Missing values are dropped explicitly: slicing `list(set(...))[1:]` relied on NaN
# happening to be the first element of an unordered set, which is not guaranteed.
dct_nc = {
    class_num
    for marker in data.NC.dropna().unique()
    if isinstance(marker, str)  # re.findall needs text; skip stray numeric cells
    for class_num in re.findall(r'\d{1,2}\w?', marker)
}

In [4]:
# Class labels to compare against the dictionary: the plain classes 1-16
# plus the lettered subclasses listed here.
an_nc = {str(number) for number in range(1, 17)} | {'1a', '3a', '5a', '5b', '9a', '10a'}

In [5]:
# Labels in an_nc that never occur in the dictionary's NC column.
an_nc - dct_nc

{'10a', '3a', '5b', '9a'}

In [6]:
classes = '''<1><n>:mu[1]
<2><n>:ba[2]
<3><n>:mu[3]
<4><n>:mi[4]
<5><n>:i[5]
<5><n>:li[5b]
<6><n>:ma[6]
<7><n>:ki[7]
<8><n>:bi[8]
<9><n>:n[9]
<10><n>:n[10]
<11><n>:lu[11]
<12><n>:ka[12]
<13><n>:tu[13]
<14><n>:bu[14]
<15><n>:ku[15]
<16><n>:wa[16]
<20><n>:gu[20]
<22><n>:ga[22]'''

# Map each class label to its prefix. '1a' and '5a' are registered with an empty prefix;
# every other entry is read from a line of the form "<5><n>:li[5b]", where the prefix
# sits between ':' and '[' and the label sits inside the square brackets.
class2prefix = {'1a': '', '5a': ''}
for line in classes.splitlines():
    class_num = re.search(r'\[\d{1,2}\w?\]', line).group()[1:-1]
    prefix = re.search(r':[a-z]{1,2}\[', line).group()[1:-1]
    class2prefix[class_num] = prefix

In [7]:
# Display the class -> prefix table built in the previous cell for a visual check.
class2prefix

{'1a': '',
 '5a': '',
 '1': 'mu',
 '2': 'ba',
 '3': 'mu',
 '4': 'mi',
 '5': 'i',
 '5b': 'li',
 '6': 'ma',
 '7': 'ki',
 '8': 'bi',
 '9': 'n',
 '10': 'n',
 '11': 'lu',
 '12': 'ka',
 '13': 'tu',
 '14': 'bu',
 '15': 'ku',
 '16': 'wa',
 '20': 'gu',
 '22': 'ga'}

# 2. Verifier of Non-mentioned Classes

In [8]:
def isclass(word, class_num, class2prefix=class2prefix):
    """Try to split a noun into its class prefix and the remainder.

    The word is lower-cased and every 'r' is replaced by 'l' before matching. The prefix
    registered for ``class_num`` in ``class2prefix`` is then tried in several shapes, in
    this order:

    1. adjusted to the letter that follows it in the word: final u/o becomes w before
       i/o/e/a, final i becomes y before o/e/a, and n becomes m before b/p;
    2. with its final a/i/y replaced by the e/o/i that follows it in the word;
    3. with a final a dropped when the word has e/o/i in that position;
    4. for class '5' only: the '5b' prefix (or 'ly' when o/e/a follows);
    5. for classes 1, 3, 5, 9 and 10: the whole word is returned under the class
       label with an 'a' appended.

    Parameters:
        word: the noun to analyse.
        class_num: class label as a string, e.g. '7' or '5b'.
        class2prefix: mapping from class label to prefix.

    Returns:
        ``(remainder, class_label)`` on success, or ``None`` when no shape matches or
        ``class_num`` has no entry in ``class2prefix``.
    """
    word = word.lower().replace('r', 'l')
    prefix = class2prefix.get(class_num)
    if prefix is None:
        # Unknown class label: report "no match" instead of raising KeyError.
        return None

    def letter_after(pref):
        # Letter right behind a prefix of this length; '' when the word is too short,
        # so short words fail the membership tests instead of raising IndexError.
        return word[len(pref):len(pref) + 1]

    if prefix and prefix[-1] in ('u', 'o') and letter_after(prefix) in ('i', 'o', 'e', 'a'):
        prefix = prefix[:-1] + 'w'
    elif prefix and prefix[-1] == 'i' and letter_after(prefix) in ('o', 'e', 'a'):
        prefix = prefix[:-1] + 'y'
    elif prefix == 'n' and letter_after(prefix) in ('b', 'p'):
        prefix = 'm'
    if word[:len(prefix)] == prefix:
        return word[len(prefix):], class_num

    # An empty prefix always matches above, so `prefix` is non-empty from here on.
    if prefix[-1] in ('a', 'i', 'y') and letter_after(prefix) in ('e', 'o', 'i'):
        prefix = prefix[:-1] + letter_after(prefix)
    if word[:len(prefix)] == prefix:
        return word[len(prefix):], class_num

    if prefix[-1] == 'a' and word[len(prefix) - 1:len(prefix)] in ('e', 'o', 'i'):
        prefix = prefix[:-1]
    if word[:len(prefix)] == prefix:
        return word[len(prefix):], class_num

    if class_num == '5':
        prefix = class2prefix['5b']
        if letter_after(prefix) in ('o', 'e', 'a'):
            prefix = 'ly'
        if word[:len(prefix)] == prefix:
            return word[len(prefix):], '5b'

    if class_num in ('1', '3', '5', '9', '10'):
        return word, class_num + 'a'

    return None

In [9]:
# Spot-check isclass on a handful of lemmas; one result per output line.
print(*(isclass(lemma, noun_class) for lemma, noun_class in (
    ('koomi', '7'),
    ('iralu', '5'),
    ('riiso', '5'),
    ('lyanda', '5'),
    ('mwozi', '1'),
    ('kiigo', '12'),
    ('yindi', '1'),
)), sep='\n')

('omi', '7')
('lalu', '5')
('iso', '5b')
('anda', '5b')
('ozi', '1')
('igo', '12')
('yindi', '1a')


# Meaning Finder

In [10]:
def getmeaning(sense):
    """Derive a short label for a noun from its dictionary sense text.

    Processing steps:
      1. parenthesised remarks are removed (with one preceding space, if any);
      2. every "word. " sequence (a word followed by a period and a space) is removed;
      3. the remaining text is split into alternatives: runs of word characters,
         spaces, hyphens and apostrophes that start and end with a word character;
      4. a leading "the " is stripped from each alternative;
      5. the first alternative is taken, and a later one replaces it only when it is
         shorter than the label chosen so far.

    In the returned label spaces become underscores and hyphens/apostrophes are dropped.

    Returns:
        The label, or ``None`` when ``sense`` is not a string or contains no alternative
        (an alternative needs at least two word characters).
    """
    if not isinstance(sense, str):
        # Empty spreadsheet cells arrive as NaN (a float), not as text.
        return None
    # `[ ]?` makes the leading space optional so that a remark at the very start of the
    # sense, e.g. "(pl.) children", is removed as well.
    sense = re.sub(r'[ ]?\(.+?\)', '', sense)
    sense = re.sub(r'\w+\. ', '', sense)
    out = None
    for alternative in re.findall(r'[\w][\w\-\' ]*[\w]', sense):
        var = alternative[4:] if alternative[:4] == 'the ' else alternative
        if out is None or len(var) < len(out):
            out = var.replace(' ', '_').replace('-', '').replace("'", '')
    return out

In [11]:
# Spot-check getmeaning on a few sense strings; one label per output line.
print(*(getmeaning(sample) for sample in (
    'cracked heel, heel fissure',
    'next day, the following day, the day after',
    'way of worshipping, worshipping style',
    'to pull (a trigger), to release (an arrow), to unsnap, to release (a trap)',
    'Lat. Acalypha bipartita (type of plant)',
)), sep='\n')

cracked_heel
next_day
worshipping_style
to_pull
Acalypha_bipartita


In [12]:
# Export the noun entries of the dictionary.
# Every usable row becomes a line "<meaning>:root[tags]" in noundict.txt, where the tags are
# the class labels plus, where applicable, 'mass' and 'naug'. Rows that cannot be processed
# are reported in errors.txt (which is started afresh here) and skipped.
# Both files are opened once for the whole run instead of once per row.
with open('noundict.txt', 'w', encoding='utf-8') as f, \
        open('errors.txt', 'w', encoding='utf-8') as err:
    # Fixed entries for classes 17, 18 and 23, written before the dictionary rows.
    f.write('''<17.loc>:ku[17]
<18.loc>:mu[18]
<23.loc>:e[23]
''')

    for entry in data[data.pos == 'n'][['Lemma', 'citation', 'NC', 'Sense']].itertuples():

        if not isinstance(entry.NC, str):
            err.write(f'ERROR: NO NC. For lemma "{entry.Lemma}" noun class was not found;\n')
            continue

        if not isinstance(entry.Sense, str):
            err.write(f'ERROR: NO MEANING. For lemma "{entry.Lemma}" no meaning was found;\n')
            continue

        classes = re.findall(r'\d{1,2}\w?', entry.NC)

        # A marker without any class number is rejected too; `classes[0]` below needs
        # at least one label.
        if not 1 <= len(classes) <= 2:
            err.write(f'ERROR: INVALID NC. For lemma "{entry.Lemma}" noun class marker is not appropriate (only 1 '
                      'or 2 classes per marker supported);\n')
            continue

        if not isinstance(entry.Lemma, str):
            # Empty lemma cell (NaN): nothing to analyse.
            err.write(f'ERROR: WRONG LEMMA. Lemma "{entry.Lemma}" is not supported;\n')
            continue

        try:
            analyzed = isclass(entry.Lemma, classes[0])
        except (KeyError, IndexError):
            # Class label without a registered prefix, or a lemma too short to inspect:
            # report it as an unsupported prefix instead of aborting the export.
            analyzed = None

        # A remainder that is empty once spaces are removed means the lemma is nothing but
        # the prefix; `root[0]` further down would fail on it.
        if not analyzed or not analyzed[0].replace(' ', ''):
            err.write(f'ERROR: UNSUPPORTED PREFIX. For lemma "{entry.Lemma}" of noun class {entry.NC} prefix '
                      'correspondence was not detected;\n')
            continue

        root, class_num = analyzed

        # isclass may refine the first class (e.g. '5' -> '5b'); the refined label replaces
        # the original one and is placed at the end of the list.
        if classes[0] != class_num:
            classes.remove(classes[0])
            classes.append(class_num)

        if class_num == '9a' and '10' in classes:
            classes.remove('10')
            classes.append('10a')

        sense = getmeaning(entry.Sense)

        if not isinstance(sense, str):
            # The sense text yielded no usable label.
            err.write(f'ERROR: NO MEANING. For lemma "{entry.Lemma}" no meaning was found;\n')
            continue

        # Single-class nouns in the listed classes whose label starts with a lowercase
        # letter get the 'mass' tag.
        if len(classes) == 1 and sense[0].islower() and classes[0] in ('1a', '3', '4', '5', '6',
                                                                        '8', '9', '10', '11', '14'):
            classes.append('mass')

        # With a citation form: 'naug' unless it starts with '(a)', '(o)' or '(e)'.
        # Without one: 'naug' for 'a'-subclass nouns whose root starts with 'a'.
        if isinstance(entry.citation, str):
            if entry.citation[:3] not in ('(a)', '(o)', '(e)'):
                classes.append('naug')
        else:
            if class_num[-1] == 'a' and root[0] == 'a':
                classes.append('naug')

        f.write(f'<{sense}>:{root.replace(" ", "")}[{",".join(classes)}]\n')

        # Nouns tagged with both 5 and 6 whose root starts with 'y' get a second entry
        # without that 'y'.
        if root[0] == 'y' and '5' in classes and '6' in classes:
            f.write(f'<{sense}>:{root[1:].replace(" ", "")}[{",".join(classes)}]\n')

In [13]:
# Noun entry format written to noundict.txt: <meaning>:root[class(es),mass,naug]

# VERBS

In [14]:
def getinf(npfv, pfv):
    """Build the verb-dictionary entries for a lemma and its perfective form.

    Parameters:
        npfv: the lemma; its last character is treated as the final vowel.
        pfv: the perfective form.

    Returns:
        A pair ``(stem_entry, pfv_entry)``:

        * perfective = stem + 'ire'/'ere': ``((stem, 'i' or 'e'), ())``;
        * stem without any of a/o/u/i/e: ``((npfv, ('i', '1sy')), ())`` (the whole
          lemma is kept);
        * otherwise: ``((stem, cls), (pfv, 'pfv'))`` where ``cls`` is 'e' when the last
          vowel of the stem is e/o and 'i' when it is i/u/a.
    """
    stem = npfv[:-1]
    ending = pfv[-3:]

    # Perfective built from the stem plus -ire/-ere: the ending's first letter is the class.
    if pfv[:-3] == stem and ending in ('ire', 'ere'):
        return ((stem, ending[0]), ())

    # No vowel in the stem at all.
    if not any(vowel in stem for vowel in 'aouie'):
        return ((npfv, ('i', '1sy')), ())

    # The class follows the last vowel of the stem; one exists, as checked just above.
    cls = next('e' if char in ('e', 'o') else 'i'
               for char in reversed(stem)
               if char in ('e', 'o', 'i', 'u', 'a'))
    return ((stem, cls), (pfv, 'pfv'))

In [15]:
def getmeaning(sense):
    """Derive a short label for a verb from its dictionary sense text.

    NOTE: this redefines the ``getmeaning`` of the nouns section; the difference is that a
    leading "to " (rather than "the ") is stripped from each alternative.

    Parenthesised remarks and "word. " sequences are removed, the rest is split into
    alternatives (runs of word characters, spaces, hyphens and apostrophes that start and
    end with a word character), and the first alternative is kept unless a later one is
    shorter than the label chosen so far. In the label spaces become underscores and
    hyphens/apostrophes are dropped.

    Returns:
        The label, or ``None`` when ``sense`` is not a string or contains no alternative.
    """
    if not isinstance(sense, str):
        # Empty spreadsheet cells arrive as NaN (a float); callers test the result with
        # isinstance(..., str), so "no label" is the right answer here, not a TypeError.
        return None
    sense = re.sub(r'[ ]?\(.+?\)', '', sense)
    sense = re.sub(r'\w+\. ', '', sense)
    out = None
    for alternative in re.findall(r'[\w][\w\-\' ]*[\w]', sense):
        var = alternative[3:] if alternative[:3] == 'to ' else alternative
        if out is None or len(var) < len(out):
            out = var.replace(' ', '_').replace('-', '').replace("'", '')
    return out

In [16]:
# Export the verb entries of the dictionary to verbdict.txt as "<meaning>:stem[tags]".
# Rows that cannot be processed are appended to errors.txt and skipped.
# Both files are opened once for the whole run instead of once per row.
with open('verbdict.txt', 'w', encoding='utf-8') as f, \
        open('errors.txt', 'a', encoding='utf-8') as err:
    for entry in data[data.pos == 'v'][['Lemma', 'PFV', 'Sense']].itertuples():
        if not isinstance(entry.Lemma, str) or not entry.Lemma:
            # Empty lemma cell (NaN) or empty string: indexing the last letter would fail.
            err.write(f'ERROR: WRONG LEMMA. Lemma "{entry.Lemma}" is not supported;\n')
            continue

        if entry.Lemma[-1] != 'a':
            err.write(f'ERROR: UNSUPPORTED LEMMA. For lemma "{entry.Lemma}" final vowel cannot be detected;\n')
            continue

        # An empty sense cell is NaN, not text; treat it as "no meaning".
        sense = getmeaning(entry.Sense) if isinstance(entry.Sense, str) else None

        if not isinstance(sense, str):
            err.write(f'ERROR: NO MEANING. For lemma "{entry.Lemma}" no meaning was found;\n')
            continue

        if not entry.Lemma.isalpha():
            err.write(f'ERROR: WRONG LEMMA. Lemma "{entry.Lemma}" is not supported;\n')
            continue

        if not isinstance(entry.PFV, str) or not entry.PFV.isalpha():
            # No usable perfective form: derive the class from the last vowel of the stem.
            npfv = entry.Lemma[:-1]
            cls = None  # reset per row so a value from an earlier row is never reused
            for char in npfv[::-1]:
                if char in ('e', 'o'):
                    cls = 'e'
                    break
                if char in ('i', 'u', 'a'):
                    cls = 'i'
                    break
            if cls is None:
                # Stem without a vowel: same treatment as in getinf, i.e. keep the whole
                # lemma and tag it 'i,1sy'.
                f.write(f'<{sense}>:{entry.Lemma}[i,1sy]\n')
            else:
                f.write(f'<{sense}>:{npfv}[{cls}]\n')
        else:
            npfv, pfv = getinf(entry.Lemma, entry.PFV)
            f.write(f'<{sense}>:{npfv[0]}[{",".join(npfv[1])}]\n')
            if pfv != ():
                f.write(f"<{sense}>:{pfv[0]}[pfv]\n")

# ADJECTIVES

In [17]:
def getmeaning(sense):
    """Derive a short label from a dictionary sense text (no article/particle stripping).

    NOTE: this redefines the ``getmeaning`` used in the earlier sections; here the
    alternatives are used as they are.

    Parenthesised remarks and "word. " sequences are removed, the rest is split into
    alternatives (runs of word characters, spaces, hyphens and apostrophes that start and
    end with a word character), and the first alternative is kept unless a later one is
    shorter than the label chosen so far. In the label spaces become underscores and
    hyphens/apostrophes are dropped.

    Returns:
        The label, or ``None`` when ``sense`` is not a string or contains no alternative.
    """
    if not isinstance(sense, str):
        # Empty spreadsheet cells arrive as NaN (a float); callers test the result with
        # isinstance(..., str), so "no label" is the right answer here, not a TypeError.
        return None
    sense = re.sub(r'[ ]?\(.+?\)', '', sense)
    sense = re.sub(r'\w+\. ', '', sense)
    out = None
    for alternative in re.findall(r'[\w][\w\-\' ]*[\w]', sense):
        if out is None or len(alternative) < len(out):
            out = alternative.replace(' ', '_').replace('-', '').replace("'", '')
    return out

In [18]:
# Export the adjective entries of the dictionary to adjdict.txt as "<meaning>:lemma".
# Rows that cannot be processed are appended to errors.txt and skipped.
# Both files are opened once for the whole run instead of once per row.
with open('adjdict.txt', 'w', encoding='utf-8') as f, \
        open('errors.txt', 'a', encoding='utf-8') as err:
    for entry in data[data.pos == 'adj'][['Lemma', 'Sense']].itertuples():
        # An empty lemma cell is NaN (a float) and has no .isalpha(); reject it like any
        # other unsupported lemma.
        if not isinstance(entry.Lemma, str) or not entry.Lemma.isalpha():
            err.write(f'ERROR: WRONG LEMMA. Lemma "{entry.Lemma}" is not supported;\n')
            continue

        # An empty sense cell is NaN, not text; treat it as "no meaning".
        sense = getmeaning(entry.Sense) if isinstance(entry.Sense, str) else None
        if not isinstance(sense, str):
            err.write(f'ERROR: NO MEANING. For lemma "{entry.Lemma}" no meaning was found;\n')
            continue

        f.write(f'<{sense}>:{entry.Lemma}\n')

# NON-INFLECTING PARTS OF SPEECH

In [19]:
def getmeaning(sense):
    """Derive a short label from a dictionary sense text (no article/particle stripping).

    NOTE: this redefines the ``getmeaning`` used in the earlier sections; the alternatives
    are used as they are.

    Parenthesised remarks and "word. " sequences are removed, the rest is split into
    alternatives (runs of word characters, spaces, hyphens and apostrophes that start and
    end with a word character), and the first alternative is kept unless a later one is
    shorter than the label chosen so far. In the label spaces become underscores and
    hyphens/apostrophes are dropped.

    Returns:
        The label, or ``None`` when ``sense`` is not a string or contains no alternative.
    """
    if not isinstance(sense, str):
        # Empty spreadsheet cells arrive as NaN (a float); callers test the result with
        # isinstance(..., str), so "no label" is the right answer here, not a TypeError.
        return None
    sense = re.sub(r'[ ]?\(.+?\)', '', sense)
    sense = re.sub(r'\w+\. ', '', sense)
    out = None
    for alternative in re.findall(r'[\w][\w\-\' ]*[\w]', sense):
        if out is None or len(alternative) < len(out):
            out = alternative.replace(' ', '_').replace('-', '').replace("'", '')
    return out

In [20]:
# Dictionary part-of-speech label -> tag written to restdict.txt.
conv = {'adv': 'adv',
       'conj': 'conj',
       'ideo': 'ideo',
       'interj': 'intj',
       'interrog': 'inter',
       'num': 'num',
       'part': 'part',
       'prep': 'prep',
       'pro': 'pro',
       'quant': 'q'}

# Export the entries of the parts of speech listed in `conv` to restdict.txt as
# "<meaning><tag>:form". Rows that cannot be processed are appended to errors.txt and skipped.
# Both files are opened once for the whole run instead of once per row.
with open('restdict.txt', 'w', encoding='utf-8') as f, \
        open('errors.txt', 'a', encoding='utf-8') as err:
    for entry in data[data.pos.isin(list(conv))][['Lemma', 'pos', 'Sense']].itertuples():
        # Strip the surrounding '(', '-' and ')' BEFORE validating. Checking the raw lemma
        # with isalpha() first rejected every lemma this clean-up was written for, which
        # made the clean-up dead code. An empty lemma cell (NaN) yields '' and is rejected.
        form = entry.Lemma.strip('(').strip('-').strip(')') if isinstance(entry.Lemma, str) else ''
        if not form.isalpha():
            err.write(f'ERROR: WRONG LEMMA. Lemma "{entry.Lemma}" is not supported;\n')
            continue

        # An empty sense cell is NaN, not text; treat it as "no meaning".
        sense = getmeaning(entry.Sense) if isinstance(entry.Sense, str) else None
        if not isinstance(sense, str):
            err.write(f'ERROR: NO MEANING. For lemma "{entry.Lemma}" no meaning was found;\n')
            continue

        pos = conv[entry.pos]
        f.write(f'<{sense}><{pos}>:{form}\n')