### Imports

In [72]:
import os

import pandas as pd

from utils import NounParser,VerbParser,AdjectiveNumeralParser

### Functions

In [130]:
# Module-level parser instances shared by parse_data() and by the per-category
# cells further down (which re-create some of them under the same names).
# All three classes come from the project-local `utils` module.
noun_parser=NounParser()
verb_parser=VerbParser()
adj_num_parser=AdjectiveNumeralParser()


def prior_forms(row):
    """
    Return the unparsed form of a row so it can sit next to the parsed one.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose "POS"; "SINGULAR" is read for nouns, numerals and
        adjectives, "FORM" is read for verbs.

    Returns
    -------
    str or None
        * POS missing                      -> None
        * noun / numeral / adjective       -> SINGULAR if it is a string, else ""
        * verb                             -> FORM, or None when FORM is missing
        * any other POS                    -> None
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos in ("noun", "numeral", "adjective"):
        singular = row["SINGULAR"]
        # Non-string cells (typically NaN) are reported as an empty string.
        return singular if isinstance(singular, str) else ""

    if pos == "verb":
        form = row["FORM"]
        return None if pd.isna(form) else form

    # Every other part of speech has no "prior" form recorded.
    return None




def _apply_pipeline(value, steps):
    """Feed ``value`` through each callable in ``steps``, in order, and return the result."""
    for step in steps:
        value = step(value)
    return value


def parse_data(row):
    """
    Parse the form stored in a row according to its part of speech.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose "POS"; "SINGULAR" is read for nouns, numerals, adjectives,
        pronouns and "other"; "FORM" is read for verbs.

    Returns
    -------
    str or None
        * POS missing                -> None
        * noun                       -> None if SINGULAR is missing, otherwise the
                                        noun pipeline applied to SINGULAR stripped
                                        of "()/_" characters at both ends
        * verb                       -> None if FORM is missing, otherwise the verb
                                        pipeline applied to FORM stripped of ")(_"
        * numeral / adjective        -> the adjective/numeral pipeline applied to
                                        SINGULAR (non-string cells are treated as "")
        * pronoun / other            -> SINGULAR returned unchanged
        * any other POS              -> None

    Notes
    -----
    The parser objects (``noun_parser``, ``verb_parser``, ``adj_num_parser``) are
    module-level globals and are only looked up once a form actually needs parsing.
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos == "noun":
        noun = row["SINGULAR"]
        if pd.isna(noun):
            return None
        steps = [
            noun_parser.existing_parses,
            noun_parser.first_parse_durationals,
            noun_parser.second_parse_durationals,
            noun_parser.cvcv_segmentation,
            noun_parser.nasalized_stops,
            noun_parser.hyphen_space,
            noun_parser.identified_suffixes,
        ]
        # Nouns ending in "y" or "yⁿ" first go through the adjective parser's
        # y-suffix rule. The test is made on the raw cell (before stripping), as
        # before. Using a tuple of suffixes avoids indexing noun[-2], which raised
        # IndexError when the cell was the single character "ⁿ".
        if noun.endswith(("y", "yⁿ")):
            steps.insert(0, adj_num_parser.y_suffixes)
        return _apply_pipeline(noun.strip("()/_"), steps)

    if pos == "verb":
        verb = row["FORM"]
        if pd.isna(verb):
            return None
        return _apply_pipeline(
            verb.strip(")(_"),
            [
                verb_parser.existing_parses,
                verb_parser.first_parse_durationals,
                verb_parser.second_parse_durationals,
                verb_parser.segment_cvcs,
                verb_parser.post_editing_short_strings,
            ],
        )

    if pos == "numeral" or pos == "adjective":
        singular = row["SINGULAR"]
        # Missing / non-string cells are parsed as the empty string. The former
        # pd.isna() check after this coercion could never be true and was removed.
        adjective_numeral = singular if isinstance(singular, str) else ""
        return _apply_pipeline(
            adjective_numeral.strip("()/_"),
            [
                adj_num_parser.existing_parses,
                adj_num_parser.first_parse_durationals,
                adj_num_parser.second_parse_durationals,
                adj_num_parser.isolating_suffixes,
                adj_num_parser.y_suffixes,
                adj_num_parser.replace_hyphens_keep_last,
                adj_num_parser.switch_hyphen_position,
                adj_num_parser.miscellaneous,
            ],
        )

    if pos in ("pronoun", "other"):
        # These categories are passed through without segmentation.
        return row["SINGULAR"]

    return None

### Loading data

In [131]:
# Display every row of a frame/series instead of truncating long outputs.
pd.set_option('display.max_rows', None)

# NOTE(review): absolute, machine-specific directory. The relative file names
# used below ("data.tsv") and later ("second_parse.csv") resolve against it.
path = r"D:\ERC_Bang\Tasks\data_cleaning"
os.chdir(path)

# Columns removed right after loading.
list_to_drop = [
    "FRENCH",
    "ENGLISH_SHORT",
    "FRENCH_SHORT",
    "ENGLISH_CATEGORY",
    "FRENCH_CATEGORY",
    "PARSED FORM",
    "MCF",
    "RECONSTRUCTION",
    "NOTE",
    "NOTES",
    "Unnamed: 18",
    "Unnamed: 19",
    "Unnamed: 20",
    "Unnamed: 21",
    "COGID",
    "COGIDS",
    "Unnamed: 24",
]

# Tab-separated, UTF-8 encoded source table, minus the columns listed above.
data = pd.read_csv("data.tsv", sep="\t", encoding="utf-8").drop(columns=list_to_drop)

In [132]:
# Row-wise: store each row's unparsed form next to the data.
data["BEFORE_PARSE"] = data.apply(prior_forms, axis="columns")

In [133]:
# Preview the first ten unparsed forms.
data["BEFORE_PARSE"].head(10)

0       jáwdì-m
1       dá:bà-m
2      àrsɛ̌:-m
3       kɔ́mbɔ̀
4      bɛ́lɛ̀-g
5     bɛ́lɛ̀-gù
6         bɛ̀lú
7         bɛ̀lú
8    gàr sɛ̀gɛ́
9      à sɛ̀gɛ́
Name: BEFORE_PARSE, dtype: object

In [134]:
# Row-wise: run the POS-specific parsing pipeline and store the result.
data["PARSED"] = data.apply(parse_data, axis="columns")

In [135]:
# Preview the first ten parsed forms.
data["PARSED"].head(10)

0       jáw-dì-m
1        dáábà-m
2       àrsɛ̌ɛ-m
3       kɔ́-mbɔ̀
4       bɛ́lɛ̀-g
5      bɛ́lɛ̀-gù
6          bɛ̀lú
7          bɛ̀lú
8    gàr sɛ̀-gɛ́
9      à sɛ̀-gɛ́
Name: PARSED, dtype: object

In [136]:
# Write the frame (including BEFORE_PARSE and PARSED) to the current working
# directory as comma-separated values, without the row index.
data.to_csv("second_parse.csv", index=False)

### Nouns

In [68]:
# All non-missing SINGULAR cells.
# NOTE(review): this is not filtered by POS, so it also contains the SINGULAR
# forms of non-noun rows -- confirm that is intended.
nouns = data["SINGULAR"].dropna()

In [69]:
# Fresh NounParser for the noun-only experiment below (rebinds the module-level
# name that parse_data() also uses).
noun_parser=NounParser()

In [70]:
# Stand-alone noun pipeline, one .apply per parsing stage.
# Forms ending in "y" or "yⁿ" first go through the adjective parser's y-suffix
# rule. A tuple of suffixes is used instead of indexing x[-2], which raised
# IndexError when a cell was the single character "ⁿ".
# NOTE(review): unlike parse_data(), this chain neither strips "()/_" nor calls
# noun_parser.existing_parses -- confirm whether that difference is intended.
nouns_parsed = (
    nouns
    .apply(lambda form: adj_num_parser.y_suffixes(form) if form.endswith(("y", "yⁿ")) else form)
    .apply(noun_parser.first_parse_durationals)
    .apply(noun_parser.second_parse_durationals)
    .apply(noun_parser.cvcv_segmentation)
    .apply(noun_parser.nasalized_stops)
    .apply(noun_parser.hyphen_space)
    .apply(noun_parser.identified_suffixes)
)

In [88]:
# Preview the first ten parsed nouns.
nouns_parsed.head(10)

0       jáw-dì-m
1        dáábà-m
2       àrsɛ̌ɛ-m
3       kɔ́-mbɔ̀
4       bɛ́lɛ̀-g
5      bɛ́lɛ̀-gù
6          bɛ̀lú
7          bɛ̀lú
8    gàr sɛ̀-gɛ́
9      à sɛ̀-gɛ́
Name: SINGULAR, dtype: object

### Verbs

In [165]:
class VerbParser(NounParser):
    """
    Verb segmentation rules layered on top of ``NounParser``.

    The methods below read several attributes that are not defined in this
    class and are presumably provided by ``NounParser`` (TODO confirm):
    ``self.consonants``, ``self.vowels`` (used with ``.union``),
    ``self.replacements`` and ``self.extra_material`` (used like dicts) and
    ``self.exceptions`` (iterated; items are indexed with [0] and [1]).

    NOTE(review): a ``VerbParser`` is also imported from ``utils`` at the top of
    the notebook; once this cell runs, this definition replaces that name.
    """
    def __init__(self):
       super().__init__()

    def consonant_count(self, item):
        """
        Count the characters of ``item`` that are in ``self.consonants``,
        ignoring the nasalisation mark "ⁿ".

        Returns 0 when ``item`` is None.
        """

        if item is None:
            return 0

        consonant_count=0
        for letter in item:
            if letter != "ⁿ" and letter in self.consonants:
                consonant_count +=1
        return consonant_count


    def first_parse_durationals(self, item):
        """
        First pass over a form that ends in a length (durational) mark ":".

        Depending on the form, either returns ``word[:-1] + "-" + replacement``
        directly (second loop below), or hands the form, with the trailing ":"
        replaced by "-" plus extra material, to ``second_parse_durationals``.
        Forms that do not end in ":" are passed to ``second_parse_durationals``
        unchanged.

        NOTE(review): a one-character input ":" would raise IndexError in
        ``get_last_vowel`` (``item[-2]``).
        """

        def get_last_vowel(item):
            """Return the character just before a trailing ':' (None if the string does not end in ':')."""
            # NOTE(review): both operands of `or` appear identical here; if two
            # different colon-like characters were intended, confirm the literals.
            if item.endswith(":") or item.endswith(":"):
                return item[-2]
            return None

        vowels = [get_last_vowel(item)] # single-element list: the character before a final ":" (or None)
        # NOTE(review): the value this loop assigns to `word` is never used -- the
        # if/else right after it always reassigns `word`. Within the loop itself,
        # every non-matching key also resets `word` to `item`, so only the last
        # key iterated would matter anyway.
        for letter in self.replacements:     # Handle exceptions like 'tɔ́d-ɛ̀:' and 'nàrⁿ-ɛ́:'
            if item.endswith(letter + ':') or item.endswith(letter + ":"):
                word= item[:-1] + "-" + self.replacements[letter]
            else:
                word=item

        if 'ɛ᷈:' in item:                # Handle the exceptional case 'jɛ᷈:'
            word=item[:-1] + "-ɛ᷈"

        else:
            word= item

        # If the (possibly rewritten) word still ends in ":" and a key of
        # self.replacements occurs in word[1:4], return early with that key's
        # replacement appended after a hyphen. This path does not go through
        # second_parse_durationals.
        for letter in self.replacements:     # Handle cases of unextracted vowels
            if word.endswith(":") or word.endswith(":"):
                if letter in word[1:4]:
                    return word[:-1] + "-" + self.replacements[letter]
        # Material appended after the hyphen: looked up in extra_material, falling
        # back to the extracted character itself.
        material = self.extra_material.get(vowels[0], vowels[0])

        # The conditional expression covers the whole concatenation:
        # (word[:-1] + "-" + material) if vowels[0] else word
        return self.second_parse_durationals(word[:-1] + "-" + material if vowels[0] else word)

    def second_parse_durationals(self, word):
            """
            Replace every ":" in ``word`` with a copy of the preceding character
            (or that character's entry in ``self.replacements`` when there is
            one), then collapse "--" into "-". This is necessary to ensure no
            leftovers remain from the first parse.

            NOTE(review): `and` binds tighter than `or`, so the first condition
            is true for any ":"; in addition ``idx`` (= len(word) - index) is
            never 0 inside the loop. As written, the ``elif`` branch that would
            insert a hyphen is therefore unreachable.
            NOTE(review): for a ":" at index 0, ``word[index - 1]`` wraps around
            to the last character of the word.
            """

            new_word=""
            for index, letter in enumerate(word):
                idx=len(word)-index
                if letter==":" or letter==":" and idx !=0:
                    letter=f"{self.replacements.get(word[index -1],word[index -1])}"
                    new_word += letter
                elif letter==":" or letter==":" and idx ==0:
                    letter=f"-{self.replacements.get(word[index -1],word[index -1])}"
                    new_word += letter
                else:
                    new_word +=letter
            return new_word.replace("--", "-")

    def post_editing_short_strings(self, word):
        """
        Post-edit short words that have been "over-parsed" as a result of issues
        encountered with character fonts.

        When the word has at most two consonants (per ``consonant_count``) and
        more than one hyphen, its first hyphen is removed. Returns None for None.
        """

        if word is None:
            return None

        consonant_count=self.consonant_count(word)
        if consonant_count <=2 and word.count("-") >1:
            word=word.replace("-", "", 1)
        return word

    def hyphen_space(self, word):
        """
        Drop hyphens that are directly followed by a space or directly preceded
        by a space; every other character is kept.

        A hyphen that is the last character is always kept (``index+1 < len(word)``
        fails). For a hyphen at index 0, ``word[index-1]`` refers to the last
        character of the word.
        """
        new_word=""
        for index, letter in enumerate(word):
            if letter== "-" and index+1 < len(word) and (word[index+1]== " " or word[index-1]== " ") :
                new_word += ""
            else:
                new_word +=  letter
        return new_word


    def maintaining_nasality_on_segment(self, word):
        """
        Keep the nasalisation mark "ⁿ" attached to the segment it marks and
        parse off what follows it.

        Steps, in order:
        1. remove any "-" that immediately precedes "ⁿ";
        2. after an "ⁿ" that is second or third from the end, insert "-";
        3. if the result contains no hyphen at all, insert "-" before each
           non-consonant character (other than the last) for which "ⁿ" occurs in
           the two-character window starting at that character;
        4. clean the result with ``hyphen_space``.
        """
        new_word=""
        for index, letter in enumerate(word):
            if letter == "-" and index + 1 < len(word) and word[index + 1]== "ⁿ":         #solve zǐǐ-ⁿ zǐ-ǐⁿ from here
                new_word +=""
            else:
                new_word += letter

        word=""
        for index, letter in enumerate(new_word):
            # idx is the distance from the end: 1 for the last character.
            idx=len(new_word)-index
            if letter == "ⁿ" and 3 >= idx > 1:
                # new_word[index] is `letter`, i.e. "ⁿ", so this test is always true here.
                if new_word[index]!= "-":
                    word += f"{letter}-"

            else:
                word +=letter

        if "-" not in word:
            new_word=""
            for index, letter in enumerate(word):
                if letter not in self.consonants and index+1 < len(word) and "ⁿ" in word[index: index+2]:
                    new_word += f"-{letter}"
                else:
                    new_word +=letter
            return self.hyphen_space(new_word)
        else:
            return self.hyphen_space(word)


    def verify_exceptions(self, word):
        """
        Re-join consonant ensembles from ``self.exceptions`` that were split by a
        hyphen, moving the hyphen after the ensemble, then continue with
        ``maintaining_nasality_on_segment``.

        For each entry ``i``, the first occurrence of ``i[0] + "-" + i[1]`` is
        replaced by ``i + "-"``. Only the first occurrence per entry is handled.
        NOTE(review): the slice ``word[idx + 3:]`` skips exactly three characters,
        which presumably means entries are two characters long -- confirm.
        """
        for i in self.exceptions:
            with_hyphen = i[0] + "-" + i[1]
            if with_hyphen in word:
                idx = word.index(with_hyphen)
                word = word[:idx] + i + "-" + word[idx + 3:]
            else:
                word=word
        word = word.replace("--", "-")

        return self.maintaining_nasality_on_segment(word)


    def vowel_tone_hyphen(self, word):
        """
        Handle cases in which, despite the durational parses, the final vowel
        could not be extracted cleanly.

        If the word has at least four characters, ends in a character that is
        neither in ``self.vowels`` nor in ``self.consonants``, and has "-" just
        before that character, the hyphen is moved one position to the left
        (``word[:-3] + "-" + word[-3] + word[-1]``). The result is then passed
        to ``verify_exceptions``.
        """
        if len(word) >= 4:
            if word[-1] not in self.vowels.union(self.consonants) and word[-2] == "-":
                return self.verify_exceptions(word[:-3] + "-" + word[-3] + word[-1])
        return self.verify_exceptions(word)

    def syllabic_vowels(self, word):
        """
        Isolate syllabic vowels and parse them off, then continue with
        ``vowel_tone_hyphen``.

        For each vowel that occurs in ``word[2:-2]``: if its first occurrence in
        the whole word is preceded by "-" and followed by a consonant, a hyphen
        is appended after it. Trailing hyphens are stripped and "--" collapsed
        before the hand-off.

        NOTE(review): ``word.index`` returns the first occurrence in the whole
        word, which may lie outside ``word[2:-2]``; and ``str.replace`` rewrites
        every occurrence of that character in ``final_word``, not only the one
        found. The ValueError handler cannot trigger, since the membership test
        on the slice already guarantees the vowel is in ``word``.
        """
        final_word = word

        for alphabet in self.vowels:
            if alphabet in word[2:-2]:
                try:
                    index = word.index(alphabet)
                    if word[index - 1] == "-" and word[index + 1] in self.consonants:
                        final_word = final_word.replace(word[index], f"{alphabet}-")
                except ValueError:
                    pass

        return self.vowel_tone_hyphen(final_word.rstrip('-').replace("--", "-"))

    def post_coda(self, word):
        """
        Parse vowel sounds that occur after codas, then continue with
        ``syllabic_vowels``.

        Scans positions 1 .. len(word)-4 for the first consonant immediately
        followed by "-". If the two characters after that hyphen are a vowel and
        a consonant, an extra "-" is inserted right after the existing hyphen
        before the hand-off; otherwise the word is handed off unchanged. Only
        that first consonant-hyphen position is examined (both branches return).

        The loop range already guarantees ``i + 3 <= len(word) - 1``, so the
        explicit bounds checks and the IndexError handler are defensive only.
        """
        for i in range(1, len(word) - 3):
            if word[i] in self.consonants:
                if i + 1 < len(word) and word[i + 1] == "-":
                    try:
                        if i + 2 < len(word) and i + 3 < len(word) and word[i + 2] in self.vowels and word[
                            i + 3] in self.consonants:
                            material = word[:i + 2] + "-" + word[i + 2:]
                            return self.syllabic_vowels(material)
                        else:
                            return self.syllabic_vowels(word)
                    except IndexError:
                        return self.syllabic_vowels(word)
        return self.syllabic_vowels(word)

    def special_mid_forms(self, item):
        """
        Parse short forms (``segment_cvcs`` sends strings of length 3-4 here).

        If there is no hyphen among the last three characters and the last
        character is not a consonant, a hyphen is inserted before the last
        character. The result goes through ``post_coda``.
        """
        if "-" not in item[-3:] and item[-1] not in self.consonants:
            return self.post_coda(item[:-1] + "-" + item[-1:])
        return self.post_coda(item)

    def special_long_words(self, word):
        """
        Insert a hyphen after every second consonant ("ⁿ" excluded), i.e. cut the
        word into CVC-like chunks, then continue with ``post_coda``. This exists
        because of the font-recognition issues encountered.

        NOTE(review): ``word`` grows while the loop runs over the original index
        range, so later indices refer to shifted characters and the tail of the
        lengthened word is not visited. ``current_syllable`` is never used, and
        ``i < len(word)`` is always true. No caller is visible in this notebook.
        """
        current_syllable = ''
        consonant_count = 0

        for i in range(len(word)):
            if word[i] != "ⁿ" and word[i] in self.consonants:
                consonant_count += 1
                if consonant_count == 2 and i < len(word):
                    word = word[:i + 1] + '-' + word[i + 1:]
                    consonant_count = 0
        return self.post_coda(word)


    def long_words(self, word):
        """
        Parse long words (``segment_cvcs`` sends strings of length >= 5 here)
        according to a CVC priority structure, with handling for idiosyncratic
        words, then continue with ``verify_exceptions``.

        Consonants are counted per stretch; a space or "ⁿ" resets the count.
        A hyphen is added after the 2nd, 4th, 6th and 8th consonant, and before
        the 3rd consonant unless it directly follows a space.

        NOTE(review): in the second-part fix-up below, ``new_word.index(conso[1])``
        searches from the start of the string, so it can land on the same
        consonant in the part before the space instead of in the second part.
        NOTE(review): ``new_word[-1]`` assumes a non-empty string.
        """
        num=[2,4,6,8]
        new_word="" #word parsed according to cvc priority
        consonant_count=0
        for index, letter in enumerate(word):

            if letter == " " or letter== "ⁿ":
                consonant_count=0
                new_word +=letter

            elif letter!= "ⁿ" and letter in self.consonants: #just to be extra sure
                consonant_count +=1
                if consonant_count in num:
                    new_word += f"{letter}-"
                elif consonant_count==3 and word[index-1] != " ":
                    new_word +=f"-{letter}"
                else:
                    new_word += letter

            else:
                new_word +=letter

        if " "  in new_word and "-" not in new_word[new_word.index(" "):]:   #parsing words such as "gúr-ɔ́ gùrɔ́" which have second parts not responding to code above so they become "gúr-ɔ́ gùr-ɔ́"
            consonant_count=self.consonant_count(new_word[new_word.index(" "):])
            if consonant_count >=2 and new_word[-1] not in self.consonants:
                conso=[x for x in new_word[new_word.index(" "):] if x in self.consonants]
                idx=new_word.index(conso[1])
                new_word=new_word[:idx+1] + "-" + new_word[idx+1:]

        # Drop a single trailing hyphen, then collapse doubled hyphens.
        new_word=new_word[:-1].replace("--", "-") if new_word[-1] == '-' else new_word.replace("--", "-")

        return self.verify_exceptions(new_word)



    def segment_cvcs(self, item):
        """
        Main segmentation entry point, to be applied after the durational parses.

        Dispatches on the string length: >= 5 -> ``long_words``;
        3-4 -> ``special_mid_forms``; shorter -> ``post_coda``.
        """
        if len(item) >= 5:
            return self.long_words(item)
        elif 3 <= len(item) <= 4:
            return self.special_mid_forms(item) #calling special_mid_forms function
        else:
            return self.post_coda(item) #directly sending words to post_coda function for parsing

In [174]:
# All non-missing FORM cells (the column parse_data() reads for verbs).
verbs = data["FORM"].dropna()

In [168]:
# Rebinds the module-level verb_parser. If the VerbParser class cell of this
# notebook has been run, this instantiates that class rather than utils.VerbParser.
verb_parser=VerbParser()

In [175]:
# Stand-alone verb pipeline, one .apply per parsing stage.
# NOTE(review): unlike parse_data(), this chain does not strip ")(_" from the
# form and does not call second_parse_durationals as a separate stage.
verbs_parsed= (verbs
    .apply(verb_parser.existing_parses)
    .apply(verb_parser.first_parse_durationals)
    .apply(verb_parser.segment_cvcs)
    .apply(verb_parser.post_editing_short_strings)
)

In [176]:
# Inspect parsed verbs at positions 200-299 (positional, not label-based).
verbs_parsed.iloc[200:300]

2736                        jìr-é kím-jɛ́
2737                       gìr-ó kɛ́m-sɛ̀
2738          gìr-ù kúm-ò-lò kùm-ò-lò kán
2739                               kám-ɲá
2740                  kúm-ɛ́ kúm-ɛ́ kán-í
2741                  mìj-ɛ̀ mìj-ɛ̀ kán-í
2742                gìr-è ɲɛ́m-ɛ́-lɛ́ kán
2743                              kɛ́m-jí
2744                                kám-í
2745                              cíwⁿ-ɛ́
2746                     gìr-è kám-dá kán
2747        jír-ó kùmb-ì yò kùm-bì lè kán
2748                            kúm-ú-ɲɔ́
2749                     gìr-é kúm-ú-nj-ó
2750    jìr-iⁿ kúm-ú-jɛ̀ kúm-ú-jɛ̀ bìr-ɛ̀
2751                       jìr-ì jáʔ-à-nì
2752                           jìr-ó cɛ́m
2753                    gìd-è kúm-zɔ́ kán
2754              gìr-ì ý kɛ́m-ɲɛ́ kárⁿ-á
2755                                   kó
2756                                   kó
2757                               kày-ɛ́
2758                                káy-è
2759                              

In [None]:
# NOTE(review): leftover scratch cell (never executed, has no effect on the
# analysis); candidate for deletion.
len("")

In [156]:
# Spot check: an already-segmented short form should come back unchanged.
verb_parser.segment_cvcs("t-ɔ́")

't-ɔ́'

### Adjective_numerals

In [113]:
# SINGULAR forms of the numeral and adjective rows; cells that are not strings
# (e.g. missing values) become "" so every parser stage receives a string.
adjective_numeral = (
    data.loc[data["POS"].isin(["numeral", "adjective"]), "SINGULAR"]
    .apply(lambda form: form if isinstance(form, str) else "")
)

In [114]:
# Fresh AdjectiveNumeralParser for the adjective/numeral experiment below
# (rebinds the module-level name that parse_data() also uses).
adj_num_parser=AdjectiveNumeralParser()

In [115]:
# Stand-alone adjective/numeral pipeline, one .apply per parsing stage.
# NOTE(review): parse_data() additionally strips "()/_" and runs
# second_parse_durationals between first_parse_durationals and
# isolating_suffixes; this chain does neither.
parsed_adjective_numerals = (
    adjective_numeral
    .apply(adj_num_parser.existing_parses)
    .apply(adj_num_parser.first_parse_durationals)
    .apply(adj_num_parser.isolating_suffixes)
    .apply(adj_num_parser.y_suffixes)
    .apply(adj_num_parser.replace_hyphens_keep_last)
    .apply(adj_num_parser.switch_hyphen_position)
    .apply(adj_num_parser.miscellaneous)
)

In [116]:
# Preview the first twenty parsed adjectives/numerals.
parsed_adjective_numerals.head(20)

772       pɛ̌-ɛ́
773       pɛ̌-ɛ́
774       ʔìl-ɛ̀
775       dìy-ɔ́
776     dìyɔ̌-ɔ́
777        pɛ̌-y
778       kóór-ó
779       pɛ́-yⁿ
780       pɛ̌-yⁿ
781       pɛ̌-yⁿ
782       káán-ú
783    kúnjɔ́-ɔ́
784       pɛ̌-ɛ́
785     káámn-ɔ́
786        pɛ̌-y
787    pɛ̀y g-ɔ́
788       pɛ̌-ɛ́
789       kúnj-ú
790         pɛ̌ⁿ
791        pɛ̌-y
Name: SINGULAR, dtype: object