### Imports

In [72]:
import os

import pandas as pd

from utils import NounParser,VerbParser,AdjectiveNumeralParser

### Functions

In [130]:
# One parser instance per part-of-speech family.
# parse_data looks these module-level names up each time it is called.
noun_parser = NounParser()
verb_parser = VerbParser()
adj_num_parser = AdjectiveNumeralParser()


def prior_forms(row):
    """
    Return the unparsed source form of ``row`` so all of them fit in one column.

    Reads "POS" to decide which column holds the form:

    * "noun", "numeral", "adjective": the "SINGULAR" value when it is a
      string, otherwise the empty string.
    * "verb": the "FORM" value, or ``None`` when it is missing.
    * missing "POS" or any other value: ``None``.
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos == "verb":
        verb_form = row["FORM"]
        return None if pd.isna(verb_form) else verb_form

    if pos in ("noun", "numeral", "adjective"):
        singular = row["SINGULAR"]
        # Non-string cells (e.g. missing values) are reported as "".
        return str(singular) if isinstance(singular, str) else ""

    # Every other part of speech has no entry in this column.
    return None




def _run_steps(text, steps):
    """Pass ``text`` through each callable in ``steps``, in order, and return the final value."""
    for step in steps:
        text = step(text)
    return text


def parse_data(row):
    """
    Parse the form stored in ``row`` with the pipeline that matches its part of speech.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "POS". "SINGULAR" is read for nouns, numerals, adjectives,
        pronouns and "other"; "FORM" is read for verbs.

    Returns
    -------
    The output of the last parser step for nouns, verbs, numerals and
    adjectives; the untouched "SINGULAR" value for "pronoun" and "other";
    ``None`` when "POS" or the form to parse is missing, or when "POS" holds
    any other value.
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos == "noun":
        noun = row["SINGULAR"]
        if pd.isna(noun):
            return None
        steps = [
            noun_parser.existing_parses,
            noun_parser.first_parse_durationals,
            noun_parser.second_parse_durationals,
            noun_parser.cvcv_segmentation,
            noun_parser.nasalized_stops,
            noun_parser.hyphen_space,
            noun_parser.identified_suffixes,
        ]
        # Forms ending in "y" or "yⁿ" first go through the adjective/numeral
        # y-suffix step. The tuple form of endswith replaces the former
        # noun[-2] lookup, which raised IndexError for the one-character
        # form "ⁿ".
        # NOTE(review): the ending is tested on the raw value, before "()/_"
        # are stripped - confirm that this is intended.
        if noun.endswith(("y", "yⁿ")):
            steps.insert(0, adj_num_parser.y_suffixes)
        return _run_steps(noun.strip("()/_"), steps)

    if pos == "verb":
        verb = row["FORM"]
        if pd.isna(verb):
            return None
        return _run_steps(
            verb.strip(")(_"),
            [
                verb_parser.existing_parses,
                verb_parser.first_parse_durationals,
                verb_parser.second_parse_durationals,
                verb_parser.segment_cvcs,
                verb_parser.post_editing_short_strings,
            ],
        )

    if pos in ("numeral", "adjective"):
        singular = row["SINGULAR"]
        # The missing-value check has to run before the string coercion below;
        # checking the coerced value could never be true, so a missing form
        # used to be parsed as "" instead of yielding None like nouns and verbs.
        if pd.isna(singular):
            return None
        # Any other non-string cell is still parsed as the empty string.
        text = singular if isinstance(singular, str) else ""
        return _run_steps(
            text.strip("()/_"),
            [
                adj_num_parser.existing_parses,
                adj_num_parser.first_parse_durationals,
                adj_num_parser.second_parse_durationals,
                adj_num_parser.isolating_suffixes,
                adj_num_parser.y_suffixes,
                adj_num_parser.replace_hyphens_keep_last,
                adj_num_parser.switch_hyphen_position,
                adj_num_parser.miscellaneous,
            ],
        )

    if pos in ("pronoun", "other"):
        # These forms are passed through without any parsing.
        return row["SINGULAR"]

    return None

### Loading data

In [131]:
# Show every row whenever a frame or series is displayed.
pd.set_option("display.max_rows", None)

# NOTE(review): machine-specific absolute path - point it at the local data folder before running.
path = r"D:\ERC_Bang\Tasks\data_cleaning"
# Later cells read and write relative file names, so the working directory is switched here.
# This needs `os`, which has to be imported in the imports cell.
os.chdir(path)

# Columns that are not needed for the parsing steps below.
list_to_drop = [
    "FRENCH", "ENGLISH_SHORT", "FRENCH_SHORT", "ENGLISH_CATEGORY", "FRENCH_CATEGORY",
    "PARSED FORM", "MCF", "RECONSTRUCTION", "NOTE", "NOTES",
    "Unnamed: 18", "Unnamed: 19", "Unnamed: 20", "Unnamed: 21",
    "COGID", "COGIDS", "Unnamed: 24",
]
data = pd.read_csv("data.tsv", sep="\t", encoding="utf-8").drop(columns=list_to_drop)

In [132]:
# Row-wise: store each row's unparsed form in a new BEFORE_PARSE column.
data["BEFORE_PARSE"] = data.apply(prior_forms, axis="columns")

In [133]:
# Preview the first ten unparsed forms.
data["BEFORE_PARSE"].head(10)

0       jáwdì-m
1       dá:bà-m
2      àrsɛ̌:-m
3       kɔ́mbɔ̀
4      bɛ́lɛ̀-g
5     bɛ́lɛ̀-gù
6         bɛ̀lú
7         bɛ̀lú
8    gàr sɛ̀gɛ́
9      à sɛ̀gɛ́
Name: BEFORE_PARSE, dtype: object

In [134]:
# Row-wise: store each row's parsed form in a new PARSED column.
data["PARSED"] = data.apply(parse_data, axis="columns")

In [135]:
# Preview the first ten parsed forms.
data["PARSED"].head(10)

0       jáw-dì-m
1        dáábà-m
2       àrsɛ̌ɛ-m
3       kɔ́-mbɔ̀
4       bɛ́lɛ̀-g
5      bɛ́lɛ̀-gù
6          bɛ̀lú
7          bɛ̀lú
8    gàr sɛ̀-gɛ́
9      à sɛ̀-gɛ́
Name: PARSED, dtype: object

In [136]:
# Save the frame, including the BEFORE_PARSE and PARSED columns added above, without the row index.
# The relative file name resolves against the working directory set by os.chdir in the loading cell.
data.to_csv("second_parse.csv", index=False)

### Nouns

In [68]:
# Singular forms of rows tagged as nouns.
# Filtering on SINGULAR alone also picked up every other part of speech that
# fills that column (numerals, adjectives, pronouns, "other"), which then went
# through the noun pipeline.
nouns = data.loc[(data["POS"] == "noun") & data["SINGULAR"].notna(), "SINGULAR"]

In [69]:
# Fresh noun parser for this section; this rebinds the module-level name that parse_data also looks up.
noun_parser = NounParser()

In [70]:
# Run the noun steps one after another over every singular form.
# Forms ending in "y" or "yⁿ" first go through the adjective/numeral y-suffix
# step; the tuple form of endswith replaces the former x[-2] lookup, which
# raised IndexError for the one-character form "ⁿ".
# NOTE(review): unlike the noun branch of parse_data, this chain neither
# strips "()/_" nor calls existing_parses - confirm whether that is intended.
nouns_parsed = (
    nouns
    .apply(lambda x: adj_num_parser.y_suffixes(x) if x.endswith(("y", "yⁿ")) else x)
    .apply(noun_parser.first_parse_durationals)
    .apply(noun_parser.second_parse_durationals)
    .apply(noun_parser.cvcv_segmentation)
    .apply(noun_parser.nasalized_stops)
    .apply(noun_parser.hyphen_space)
    .apply(noun_parser.identified_suffixes)
)

In [88]:
# Preview the first ten parsed nouns.
nouns_parsed.head(10)

0       jáw-dì-m
1        dáábà-m
2       àrsɛ̌ɛ-m
3       kɔ́-mbɔ̀
4       bɛ́lɛ̀-g
5      bɛ́lɛ̀-gù
6          bɛ̀lú
7          bɛ̀lú
8    gàr sɛ̀-gɛ́
9      à sɛ̀-gɛ́
Name: SINGULAR, dtype: object

### Verbs

In [137]:
# Every non-missing entry of the FORM column, with the original row labels kept.
verbs = data["FORM"].dropna()

In [138]:
# Fresh verb parser for this section; this rebinds the module-level name that parse_data also looks up.
verb_parser = VerbParser()

In [139]:
# Run the verb steps one after another over every form.
# NOTE(review): the verb branch of parse_data additionally strips ")(_" and
# runs second_parse_durationals between first_parse_durationals and
# segment_cvcs; neither happens here - confirm which variant is intended.
verbs_parsed = verbs
for parse_step in (
    verb_parser.existing_parses,
    verb_parser.first_parse_durationals,
    verb_parser.segment_cvcs,
    verb_parser.post_editing_short_strings,
):
    verbs_parsed = verbs_parsed.apply(parse_step)

In [141]:
# Preview the first ten parsed verbs.
verbs_parsed.head(10)

424     pánd-í
425         pó
426      pál-í
427      úr-ɔ́
428    ʔɔ́r-ɛ̀
429    kɛ̀s-ɛ́
430      úr-ɛ̀
431      úr-ɛ̀
432     pɔ́-ɔ́
433      pán-á
Name: FORM, dtype: object

### Adjective_numerals

In [143]:
# Singular forms of rows tagged as numerals or adjectives.
# Any cell that is not a string (e.g. a missing value) is replaced by "".
adjective_numeral = data.loc[
    data["POS"].isin(["numeral", "adjective"]), "SINGULAR"
].apply(lambda value: str(value) if isinstance(value, str) else "")

In [144]:
# Fresh adjective/numeral parser for this section; this rebinds the module-level name that parse_data also looks up.
adj_num_parser = AdjectiveNumeralParser()

In [145]:
# Run the adjective/numeral steps one after another over every form.
# NOTE(review): the numeral/adjective branch of parse_data additionally strips
# "()/_" and runs second_parse_durationals after first_parse_durationals;
# neither happens here - confirm which variant is intended.
parsed_adjective_numerals = adjective_numeral
for parse_step in (
    adj_num_parser.existing_parses,
    adj_num_parser.first_parse_durationals,
    adj_num_parser.isolating_suffixes,
    adj_num_parser.y_suffixes,
    adj_num_parser.replace_hyphens_keep_last,
    adj_num_parser.switch_hyphen_position,
    adj_num_parser.miscellaneous,
):
    parsed_adjective_numerals = parsed_adjective_numerals.apply(parse_step)

In [146]:
# Preview the first ten parsed adjectives and numerals.
parsed_adjective_numerals.head(10)

772      pɛ̌-ɛ́
773      pɛ̌-ɛ́
774      ʔìl-ɛ̀
775      dìy-ɔ́
776    dìyɔ̌-ɔ́
777       pɛ̌-y
778      kóór-ó
779      pɛ́-yⁿ
780      pɛ̌-yⁿ
781      pɛ̌-yⁿ
Name: SINGULAR, dtype: object