### Imports

In [28]:
from utils import NounParser,VerbParser,AdjectiveNumeralParser
import pandas as pd
from lingpy import *
from segments.tokenizer import Tokenizer
import os

### Functions

In [29]:
# Module-level parser instances; parse_data() below looks these names up when it runs.
noun_parser = NounParser()
verb_parser = VerbParser()
adj_num_parser = AdjectiveNumeralParser()


def prior_forms(row):
    """
    Select the form of a row that is to be parsed, so that all such forms
    end up in a single column.

    row: mapping / Series with the keys "POS", "SINGULAR" and "FORM".

    Returns None when POS is missing; for verbs the value of FORM (None when
    FORM is missing); for every other POS the value of SINGULAR when it is a
    string, otherwise the empty string.
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos == "verb":
        verb_form = row["FORM"]
        return None if pd.isna(verb_form) else verb_form

    # Nouns, numerals, adjectives and every remaining POS all read SINGULAR.
    singular = row["SINGULAR"]
    if isinstance(singular, str):
        return str(singular)
    return ""




def parse_data(row):
    """
    Parses data and places parsed data into one column.
    """
    if pd.isna(row["POS"]):  
        return None
    elif row["POS"] == "noun":
        noun = row["IPA"]
        if pd.isna(noun):
            return None
        parsed_noun = (noun_parser.identified_suffixes(
            noun_parser.hyphen_space(
                noun_parser.nasalized_stops(
                    noun_parser.cvcv_segmentation(
                        noun_parser.parse_noun_durationals(
                            noun_parser.parse_off_final_nasals(
                                noun_parser.existing_parses(
                                    adj_num_parser.y_suffixes(noun.strip("()/_")))))))))) if (noun.endswith("y") or (noun.endswith("ⁿ") and noun[-2]=="y")) else \
                            (noun_parser.identified_suffixes(
                                noun_parser.hyphen_space(
                                    noun_parser.nasalized_stops(
                                        noun_parser.cvcv_segmentation(
                                            noun_parser.parse_noun_durationals(
                                                noun_parser.parse_off_final_nasals(
                                                    noun_parser.existing_parses(noun.strip("()/_")))))))))
        return parsed_noun
    elif row["POS"] == "verb":
        verb = row["IPA"]
        if pd.isna(verb):
            return None
        parsed_verb = verb_parser.post_editing_short_strings(
            verb_parser.segment_cvcs(
                verb_parser.parse_verb_durationals(
                    verb_parser.existing_parses(verb.strip(")(_")))))
        return parsed_verb
    elif row["POS"] == "numeral" or row["POS"] == "adjective":
        adjective_numeral = str(row["IPA"]) if isinstance(row["IPA"], str) else ""
        if pd.isna(adjective_numeral):
            return None
        parsed_adj_num = adj_num_parser.miscellaneous(
            adj_num_parser.switch_hyphen_position(
                adj_num_parser.replace_hyphens_keep_last(
                    adj_num_parser.y_suffixes(
                        adj_num_parser.isolating_suffixes(
                            adj_num_parser.parse_verb_durationals(
                                adj_num_parser.existing_parses(adjective_numeral.strip("()/_"))))))))
        return parsed_adj_num
    elif row["POS"] in ["pronoun", "other"]:
        form = row["IPA"]
        return noun_parser.parse_noun_durationals(noun_parser.existing_parses(form))
    else:
        return None



def data_prep_for_ortho_profile(input_file, output_file='heathdogon2.tsv'):
    """
    prepares data so that command line can run on it to output an orthography profile

    input_file: tab-separated file with an "ID" column.
    output_file: path of the tab-separated file to write; defaults to
        'heathdogon2.tsv', the name that was previously hard-coded.

    Commas and dots are removed from the ID values, rows whose ID is then
    not numeric are dropped, and ID is written as an integer.

    Returns the result of DataFrame.to_csv (None when writing to a path).
    """
    # Read ID as text. If pandas infers a float column (e.g. because some IDs
    # are blank), 1 would become "1.0" and removing the dot would give 10.
    df = pd.read_csv(input_file, delimiter='\t', dtype={'ID': str})

    # regex=False: before pandas 2.0 the default was regex=True, in which '.'
    # matches every character and wipes out all IDs.
    id_text = (
        df['ID']
        .str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
    )

    # Build the result in one chain rather than assigning into the frame
    # returned by dropna(), which can raise SettingWithCopyWarning.
    df = (
        df.assign(ID=pd.to_numeric(id_text, errors='coerce'))
        .dropna(subset=['ID'])
        .astype({'ID': int})
    )
    return df.to_csv(output_file, index=False, sep='\t')



def add_tab_after_quote(input_file, output_file, encoding='utf-8'):
    """
    creates a copy of a tsv file in which a tab is inserted after every
    double-quote character (the quote characters themselves are kept)

    input_file: orthography profile created via the command line: $ lingpy profile -i heathdogon2.tsv -o  P_created-profile.tsv --column=ipa
    output_file: path of the file to write.
    encoding: text encoding used for reading and writing.
    """
    with open(input_file, 'r', encoding=encoding) as infile, \
            open(output_file, 'w', encoding=encoding) as outfile:
        for line in infile:
            # Same output as the former character-by-character loop.
            outfile.write(line.replace('"', '"\t'))

### Loading data

In [30]:
# Loading data
# NOTE: display.max_rows=None makes pandas render every row of any frame that is displayed.
pd.set_option('display.max_rows', None)
# The working directory can be overridden with the DATA_CLEANING_DIR environment
# variable; without it the original location is used. Later cells read and
# write files relative to this directory.
path = os.environ.get("DATA_CLEANING_DIR", r"D:\ERC_Bang\Tasks\data_cleaning")
os.chdir(path)
data = pd.read_csv("data.tsv", sep="\t", encoding="utf-8")
# Columns removed before parsing; drop() raises KeyError if any of them is absent.
list_to_drop = [
    "FRENCH", "ENGLISH_SHORT", "FRENCH_SHORT", "ENGLISH_CATEGORY", "FRENCH_CATEGORY",
    "PARSED FORM", "MCF", "RECONSTRUCTION", "NOTE", "NOTES",
    "Unnamed: 18", "Unnamed: 19", "Unnamed: 20", "Unnamed: 21",
    "COGID", "COGIDS", "Unnamed: 24",
]
data = data.drop(columns=list_to_drop)

In [31]:
# Collect the form to be parsed for every row into a single column.
data["BEFORE_PARSE"] = data.apply(prior_forms, axis="columns")

In [32]:
# Build a tokenizer from the orthography profile file "output.tsv".
# The file must already exist in the working directory; no visible cell creates it.
op = Tokenizer("output.tsv")

In [33]:
# Run the orthography profile over every string form; non-string values pass through unchanged.
def _apply_profile(value):
    return op(value) if isinstance(value, str) else value


data["IPA"] = data["BEFORE_PARSE"].apply(_apply_profile)

In [34]:
# Parse every row according to its part of speech.
data["PARSED"] = data.apply(parse_data, axis="columns")

In [35]:
data["PARSED"].head(10)

0       jáw-dì-m
1        dáábà-m
2      àrsɛ̌ɛ̌-m
3       kɔ́-mbɔ̀
4       bɛ́lɛ̀-g
5      bɛ́lɛ̀-gù
6          bɛ̀lú
7          bɛ̀lú
8    gàr sɛ̀-gɛ́
9      à sɛ̀-gɛ́
Name: PARSED, dtype: object

In [36]:
# Write the parsed table (comma-separated, without the index column).
data.to_csv("Parsed_data.csv", index=False)