# Imports

In [1]:
from utils import NounParser,VerbParser,AdjectiveNumeralParser
import pandas as pd
from lingpy import *
from lingpy.sequence.sound_classes import check_tokens
from lingpy.compare.util import (mutual_coverage_check, mutual_coverage_subset)
from lingpy.compare.sanity import average_coverage
from lingpy.compare.sanity import synonymy
from segments.tokenizer import Tokenizer
from collections import defaultdict
import matplotlib.pyplot as plt
import os
from pprint import pprint
from lingpy.evaluate.acd import bcubes
from lingpy.compare.partial import Partial

# Functions

In [3]:
# Lookup tables and shared parser objects used by the helper functions in this cell.

# Maps a doculect (dialect) name to another doculect name.
# presumably this is the mapping handed to `merges` as its `replacement_dict` argument — confirm.
replacement_dict={"DogulDomBendiely":"DogulDomKundialang",
                  "JamsayGourou":"JamsayDouentza",
                  "JamsayMondoro":"JamsayDouentza",
                  "PergeTegu":"JamsayDouentza",
                  "TomoKanSegue":"TomoKanDiangassagou"}

# Maps a doculect (dialect) name to a shorter language-level name.
# presumably this is the mapping handed to `change_name` as its `new_name` argument — confirm.
new_name={'TommoSoTongoTongo': "TommoSo",
         "BonduSoNajamba": "BonduSo",
          
         "TomoKanDiangassagou": "TomoKan",
         "TomoKanSegue":"TomoKan",
          
         "DogulDomKundialang": "DogulDom",
         'DogulDomBendiely':"DogulDom",
          
          "JamsayDouentza":"Jamsay",
         'JamsayGourou':"Jamsay",
         'JamsayMondoro':"Jamsay",
          "PergeTegu": "Jamsay"}

# Character table read by `remove_tones`: a character that is a key here is replaced by its
# value (accented vowels and consonants map to their plain base letter); any character that
# is not a key is dropped by `remove_tones`.
# NOTE(review): 'ʸ' maps to '\u02B9' (MODIFIER LETTER PRIME) whereas the other superscript
# letters ('ⁿ', 'ʷ', 'ᵇ') map to themselves; '\u02B8' may have been intended — confirm
# against the orthography profile before changing.
# NOTE(review): 'ɓ' is flattened to 'b' while 'ɗ' is kept as 'ɗ', and 'ʄ' maps to 'ƒ' — confirm
# these are deliberate.
replacements={'j':'j','á':'a','w': 'w','d':'d','ì': 'i','-':'-','m':'m',':': ':','b': 'b','à': 'a','r':'r',
              's':'s','ɛ':'ɛ','k':'k','ɔ': 'ɔ','l':'l','g':'g','ù':'u','ú':'u','è':'e','í':'i','ŋ': 'ŋ',
              'ó':'o','n': 'n','ⁿ': '\u207F','ý':'y','z':'z','ɲ':'\u0272','y':'y','ò':'o','ɣ': 'ɣ',
              'ʔ': '\u0294','ǎ':'a','â':'a','é':'e','ě':'e','ê':'e','î':'i','ǐ': 'i','ǹ': 'n','û':'u',
              'š':'s','ə': 'ə','p':'p','ǒ':'o','t':'t','e':'e','ô':'o','ǔ':'u','ń':'n','ỳ':'y','c':'c',
              'h':'h','f': 'f','ʷ': '\u02B7','i':'i','?': '\u0294','x':'x','o':'o','a':'a','ɗ': 'ɗ',
              'ḱ':'k','ǽ':'ae','ẁ':'w','ʒ': '\u0292','v':'v','ḿ':'m','ẃ':'w','å':'a','u':'u','q':'q',
              'ʄ': 'ƒ','ɓ':'b','ᵇ': '\u1D47','ʸ':'\u02B9','Ǹ':'n','V': 'v', " ": " "}

# Parser objects (classes imported from the project-local `utils` module) shared by `parsing_data`.
noun_parser=NounParser()
verb_parser=VerbParser()
adj_num_parser=AdjectiveNumeralParser()


def prior_forms(row):
    """
    picks the unparsed form of a row according to its part of speech

    row: mapping-like record (e.g. a DataFrame row) with "POS", "SINGULAR" and "FORM" entries
    returns: None when "POS" is missing, or when the row is a verb without a "FORM";
             the "FORM" value for verbs; for every other part of speech the "SINGULAR"
             value when it is a string, otherwise an empty string
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    if pos == "verb":
        verb_form = row["FORM"]
        return None if pd.isna(verb_form) else verb_form

    # nouns, numerals, adjectives and any other part of speech all read SINGULAR
    singular = row["SINGULAR"]
    return str(singular) if isinstance(singular, str) else ""




def _apply_steps(form, steps):
    """Feed `form` through each callable in `steps`, in order, and return the final result."""
    for step in steps:
        form = step(form)
    return form


def parsing_data(row):
    """
    parses the form stored in "BEFORE_PARSE" with the parser chain of its part of speech

    row: mapping-like record (e.g. a DataFrame row) with "POS" and "BEFORE_PARSE" entries
    returns: None when "POS" is missing, or when a noun / numeral / adjective / verb has
             no "BEFORE_PARSE" value; otherwise the value returned by the last parser step

    Relies on the module-level parser objects noun_parser, verb_parser and adj_num_parser.
    """
    pos = row["POS"]
    if pd.isna(pos):
        return None

    raw = row["BEFORE_PARSE"]

    if pos == "noun":
        if pd.isna(raw):
            return None
        steps = [
            noun_parser.existing_parses,
            noun_parser.parse_off_final_nasals,
            noun_parser.parse_noun_durationals,
            noun_parser.cvcv_segmentation,
            noun_parser.nasalized_stops,
            noun_parser.hyphen_space,
            noun_parser.identified_suffixes,
        ]
        # Forms ending in "y" or "yⁿ" first go through the adjective/numeral y-suffix step.
        # endswith() with a tuple avoids the IndexError that indexing noun[-2] raised
        # when the whole form was the single character "ⁿ".
        # The ending is tested on the unstripped form, as before.
        if raw.endswith(("y", "yⁿ")):
            steps.insert(0, adj_num_parser.y_suffixes)
        return _apply_steps(raw.strip("()/_"), steps)

    if pos in ("numeral", "adjective"):
        if pd.isna(raw):
            return None
        return _apply_steps(raw.strip("()/_"), [
            adj_num_parser.existing_parses,
            adj_num_parser.parse_verb_durationals,
            adj_num_parser.isolating_suffixes,
            adj_num_parser.y_suffixes,
            adj_num_parser.replace_hyphens_keep_last,
            adj_num_parser.switch_hyphen_position,
            adj_num_parser.miscellaneous,
        ])

    if pos == "verb":
        if pd.isna(raw):
            return None
        # verbs strip parentheses and underscores only (no "/")
        return _apply_steps(raw.strip(")(_"), [
            verb_parser.existing_parses,
            verb_parser.parse_verb_durationals,
            verb_parser.segment_cvcs,
            verb_parser.post_editing_short_strings,
        ])

    # any other part of speech: only the noun durational step is applied,
    # with non-string values treated as an empty string
    form = str(raw) if isinstance(raw, str) else ""
    return noun_parser.parse_noun_durationals(form)



def data_prep_for_ortho_profile(input_file, output_file='heathdogon2.tsv'):
    """
    prepares data so that command line can run on it to output an orthography profile

    input_file: path of a tab-separated file with an "ID" column
    output_file: path of the tab-separated file to write (defaults to 'heathdogon2.tsv')
    returns: the return value of DataFrame.to_csv for that path

    "," and "." are removed from the textual IDs, the IDs are converted to numbers,
    rows whose ID is not numeric are dropped and the remaining IDs are stored as int.
    """
    df = pd.read_csv(input_file, delimiter='\t')
    # regex=False is essential: read as a regular expression, "." matches every character,
    # which wipes all IDs on pandas versions where regex matching is the default.
    # NOTE(review): an ID column that pandas already read as float (e.g. 1.0) becomes
    # "1.0" -> "10" here — confirm the input always has textual or integer IDs.
    id_text = (
        df["ID"].astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
    )
    df["ID"] = pd.to_numeric(id_text, errors='coerce')
    # .copy() so the assignment below works on an independent frame, not a view
    df = df.dropna(subset=['ID']).copy()
    df['ID'] = df['ID'].astype(int)
    return df.to_csv(output_file, index=False, sep='\t')



def add_tab_after_quote(input_file, output_file, encoding='utf-8'):
    """
    copies input_file to output_file, writing a tab after every double-quote character
    (the quote characters themselves are kept)
    input_file: orthography profile created via the command line: $ lingpy profile -i heathdogon2.tsv -o  P_created-profile.tsv --column=ipa
    output_file: path of the file to write
    encoding: text encoding used for reading and writing
    """
    with open(input_file, 'r', encoding=encoding) as source, \
            open(output_file, 'w', encoding=encoding) as target:
        target.writelines(line.replace('"', '"\t') for line in source)


def merges(row, concepts, replacement_dict):
    """
    merges various rows based on the presence of a gloss

    row: record with string "CONCEPT" and "DOCULECT" entries
    concepts: iterable of strings, each tested as a substring of row["CONCEPT"]
    replacement_dict: mapping whose (key, value) pairs are compared with row["DOCULECT"]
                      and row["CONCEPT"]
    returns: None, True, or `row` itself depending on the first branch that fires
             (see the inline notes); None is also returned when no branch fires.

    NOTE(review): the mixed return types (bool / row / None) and the conditions below look
    unintended; the notes describe what the code does, not what it was meant to do.
    """
    
    if not isinstance(row["CONCEPT"], str) or not isinstance(row["DOCULECT"], str):
        # Handle unexpected types
        return None
    
    for concept in concepts:
        for key, value in replacement_dict.items():
            # NOTE(review): requires DOCULECT to equal both value and key, which can only
            # hold for a pair where key == value; the returned comparison is then True.
            if concept in row["CONCEPT"] and row["DOCULECT"] == value and row["DOCULECT"] == key:
                return row["DOCULECT"] == value
            elif concept in row["CONCEPT"] and row["DOCULECT"] != value:
                # NOTE(review): compares CONCEPT (not DOCULECT) with the key; returns True on
                # a match, otherwise nothing happens and the loop moves to the next pair.
                if row["CONCEPT"] == key:
                    return row["CONCEPT"] == key
            # NOTE(review): `and` binds tighter than `or`, and the `and` part is already known
            # to be false when this branch is reached, so this is effectively
            # `row["DOCULECT"] != key`; the unchanged row is returned.
            elif concept in row["CONCEPT"] and row["DOCULECT"] != value or row["DOCULECT"] != key:
                return row



def change_name(name, new_name):
    """
    changes the names of various language dialects into language

    name: the doculect name to translate
    new_name: dict mapping a dialect name (to discard) to the language name (to keep)
    returns: the mapped language name, or `name` unchanged when it is not a key of new_name
    """
    try:
        # direct dictionary lookup instead of scanning every item
        return new_name.get(name, name)
    except TypeError:
        # an unhashable `name` cannot be a key, so it is returned unchanged
        return name


def remove_tones(word):
    """
    removes tones and keeps only segments

    word: string to clean, or None
    returns: None for None; otherwise a string in which every character that is a key of the
             module-level `replacements` table is replaced by its value, a space that is not
             in the table becomes "#", and every other character is dropped
    """
    if word is None:
        return None

    pieces = []
    for symbol in word:
        if symbol in replacements:
            pieces.append(replacements[symbol])
        elif symbol == " ":
            # only reached when " " is not a key of `replacements`
            pieces.append("#")
        # anything else (e.g. combining tone marks) is skipped
    return "".join(pieces)

def remove_spaces(word):
    """
    removes every space character from word

    word: string to clean, or None
    returns: None for None, otherwise the characters of word without " "
    """
    if word is None:
        return None
    return "".join(symbol for symbol in word if symbol != " ")

# Loading data

In [246]:
# Load the raw table and drop the columns that are not used downstream.
pd.set_option('display.max_rows', None)
# NOTE(review): absolute, machine-specific working directory — the notebook only runs
# where this folder exists.
path = r"D:\ERC_Bang\Tasks\data_cleaning"
os.chdir(path)
data = pd.read_csv("data.tsv", sep="\t", encoding="utf-8")
list_to_drop = [
    "ID",
    "FRENCH", "ENGLISH_SHORT", "FRENCH_SHORT",
    "ENGLISH_CATEGORY", "FRENCH_CATEGORY",
    "PARSED FORM", "MCF", "RECONSTRUCTION",
    "NOTE", "NOTES",
    "Unnamed: 18", "Unnamed: 19", "Unnamed: 20", "Unnamed: 21",
    "COGID", "COGIDS",
    "Unnamed: 24",
]
data = data.drop(columns=list_to_drop)

In [247]:
# Preview the first rows of the table after dropping the unused columns
data.head()

Unnamed: 0,VARID,DOCULECT,GLOSS,VALUE_ORG,SINGULAR,PLURAL,FORM,CONCEPT,POS
0,0.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",jáwdì-m,jáwdì,,LIVESTOCK,noun
1,1.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",dá:bà-m,dá:bà,,LIVESTOCK,noun
2,0.0,BenTey,domestic animal (esp. livestock),àrsɛ̌:-m\\àrsɛ̌:,àrsɛ̌:-m,àrsɛ̌:,,LIVESTOCK,noun
3,0.0,Bunoge,domestic animal (esp. livestock),kɔ́mbɔ̀,kɔ́mbɔ̀,kɔmbɔ=gè,,LIVESTOCK,noun
4,0.0,DogulDomBendiely,domestic animal (esp. livestock),"bɛ́lɛ̀g, bɛ́lɛ̀gù",bɛ́lɛ̀-g,bɛlɛ-g-yà,,LIVESTOCK,noun


# Creating new columns

In [248]:
# Collect the unparsed form of every row in one column: prior_forms returns FORM for
# verbs and SINGULAR for the other parts of speech.
data["BEFORE_PARSE"]=data.apply(prior_forms, axis=1)

In [249]:
# Preview: BEFORE_PARSE is now the last column
data.head()

Unnamed: 0,VARID,DOCULECT,GLOSS,VALUE_ORG,SINGULAR,PLURAL,FORM,CONCEPT,POS,BEFORE_PARSE
0,0.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",jáwdì-m,jáwdì,,LIVESTOCK,noun,jáwdì-m
1,1.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",dá:bà-m,dá:bà,,LIVESTOCK,noun,dá:bà-m
2,0.0,BenTey,domestic animal (esp. livestock),àrsɛ̌:-m\\àrsɛ̌:,àrsɛ̌:-m,àrsɛ̌:,,LIVESTOCK,noun,àrsɛ̌:-m
3,0.0,Bunoge,domestic animal (esp. livestock),kɔ́mbɔ̀,kɔ́mbɔ̀,kɔmbɔ=gè,,LIVESTOCK,noun,kɔ́mbɔ̀
4,0.0,DogulDomBendiely,domestic animal (esp. livestock),"bɛ́lɛ̀g, bɛ́lɛ̀gù",bɛ́lɛ̀-g,bɛlɛ-g-yà,,LIVESTOCK,noun,bɛ́lɛ̀-g


In [250]:
# Run the part-of-speech specific parser chain (parsing_data) on every BEFORE_PARSE form
data["PARSED"]=data.apply(parsing_data,axis=1)

In [251]:
# Preview: compare BEFORE_PARSE with the segmented PARSED column
data.head()

Unnamed: 0,VARID,DOCULECT,GLOSS,VALUE_ORG,SINGULAR,PLURAL,FORM,CONCEPT,POS,BEFORE_PARSE,PARSED
0,0.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",jáwdì-m,jáwdì,,LIVESTOCK,noun,jáwdì-m,jáw-dì-m
1,1.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",dá:bà-m,dá:bà,,LIVESTOCK,noun,dá:bà-m,dáábà-m
2,0.0,BenTey,domestic animal (esp. livestock),àrsɛ̌:-m\\àrsɛ̌:,àrsɛ̌:-m,àrsɛ̌:,,LIVESTOCK,noun,àrsɛ̌:-m,àrsɛ̌ɛ̌-m
3,0.0,Bunoge,domestic animal (esp. livestock),kɔ́mbɔ̀,kɔ́mbɔ̀,kɔmbɔ=gè,,LIVESTOCK,noun,kɔ́mbɔ̀,kɔ́-mbɔ̀
4,0.0,DogulDomBendiely,domestic animal (esp. livestock),"bɛ́lɛ̀g, bɛ́lɛ̀gù",bɛ́lɛ̀-g,bɛlɛ-g-yà,,LIVESTOCK,noun,bɛ́lɛ̀-g,bɛ́lɛ̀-g


In [252]:
# Strip tone marks from the parsed forms, keeping only the segments
data["TONES_REMOVED"] = data["PARSED"].apply(remove_tones)

In [253]:
# Preview: TONES_REMOVED holds the parsed forms without tone diacritics
data.head()

Unnamed: 0,VARID,DOCULECT,GLOSS,VALUE_ORG,SINGULAR,PLURAL,FORM,CONCEPT,POS,BEFORE_PARSE,PARSED,TONES_REMOVED
0,0.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",jáwdì-m,jáwdì,,LIVESTOCK,noun,jáwdì-m,jáw-dì-m,jaw-di-m
1,1.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",dá:bà-m,dá:bà,,LIVESTOCK,noun,dá:bà-m,dáábà-m,daaba-m
2,0.0,BenTey,domestic animal (esp. livestock),àrsɛ̌:-m\\àrsɛ̌:,àrsɛ̌:-m,àrsɛ̌:,,LIVESTOCK,noun,àrsɛ̌:-m,àrsɛ̌ɛ̌-m,arsɛɛ-m
3,0.0,Bunoge,domestic animal (esp. livestock),kɔ́mbɔ̀,kɔ́mbɔ̀,kɔmbɔ=gè,,LIVESTOCK,noun,kɔ́mbɔ̀,kɔ́-mbɔ̀,kɔ-mbɔ
4,0.0,DogulDomBendiely,domestic animal (esp. livestock),"bɛ́lɛ̀g, bɛ́lɛ̀gù",bɛ́lɛ̀-g,bɛlɛ-g-yà,,LIVESTOCK,noun,bɛ́lɛ̀-g,bɛ́lɛ̀-g,bɛlɛ-g


In [254]:
# Convert the tone-free forms with the orthography profile (column "IPA" of profile.tsv)
tk = Tokenizer('profile.tsv')


def _tokenize_if_text(form):
    """Run the tokenizer on strings; hand every other value back unchanged."""
    if isinstance(form, str):
        return tk(form, column="IPA")
    return form


# spaces are removed afterwards, else Lingpy throws an error
data["IPA"] = data["TONES_REMOVED"].apply(_tokenize_if_text).apply(remove_spaces)

In [255]:
# Preview: IPA holds the profile-converted forms with spaces removed
data.head()

Unnamed: 0,VARID,DOCULECT,GLOSS,VALUE_ORG,SINGULAR,PLURAL,FORM,CONCEPT,POS,BEFORE_PARSE,PARSED,TONES_REMOVED,IPA
0,0.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",jáwdì-m,jáwdì,,LIVESTOCK,noun,jáwdì-m,jáw-dì-m,jaw-di-m,jaw+di+m
1,1.0,BankanTey,domestic animal (esp. livestock),"jáwdì-m\\jáwdì, dá:bà-m\\dá:bà",dá:bà-m,dá:bà,,LIVESTOCK,noun,dá:bà-m,dáábà-m,daaba-m,daaba+m
2,0.0,BenTey,domestic animal (esp. livestock),àrsɛ̌:-m\\àrsɛ̌:,àrsɛ̌:-m,àrsɛ̌:,,LIVESTOCK,noun,àrsɛ̌:-m,àrsɛ̌ɛ̌-m,arsɛɛ-m,arsɛɛ+m
3,0.0,Bunoge,domestic animal (esp. livestock),kɔ́mbɔ̀,kɔ́mbɔ̀,kɔmbɔ=gè,,LIVESTOCK,noun,kɔ́mbɔ̀,kɔ́-mbɔ̀,kɔ-mbɔ,kɔ+mbɔ
4,0.0,DogulDomBendiely,domestic animal (esp. livestock),"bɛ́lɛ̀g, bɛ́lɛ̀gù",bɛ́lɛ̀-g,bɛlɛ-g-yà,,LIVESTOCK,noun,bɛ́lɛ̀-g,bɛ́lɛ̀-g,bɛlɛ-g,bɛlɛ+g


In [256]:
# Keep only the three word-list columns and discard rows where any of them is missing
wordlist_columns = ["DOCULECT", "GLOSS", "IPA"]
data = data[wordlist_columns].dropna(subset=wordlist_columns)

In [257]:
# Preview the reduced DOCULECT / GLOSS / IPA table
data.head()

Unnamed: 0,DOCULECT,GLOSS,IPA
0,BankanTey,domestic animal (esp. livestock),jaw+di+m
1,BankanTey,domestic animal (esp. livestock),daaba+m
2,BenTey,domestic animal (esp. livestock),arsɛɛ+m
3,Bunoge,domestic animal (esp. livestock),kɔ+mbɔ
4,DogulDomBendiely,domestic animal (esp. livestock),bɛlɛ+g


In [258]:
# Export the reduced table; empty spaces are then cleaned by hand in Google Sheets.
# NOTE(review): the next section reads "new_beginining_2.tsv", which no cell writes —
# presumably it is the manually cleaned export of this file; confirm.
data.to_csv("new_beginining_1.csv", index=False, encoding="utf-8")

# Special data processing (with LingPy)

In [259]:
# Load the word list into LingPy.
# NOTE(review): "new_beginining_2.tsv" is not written by any cell of this notebook —
# presumably the hand-cleaned version of new_beginining_1.csv; confirm.
wl=Wordlist("new_beginining_2.tsv")

In [260]:
# Show the column names of the loaded word list and their positions
wl.header

{'doculect': 0, 'concept': 1, 'ipa': 2}

In [261]:
# x is passed as the second argument of mutual_coverage_subset.
# presumably it is the minimum number of concepts every pair of doculects must share —
# confirm against the LingPy documentation.
x=25
count, results = mutual_coverage_subset(wl, x)
# keep the first (coverage, languages) pair of the result list
coverage, languages = results[0]

In [262]:
# For every selected doculect, the set of keys of its wl.get_dict(...) result
concepts = {
    language: set(wl.get_dict(language=language, entry="concepts"))
    for language in languages
}

In [263]:
# Export only the rows whose doculect is in `languages`, once as TSV and once as CSV
for file_format in ("tsv", "csv"):
    wl.output(
        file_format,
        filename="new_beginining_3",
        subset=True,
        rows=dict(doculect="in " + str(languages)),
    )
#empty spaces cleaned in Google sheets

2024-04-23 13:03:47,223 [INFO] Data has been written to file <new_beginining_3.tsv>.
2024-04-23 13:03:47,926 [INFO] Data has been written to file <new_beginining_3.csv>.


# Running algorithm

In [4]:
# Switch to the data folder again (this section carries its own, restarted execution counts).
# NOTE(review): absolute, machine-specific path — repeated from the loading section.
path = r"D:\ERC_Bang\Tasks\data_cleaning" 
os.chdir(path)

In [5]:
# Load the word list used for cognate detection.
# NOTE(review): "new_beginining_4.tsv" is not written by any cell of this notebook —
# presumably the hand-cleaned version of new_beginining_3; confirm.
wl=Wordlist("new_beginining_4.tsv")

In [6]:
# Wrap the word list in LingPy's Partial class (imported from lingpy.compare.partial)
part=Partial(wl)

In [7]:
# Build the scorer (runs=10000), cluster with method 'lexstat' at threshold 0.55 writing the
# cluster ids to column 'cogid', then export the result as TSV and CSV.
# NOTE(review): get_scorer is presumably randomised and no seed is set anywhere in the
# notebook — results may differ between runs; confirm.
# NOTE(review): `cluster` is called on a Partial object rather than `partial_cluster` —
# confirm that whole-word (not partial) cognate detection is intended.
part.get_scorer(runs=10000)
part.cluster(method='lexstat', threshold=0.55, ref='cogid')
part.output('tsv', filename='part_beginining_clusters')
part.output('csv', filename='part_beginining_clusters')

CORRESPONDENCE CALCULATION:   0%|                                                            | 0/288.0 [00:00<?, ?it/s]2024-04-23 14:22:52,040 [INFO] Calculating alignments for pair Ampari / Ampari.
2024-04-23 14:22:52,053 [INFO] Calculating alignments for pair Ampari / BankanTey.
2024-04-23 14:22:52,064 [INFO] Calculating alignments for pair Ampari / BenTey.
2024-04-23 14:22:52,073 [INFO] Calculating alignments for pair Ampari / BonduSoNajamba.
2024-04-23 14:22:52,082 [INFO] Calculating alignments for pair Ampari / Bunoge.
2024-04-23 14:22:52,089 [INFO] Calculating alignments for pair Ampari / DogulDomBendiely.
2024-04-23 14:22:52,096 [INFO] Calculating alignments for pair Ampari / DogulDomKundialang.
2024-04-23 14:22:52,103 [INFO] Calculating alignments for pair Ampari / DonnoSo.
2024-04-23 14:22:52,111 [INFO] Calculating alignments for pair Ampari / JamsayDouentza.
2024-04-23 14:22:52,119 [INFO] Calculating alignments for pair Ampari / JamsayGourou.
2024-04-23 14:22:52,124 [INFO] Ca

# Alignments

In [8]:
# Reload the clustered word list, align the words of each 'cogid' cluster and export
# the alignments as HTML.
# NOTE(review): this reads "new_beginining_clusters.tsv", while the clustering cell writes
# files named 'part_beginining_clusters' — confirm the file was renamed/edited by hand and
# that this is not a filename mismatch.
part=Partial("new_beginining_clusters.tsv")
alm = Alignments(part, ref='cogid')
alm.align()
alm.output("html", filename='part_beginining_alignments', subset=True,
cols=['doculect', 'concept', 'ipa', 'tokens', 'cogid', 'alignment'],
prettify=False, ignore='all')

2024-04-23 14:28:44,194 [INFO] Data has been written to file <C:\Users\PROMIS~1\AppData\Local\Temp\tmp9y24oup7.alm>.
2024-04-23 14:28:48,462 [INFO] Data has been written to file <part_beginining_alignments.html>.
