In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the PHOIBLE CSV export, relative to the notebook's working directory.
phobile_file_path = "./downloads/phoible.csv"

# dtype=str keeps every column as text, so nothing is coerced to a number.
phoible_df = pd.read_csv(
    phobile_file_path,
    dtype=str,
    engine="python",
    encoding="utf-8",
)

# One row per distinct (LanguageName, SpecificDialect) pair. A missing dialect
# is stored as the literal string "none" so later string handling never sees NaN.
dialects_df = (
    phoible_df.loc[:, ["LanguageName", "SpecificDialect"]]
    .fillna(value={"SpecificDialect": "none"})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Number of distinct dialect rows recorded under each language name.
language_counts = dialects_df["LanguageName"].value_counts()

In [3]:
import re

# Ordered (needle, replacement) pairs that shorten verbose SpecificDialect
# descriptions. They are applied one after another, so the order matters.
# The needles must match the dataset text exactly (typos included).
_DIALECT_REWRITES = (
    ("as spoken in ", ""),
    ("Eastern districts of Assam (North Eastern state of India); Jorhat", "(Eastern)"),
    ("Bangladeshi Standard (spoken in Dhaka and other urban aread of Bangladesh)", "(Bangladeshi, Standard)"),
    (" (can be seen as a fusion of aspects of central Igbo and Onitsha Igbo; cannot be localised in any particular region or area of Igboland)", ""),
    ("Standard Korean (spoken in and around Seoul)", "(Standard)"),
    (" or other areas with similar pitch accent systems", ""),
    ("Educated colloquial Hungarian (Budapest of the 1990s)", "(Educated colloquial)"),
    (" (Kano) Hausa (spoken in Kano, Nigeria)", ""),
    ("Standard Hindi (as spoken in Varanasi, Lucknow, Delhi etc.)", "(Standard)"),
    ("Standard Thai (combinative style)", "(Standard)"),
    (" English (spoken in Newcastle)", ""),
)


def map_dialect(d, counts=None) -> str:
    """Build a display name for one (language, dialect) row.

    Parameters
    ----------
    d
        Row-like object exposing ``LanguageName`` and ``SpecificDialect`` as
        string attributes. A missing dialect is expected to be the literal
        string ``"none"``.
    counts
        Optional mapping from ``LanguageName`` to the number of dialect rows
        recorded for that language. Defaults to the notebook-level
        ``language_counts``; passing it explicitly lets the function be used
        without that global.

    Returns
    -------
    str
        The language name alone when the row has no dialect or the language
        has exactly one row; otherwise ``"<language> <dialect>"`` using the
        shortened dialect description.
    """
    if counts is None:
        counts = language_counts

    dialect = d.SpecificDialect
    for needle, replacement in _DIALECT_REWRITES:
        dialect = dialect.replace(needle, replacement)

    # "A;B" becomes "A (B)"; any parts after the second are ignored.
    language_parts = d.LanguageName.split(";")
    if len(language_parts) == 1:
        language = language_parts[0]
    else:
        language = f"{language_parts[0]} ({language_parts[1]})"

    if dialect == "none" or counts[d.LanguageName] == 1:
        return language

    # Collapse "<name> (<tag>) <tag>" into "<name> (<tag>)" when the trailing
    # text merely repeats the parenthesised tag.
    m = re.match(r'^(.+) \(([^)]+)\) (.+)$', dialect)
    if m is not None and m.group(2) == m.group(3):
        dialect = f"{m.group(1)} ({m.group(2)})"

    # "the ... dialect of <Word> ..." is reduced to "(<Word>)".
    m = re.match(r'^the.+dialect of (\w+) .+$', dialect)
    if m is not None:
        dialect = f"({m.group(1)})"

    # NOTE: a shortcut returning the dialect unchanged when it already starts
    # with the language name used to sit here, guarded by "count == 1". That
    # guard can never hold past the early return above, so the branch was
    # unreachable and is omitted; the output is unchanged.
    return f"{language} {dialect}"
    
    
# Derive one display name per row of dialects_df (row-wise apply).
language_names = dialects_df.apply(map_dialect, axis="columns").to_numpy()
print(language_names.shape)

# Sanity check: every generated name is unique, and there is exactly one per row.
assert len(language_names) == len(set(language_names))
assert len(set(language_names)) == len(dialects_df)

# Persist the names for later use.
np.save(file="./data/language_names.npy", arr=language_names)

(2949,)
