# The Code for the Phonological Complexity in Andic Languages

In [1]:
import csv
import json

In [2]:
# Identifiers of the languages under comparison. They are compared against the
# 'glottocode' column of andic_dicts.csv and used as the prefix of the
# per-language JSON files written and read by the cells of this notebook.
languages = ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238', 'toki1238']

In [3]:
# Reference list of the symbols that are classified as consonants when the keys
# of the per-language '<glottocode>phonemes.json' files are sorted into
# consonants and vowels. In the cells visible here the list is only used for
# membership tests, so the order of the entries does not affect the results.
# NOTE(review): 'kʷ' is listed twice in the 'k' row, and 'tsːʷ' sits in the 't'
# row rather than next to the other 'ts' entries -- harmless for membership
# tests, but confirm that no other symbol was intended.
consonants = ['b', 'bː', 'bʷ',
              'p', 'pː', "p'",
              'd', 'dː', 'dʷ', 'dːʷ',
              't', 'tː', "t'", "t'ː", 'tʷ', "t'ʷ", 'tsːʷ', 'tʲ',
              'ɡ', 'ɡʷ', 'ɡʲ',
              'k', "k'", 'kː', "k'ː", 'kʷ', 'kʷ', "k'ʷ", 'kːʷ', "k'ːʷ", 'kʲ', "kʲ'", 'kʲː',
              'ɢ','ɢʷ', 
              'q', "q'", 'qː', "q'ː", 'qʷ', "q'ʷ", 'qːʷ', "q'ːʷ",
              'ʔ', 'ʔʷ', 
              'dz',
              'ts', 'tsː', "ts'", "tsː'", "ts'ː", 'tsʷ', "ts'ʷ", "ts'ːʷ",
              'dʒ', 'dʒʷ', 'tʃ', 'tʃː', "tʃ'", "tʃː'", "tʃ'ː", 'tʃʷ', "tʃ'ʷ", 'tʃːʷ', "tʃ'ːʷ",
              'tɬ', "tɬ'", 'tɬː', "tɬː'", "tɬ'ː", 'tɬʷ', "tɬ'ʷ", 'tɬːʷ', "tɬ'ːʷ",
              'qχ', "qχ'", 'qχː', "qχː'", 'qχʷ', "qχ'ʷ",
              'f', 
              'z', 'zː', 'zʷ',
              's', 'sː', "s'", "s'ː", 'sʷ', "s'ʷ", 'sːʷ', "s'ːʷ",
              'ʒ', 'ʒʷ',
              'ʃ', 'ʃː', "ʃ'ː", 'ʃʷ', 'ʃːʷ', "ʃ'ːʷ",
              'ɬ', 'ɬː', 'ɬʷ', 'ɬːʷ', 'ɬʲ',
              'x', 'xː', 'xʷ', 'xːʷ', 'xʲ', 
              'ʁ', 'ʁʷ',
              'χ', 'χː', 'χʷ', 'χːʷ',
              'ʕ', 'ʕʷ',
              'ħ', 'ħː', 'ħʷ',
              'h', 'hʷ',
              'm', 'mː',
              'w',
              'n', 'nː', 'nʷ', 'nʲ',
              'r', 'rʷ', 'rʲ', 
              'l', 'lː', 'lʷ', 'lʲ',
              'j']

# Reference list of the symbols that are classified as vowels; a symbol is only
# checked against this list when it was not found in `consonants`.
vowels = ['a', 'aː','ˌa', 'ˌaː', "a'", 'ã', 'ãː',
          'i', 'ˌi', 'iː', 'ĩ', 'ĩː',
          'e', 'ˌe', 'eː', 'ẽ', 'ẽː',
          'o', 'ˌo', 'oː', 'õ', 'õː',
          'u', 'uː', 'ˌu', "u'", 'ũ', 'ũː']


## 1. Preprocessing the data

Preprocessing for the level of phonetics

In [4]:
# Count how often every transcription symbol occurs in each language and write
# the counts to '<glottocode>phonemes.json' (one file per entry of `languages`).
#
# Each row is attributed to a language through its own 'glottocode' value, so
# the result does not depend on the rows being grouped by language or on the
# groups appearing in the same order as `languages`. A language without any
# row still gets an (empty) file, so the following cells can always open it.
phoneme_counts = {language: {} for language in languages}
unknown_glottocodes = set()

with open('andic_dicts.csv', encoding='utf8') as csv_database:
    database = csv.DictReader(csv_database, delimiter=',')

    for row in database:
        phon_dict = phoneme_counts.get(row['glottocode'])
        if phon_dict is None:
            # The row belongs to a language that is not part of the comparison.
            unknown_glottocodes.add(row['glottocode'])
            continue

        # Segments of a transcription are separated by '-'.
        for sound in row['ipa'].split('-'):
            sound = sound.replace('(', '').replace(')', '')
            if len(sound) == 1:
                tokens = [sound]
            else:
                # A longer segment may span a word boundary (space); a leading
                # apostrophe on each part is dropped before counting.
                tokens = [token[1:] if token.startswith("'") else token
                          for token in sound.split(' ')]
            for token in tokens:
                # Empty tokens come from doubled/trailing spaces or from a bare
                # apostrophe; they are not sounds and are skipped.
                if token:
                    phon_dict[token] = phon_dict.get(token, 0) + 1

if unknown_glottocodes:
    print('Skipped rows with glottocodes outside `languages`:', sorted(unknown_glottocodes))

for language, phon_dict in phoneme_counts.items():
    with open(language + 'phonemes.json', 'w', encoding='utf-8') as f:
        json.dump(phon_dict, f, ensure_ascii=False, indent=2)

Preprocessing for the level of functional phonetics

In [14]:
# For every language, record in which word positions each transcription segment
# occurs ('first', 'second', 'penultimate', 'last') and write the result to
# '<glottocode>places.json' (one file per entry of `languages`).
#
# Each row is attributed to a language through its own 'glottocode' value, so
# the result does not depend on the order of the rows in the CSV file.
# NOTE(review): unlike the phoneme-counting cell, parentheses and leading
# apostrophes are not stripped from the segments here -- confirm this is intended.
position_order = ('first', 'second', 'penultimate', 'last')
places_by_language = {language: {} for language in languages}
unknown_glottocodes = set()

with open('andic_dicts.csv', encoding='utf8') as csv_database:
    database = csv.DictReader(csv_database, delimiter=',')

    for row in database:
        place_dict = places_by_language.get(row['glottocode'])
        if place_dict is None:
            # The row belongs to a language that is not part of the comparison.
            unknown_glottocodes.add(row['glottocode'])
            continue

        # A transcription may contain several space-separated words; a
        # single-word entry is simply the one-element case of the same loop.
        for word in row['ipa'].split(' '):
            segments = word.split('-')

            place_dict.setdefault(segments[0], set()).add('first')
            # A one-segment word only counts as 'first'.
            if len(segments) > 1:
                place_dict.setdefault(segments[1], set()).add('second')
                place_dict.setdefault(segments[-2], set()).add('penultimate')
                place_dict.setdefault(segments[-1], set()).add('last')

if unknown_glottocodes:
    print('Skipped rows with glottocodes outside `languages`:', sorted(unknown_glottocodes))

for language, place_dict in places_by_language.items():
    # Emit the positions in a fixed order so that the JSON files (and every
    # output derived from them) are identical from run to run.
    ordered_places = {
        segment: [position for position in position_order if position in positions]
        for segment, positions in place_dict.items()
    }
    with open(language + 'places.json', 'w', encoding='utf-8') as f:
        json.dump(ordered_places, f, ensure_ascii=False, indent=2)

## 2. Comparison on the Level of Phonetics
First, we compare the sound inventories at the level of phonetics. For this purpose, we build a dictionary whose keys are languages and whose values are dictionaries with two keys, `consonants` and `vowels`, each holding the list of the corresponding sounds found in that language.

In [6]:
# Split the symbols counted for every language into consonants and vowels,
# using the `consonants` and `vowels` reference lists, and save the result.
all_phonemes = {}

for language in languages:
    with open(language + 'phonemes.json', encoding='utf8') as f:
        counts = json.load(f)

    inventory = {'consonants': [], 'vowels': []}
    for phoneme in counts:
        if phoneme in consonants:
            kind = 'consonants'
        elif phoneme in vowels:
            kind = 'vowels'
        else:
            # The symbol is in neither reference list and is left out.
            continue
        inventory[kind].append(phoneme)

    all_phonemes[language] = inventory

with open('phonemes by language.json', 'w', encoding='utf8') as f:
    json.dump(all_phonemes, f, ensure_ascii=False, indent=2)

Here we count, for every phoneme, the number of languages in which it appears, so that we can look at the distribution.

In [7]:
# For every consonant and vowel, count the number of languages whose inventory
# contains it, then order the counts from the most to the least widespread.
with open('phonemes by language.json', encoding='utf8') as f:
    dct = json.load(f)

c_num = {}
v_num = {}
tally_by_kind = {'consonants': c_num, 'vowels': v_num}

for inventory in dct.values():
    for kind, sounds in inventory.items():
        tally = tally_by_kind.get(kind)
        if tally is None:
            # Only the two known kinds are counted.
            continue
        for s in sounds:
            tally[s] = tally.get(s, 0) + 1

# sorted() is stable, so phonemes with equal counts keep their first-seen order.
c_num_sort = {s: c_num[s] for s in sorted(c_num, key=c_num.get, reverse=True)}
v_num_sort = {s: v_num[s] for s in sorted(v_num, key=v_num.get, reverse=True)}

print(c_num_sort)
print(v_num_sort)

{'b': 9, 'd': 9, 'ʒ': 9, 'r': 9, 'χ': 9, 'ʃ': 9, 'w': 9, 'ɬ': 9, 'l': 9, 'j': 9, "k'": 9, 's': 9, 'ʁ': 9, 'z': 9, 'h': 9, 'ɡʷ': 9, 'k': 9, 'ɡ': 9, 'n': 9, 't': 9, 'm': 9, "tʃ'": 9, 'sː': 9, "q'": 9, 'ɬː': 9, 'ʔ': 9, 'χː': 9, 'tʃ': 9, "t'": 9, 'p': 9, 'ts': 9, "k'ʷ": 9, 'ʃː': 9, 'ʕ': 9, 'q': 9, "ts'": 9, 'tʃː': 9, 'kʷ': 9, 'tɬ': 8, 'ħ': 8, 'χʷ': 8, 'ʁʷ': 8, 'χːʷ': 8, 'qʷ': 8, 'tʃʷ': 8, "t'ʷ": 8, "q'ʷ": 8, 'x': 8, 'dʒ': 7, "p'": 7, 'tsː': 7, "tɬ'": 7, "tʃ'ː": 7, 'sʷ': 7, 'kː': 7, 'hʷ': 7, "tʃ'ʷ": 7, 'zʷ': 7, 'dː': 7, 'ʒʷ': 6, 'dʷ': 6, "k'ː": 6, "ts'ː": 6, "k'ːʷ": 6, 'sːʷ': 6, 'tʷ': 6, "tɬ'ʷ": 6, 'ʃːʷ': 6, 'xʷ': 6, 'ħʷ': 5, 'tɬʷ': 5, 'lː': 4, 'mː': 4, 'tsːʷ': 4, "ts'ːʷ": 4, "ts'ʷ": 4, 'kʲ': 4, 'tsʷ': 4, 'kːʷ': 4, 'ʃʷ': 3, 'xː': 3, 'ɬːʷ': 3, "tʃ'ːʷ": 3, 'ʔʷ': 3, 'lʲ': 3, 'bː': 3, "kʲ'": 3, 'nː': 3, 'tɬː': 2, "q'ː": 2, 'xːʷ': 2, 'ɬʷ': 2, 'ɢ': 2, 'tː': 2, 'pː': 2, "s'ː": 2, "s'": 2, "t'ː": 2, 'rʷ': 2, 'nʷ': 2, 'lʷ': 2, 'zː': 2, 'ɡʲ': 2, 'xʲ': 2, 'kʲː': 2, 'tʃːʷ': 2, 'bʷ': 2, 'qː': 1, "tɬ'ː":

Here we count the number of phonemes in each language to figure out what languages are most complex at the level of phonetics. 

In [8]:
# Print the size of the consonant and vowel inventory of every language.
with open('phonemes by language.json', encoding='utf8') as f:
    phon = json.load(f)

for language, inventory in phon.items():
    consonant_count = len(inventory['consonants'])
    vowel_count = len(inventory['vowels'])
    print(language, 'consonants:', consonant_count, 'vowels', vowel_count)

akhv1239 consonants: 87 vowels 18
andi1255 consonants: 68 vowels 5
bagv1239 consonants: 85 vowels 18
botl1242 consonants: 73 vowels 16
cham1309 consonants: 89 vowels 25
ghod1238 consonants: 68 vowels 14
kara1474 consonants: 80 vowels 17
tind1238 consonants: 90 vowels 25
toki1238 consonants: 51 vowels 9


## 3. Comparison on the Level of Functional Phonetics

In [17]:
def _record_places(phonemes, phon_places, lang, places_by_phoneme):
    """Add `lang` to the position lists of every phoneme in `phonemes`.

    phonemes          -- iterable of phoneme symbols to look up
    phon_places       -- mapping phoneme -> list of positions for this language,
                         as loaded from '<glottocode>places.json'
    lang              -- identifier of the language being processed
    places_by_phoneme -- mapping phoneme -> {position: [languages]}, updated in place

    A phoneme without an entry in `phon_places` is recorded under the position
    'middle' for this language.
    """
    for phoneme in phonemes:
        places = phon_places.get(phoneme) or ['middle']
        # setdefault creates the per-phoneme dict on first use, so a phoneme
        # missing from the very first language no longer raises a KeyError.
        by_place = places_by_phoneme.setdefault(phoneme, {})
        for place in places:
            by_place.setdefault(place, []).append(lang)


# Keep only the phonemes that occur in every language under comparison.
# Relies on `c_num_sort` and `v_num_sort` computed in the cell that counts the
# number of languages per phoneme.
cons_all = [k for k, v in c_num_sort.items() if v == len(languages)]
vow_all = [k for k, v in v_num_sort.items() if v == len(languages)]
cons_all_dict = {}
vow_all_dict = {}

for lang in languages:
    with open(lang + 'places.json', encoding='utf8') as f:
        phon_places = json.load(f)

    # Consonants and vowels are handled identically, including the 'middle'
    # fallback for phonemes without a recorded position.
    _record_places(cons_all, phon_places, lang, cons_all_dict)
    _record_places(vow_all, phon_places, lang, vow_all_dict)

with open('consonants places.json', 'w', encoding='utf8') as f:
    json.dump(cons_all_dict, f, ensure_ascii=False, indent=2)
with open('vowels places.json', 'w', encoding='utf8') as f:
    json.dump(vow_all_dict, f, ensure_ascii=False, indent=2)

In [20]:
# For every shared vowel and position, print the number of languages in which
# the vowel occurs in that position.
for vowel, langs_by_place in vow_all_dict.items():
    for place, langs in langs_by_place.items():
        print(vowel, place, len(langs))

a second 9
a first 9
a penultimate 9
a last 9
i second 9
i first 9
i penultimate 9
i last 9
e second 9
e first 9
e penultimate 8
e last 9
u second 9
u first 9
u penultimate 9
u last 9
o second 9
o first 9
o penultimate 9
o last 9


In [22]:
# For every shared consonant, print the positions that are attested in fewer
# than 9 languages, together with the languages that do have them.
for consonant, langs_by_place in cons_all_dict.items():
    for place, langs in langs_by_place.items():
        if len(langs) >= 9:
            continue
        print(consonant, place, langs)

d second ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
d last ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʒ second ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʒ first ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʒ last ['andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474']
χ last ['andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238', 'toki1238']
ʃ second ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʃ first ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʃ penultimate ['akhv1239', 'andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara1474', 'tind1238']
ʃ last ['andi1255', 'bagv1239', 'botl1242', 'cham1309', 'ghod1238', 'kara