## Packages

In [1]:
import pandas as pd
import re

## Load datasets

### Corpus

In [2]:
file_name = "PtLanka.csv"
column_name = "SriLanka_creole"

# Read the corpus with empty cells kept as "" (keep_default_na=False) rather
# than NaN, then keep only the creole column and expose it as "word".
word_df = (
    pd.read_csv(file_name, keep_default_na=False)
    .loc[:, [column_name]]
    .rename(columns={column_name: "word"})
)
word_df

Unnamed: 0,word
0,Abaço
1,Aberçà
2,abersà
3,aburçá
4,Aburcé
...,...
2517,Wine
2518,Zumbá
2519,Zomberías
2520,zombrias


### IPA

In [3]:
file_name = "IPA_replacement_rules.csv"

# Table of regex patterns ("reg") and their IPA replacements ("IPA"),
# consumed row by row by the IPA conversion step; blank cells stay "".
IPA_rules = pd.read_csv(filepath_or_buffer=file_name, keep_default_na=False)
IPA_rules

Unnamed: 0,reg,IPA
0,z$,s
1,c$,k
2,tch,t̠ʃ
3,ch,t̠ʃ
4,dj,d̠ʒ
5,j,d̠ʒ
6,g(?=[eéèêiíì]),d̠ʒ
7,gg,g
8,gu(?=[eéèêiíì]),g
9,[vw],ʋ


### Consonant and vowel classes

Before syllable parsing, we replace each phoneme with the symbol of a broader class:

- T: corresponds to obstruents (consonants like T, which typically occur only as the first unit of an onset);

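- R: corresponds to sonorants (consonants like R, which typically occur in the onset, as its first or second unit, or in the coda);

- S: corresponds to sibilants (e.g. s and z, which the rules table below maps to S);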

- V: corresponds to vowels;

- G: corresponds to glides.

In [4]:
file_name = "TRSVG_replacement_rules.csv"

# Table of phoneme patterns ("reg") and the class symbol ("TRSVG") each one
# is rewritten to in the class conversion step; blank cells stay "".
TRSVG_rules = pd.read_csv(filepath_or_buffer=file_name, keep_default_na=False)
TRSVG_rules

Unnamed: 0,reg,TRSVG
0,s,S
1,g,T
2,k,T
3,ʋ,T
4,ɲ,R
5,ʎ,R
6,z,S
7,p,T
8,b,T
9,t,T


### Consonant and vowel class exceptions

In [5]:
file_name = "TRSVG_exceptions.csv"

# Patterns ("reg") whose class conversion ("TRSVG") is spelled out by hand;
# the class conversion step applies them before the general rules.
TRSVG_exceptions = pd.read_csv(filepath_or_buffer=file_name, keep_default_na=False)
TRSVG_exceptions

Unnamed: 0,reg,TRSVG
0,papagaia,papagVGa
1,peɾseguaisaːo,peɾseguVGsaːo
2,peɾsuaidiɾ,peɾsuVGdiɾ
3,aio,VGo


### Word split rules

In [6]:
file_name = "word_split_rules.csv"

# Table of class-symbol patterns ("reg") and the same sequence with a
# syllable boundary inserted ("split"); blank cells stay "".
word_split_rules = pd.read_csv(filepath_or_buffer=file_name, keep_default_na=False)
word_split_rules

Unnamed: 0,reg,split
0,VT,V.T
1,VGT,VG.T
2,RT,R.T
3,ST,S.T
4,VRV,V.RV
5,VRVG,V.RVG
6,VGRV,VG.RV
7,VGRVG,VG.RVG
8,RSV,R.SV
9,RSVG,R.SVG


## Preprocessing

Convert to lower case

In [7]:
# Lower-case every word before the rule-based steps that follow.
word_df = word_df.assign(word=word_df["word"].str.lower())
word_df

Unnamed: 0,word
0,abaço
1,aberçà
2,abersà
3,aburçá
4,aburcé
...,...
2517,wine
2518,zumbá
2519,zomberías
2520,zombrias


Remove spurious characters

In [8]:
# Regex patterns for spurious characters, each replaced by a single space.
# The ellipsis pattern is a raw string: in a plain literal '\.' is an invalid
# escape sequence, which newer Python versions warn about at compile time.
# The pattern's value is unchanged.
chars = ['-', '  ', r'\.\.\.']
subst = [' ', ' ', ' ']

# Map each pattern to its substitute.
preproc = dict(zip(chars, subst))

# NOTE(review): a space produced by one pattern (e.g. from '-' or '...') can
# land next to an existing space and is not guaranteed to be collapsed by the
# '  ' pattern -- confirm whether a final whitespace-collapsing pass is wanted.
word_df['word'] = word_df['word'].replace(preproc, regex=True)

word_df

Unnamed: 0,word
0,abaço
1,aberçà
2,abersà
3,aburçá
4,aburcé
...,...
2517,wine
2518,zumbá
2519,zomberías
2520,zombrias


## Convert to IPA

In [9]:
# Rewrite the cleaned spelling into IPA by applying the rules one after
# another in table order, so each rule sees the output of the previous ones.
ipa_forms = word_df['word']
for pattern, replacement in zip(IPA_rules['reg'], IPA_rules['IPA']):
    ipa_forms = ipa_forms.str.replace(pattern, replacement, regex=True)
word_df['IPA'] = ipa_forms

word_df

Unnamed: 0,word,IPA
0,abaço,abaso
1,aberçà,abeɾsaː
2,abersà,abeɾsaː
3,aburçá,abuɾsaː
4,aburcé,abuɾseː
...,...,...
2517,wine,ʋine
2518,zumbá,zumbaː
2519,zomberías,zombeɾiːas
2520,zombrias,zombɾias


## Convert to consonant and vowel classes

In [10]:
# Start from the IPA forms. The exceptions table is applied first, so its
# hand-written class strings are in place before the general rules run.
class_forms = word_df['IPA']
for rule_table in (TRSVG_exceptions, TRSVG_rules):
    for pattern, replacement in zip(rule_table['reg'], rule_table['TRSVG']):
        class_forms = class_forms.str.replace(pattern, replacement, regex=True)
word_df['TRSVG'] = class_forms

word_df

Unnamed: 0,word,IPA,TRSVG
0,abaço,abaso,VTVSV
1,aberçà,abeɾsaː,VTVRSV
2,abersà,abeɾsaː,VTVRSV
3,aburçá,abuɾsaː,VTVRSV
4,aburcé,abuɾseː,VTVRSV
...,...,...,...
2517,wine,ʋine,TVRV
2518,zumbá,zumbaː,SVRTV
2519,zomberías,zombeɾiːas,SVRTVRVVS
2520,zombrias,zombɾias,SVRTRVVS


## Split word

In [11]:
# Insert syllable boundaries ('.') into the class strings by applying the
# split rules one after another in table order.
# NOTE(review): str.replace only rewrites non-overlapping matches, so a rule
# such as VRV -> V.RV turns 'VRVRV' into 'V.RVRV' (one split, not two) --
# confirm that later rows of word_split_rules.csv cover such sequences.
split_forms = word_df['TRSVG']
for pattern, replacement in zip(word_split_rules['reg'], word_split_rules['split']):
    split_forms = split_forms.str.replace(pattern, replacement, regex=True)
word_df['TRSVG_split'] = split_forms

word_df

Unnamed: 0,word,IPA,TRSVG,TRSVG_split
0,abaço,abaso,VTVSV,V.TV.SV
1,aberçà,abeɾsaː,VTVRSV,V.TVR.SV
2,abersà,abeɾsaː,VTVRSV,V.TVR.SV
3,aburçá,abuɾsaː,VTVRSV,V.TVR.SV
4,aburcé,abuɾseː,VTVRSV,V.TVR.SV
...,...,...,...,...
2517,wine,ʋine,TVRV,TV.RV
2518,zumbá,zumbaː,SVRTV,SVR.TV
2519,zomberías,zombeɾiːas,SVRTVRVVS,SVR.TV.RV.VS
2520,zombrias,zombɾias,SVRTRVVS,SVR.TRV.VS


## Convert to CV

In [12]:
# Collapse the class symbols into plain C/V: G becomes V, and S, R and T all
# become C. Any other character in the split string is left untouched.
chars = ['G', '[SRT]']
subst = ['V', 'C']
to_cv = {pattern: symbol for pattern, symbol in zip(chars, subst)}

cv_forms = word_df['TRSVG_split'].replace(to_cv, regex=True)
word_df['CV'] = cv_forms

word_df

Unnamed: 0,word,IPA,TRSVG,TRSVG_split,CV
0,abaço,abaso,VTVSV,V.TV.SV,V.CV.CV
1,aberçà,abeɾsaː,VTVRSV,V.TVR.SV,V.CVC.CV
2,abersà,abeɾsaː,VTVRSV,V.TVR.SV,V.CVC.CV
3,aburçá,abuɾsaː,VTVRSV,V.TVR.SV,V.CVC.CV
4,aburcé,abuɾseː,VTVRSV,V.TVR.SV,V.CVC.CV
...,...,...,...,...,...
2517,wine,ʋine,TVRV,TV.RV,CV.CV
2518,zumbá,zumbaː,SVRTV,SVR.TV,CVC.CV
2519,zomberías,zombeɾiːas,SVRTVRVVS,SVR.TV.RV.VS,CVC.CV.CV.VC
2520,zombrias,zombɾias,SVRTRVVS,SVR.TRV.VS,CVC.CCV.VC


In [13]:
# Write the whole table to disk with a header row and without the index column.
word_df.to_csv("crioulo_IPA_CV.csv", header=True, index=False)