### Correspondence between Standard Mandarin and Teochew Language 

In [1]:
import pandas as pd 
import sys 
import re

In [2]:
# NOTE: this notebook requires Python 3.7 or newer, because str.isascii() is not available in older versions

In [3]:
%config IPCompleter.greedy=True

In [4]:
# The .tsv sources were converted to .csv with an online tool beforehand.
# All raw inputs live under `raw_data`; the CLICS dump sits in a subfolder of it.
raw_data = '../data/raw_data/'
clics_raw = raw_data + 'lexibank-beidasinitic-a94870e/raw/'

In [5]:
# Read the full dialect table from the raw-data folder.
dialects_pd = pd.read_csv(raw_data + 'output-dialects.txt')

# Keep only the rows whose DOCULECT is 'Chaozhou' (the Teochew data).
teochew_pd = dialects_pd.loc[dialects_pd['DOCULECT'] == 'Chaozhou']

In [6]:
# Work on a copy restricted to the three columns used below; the Teochew-specific
# columns get a `_teo` suffix, while CHINESE keeps its name.
merged_pd = (
    teochew_pd.copy()[['BENZI_IN_SOURCE', 'SEGMENTS', 'CHINESE']]
    .rename(columns={'BENZI_IN_SOURCE': 'BENZI_IN_SOURCE_teo',
                     'SEGMENTS': 'SEGMENTS_teo',
                     'CHINESE': 'CHINESE'})
)
merged_pd.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE
20,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕
40,我,u a ⁵³,我
59,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅
76,!,s u ²³/³³ s u ³³,稍微
91,下,e ¹¹,下(打一下)


### Data Wrangling

In [7]:
# Characters stripped from BENZI_IN_SOURCE_teo: ASCII letters and digits, ASCII and
# full-width punctuation, placeholder glyphs (★ □ …) and any whitespace.
# Written as raw strings so that `\s` and `\]` reach the regex engine unchanged
# instead of being (invalid) string-literal escapes.
NON_HANZI_PATTERN = re.compile(
    r'[a-zA-Z0-9’!"#$%&\'() '
    r'*+,-./:;<=>?@，。?★、…【】□'
    r'《》？“”‘’！\[\]^_`{|}~\s]+'
)

# Keep the first row for each source spelling (no in-place mutation, so the cell
# can be re-run safely).
merged_pd = merged_pd.drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first')

# Remove extra characters and spaces from BENZI_IN_SOURCE_teo.
# str(x) turns a missing value into 'nan', which the pattern then strips to "".
merged_pd['BENZI_IN_SOURCE_teo'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(
    lambda x: NON_HANZI_PATTERN.sub("", str(x)))

# Remove rows for which no Chinese characters are left for the Teochew pronunciation.
merged_pd = merged_pd.loc[merged_pd['BENZI_IN_SOURCE_teo'] != ""]
merged_pd.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE
20,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕
40,我,u a ⁵³,我
59,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅
91,下,e ¹¹,下(打一下)
114,算盤,s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵,算盤


### Convert Traditional Chinese to Simplified Chinese/Pinyin

In [8]:
from hanziconv import HanziConv
from xpinyin import Pinyin

In [9]:
# One converter instance is shared by every row.
p = Pinyin()

# Simplified-character form of the Teochew source spelling.
merged_pd['BENZI_man'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(HanziConv.toSimplified)

# Pinyin of the simplified form, with tones written as trailing digits.
merged_pd['pinyin'] = merged_pd['BENZI_man'].apply(
    lambda hanzi: p.get_pinyin(hanzi, tone_marks='numbers'))

In [10]:
merged_pd.head(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE,BENZI_man,pinyin
20,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕,三十夜,san1-shi2-ye4
40,我,u a ⁵³,我,我,wo3
59,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅,裌裘,jia2-qiu2
91,下,e ¹¹,下(打一下),下,xia4
114,算盤,s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵,算盤,算盘,suan4-pan2
132,伴,pʰ ũ ã ∼ ³⁵,陪,伴,ban4
133,陪,p u e ⁵⁵,陪,陪,pei2
154,疊,tʰ i ə p ⁴,疊(堆疊),叠,die2
172,事,s ɿ ¹¹,事情,事,shi4
192,下掛,e ²¹/³⁵ k u a ²¹³,下午,下挂,xia4-gua4


### Separate Citation Tones from Sandhi Tones for Teochew

In [11]:
# Translation tables mapping Unicode subscript / superscript digits to ASCII digits.
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")

# Segments that count as a syllable initial when they open a syllable.
CONSONANTS = {
    'm', 'n', 'ŋ',
    'p', 't', 'k',
    'pʰ', 'tʰ', 'kʰ',
    'b', 'g',
    'ts', 'tsʰ', 'dz',
    's', 'z',
    'l',
    'h', 'h̃',
}

# Same transcription as SEGMENTS_teo, with superscript tone digits rewritten as ASCII digits.
merged_pd['citation_teo'] = merged_pd['SEGMENTS_teo'].apply(
    lambda segments: segments.translate(SUP))

def get_teochew_parts(s, delimit=' '):
    """Split a segmented Teochew transcription into per-syllable parts.

    The transcription is a sequence of blocks separated by ``delimit``. A block
    that contains a digit is a tone block and closes the current syllable; when
    it has the form ``a/b`` only the part after the last ``/`` is kept.

    Parameters
    ----------
    s : str
        Segmented transcription, e.g. ``'s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵'``.
        Superscript digits are converted to ASCII digits first.
    delimit : str, default ' '
        Separator between blocks.

    Returns
    -------
    (citation, initials, finals) : tuple of three lists of str
        One entry per syllable, always of equal length. ``initials`` holds ""
        for a syllable that does not start with a member of ``CONSONANTS``.
        Segments after the last tone block belong to no complete syllable and
        are dropped.
    """
    s = s.translate(SUP)
    citation, initials, finals = [], [], []
    initial, final = "", ""
    at_onset = True  # True until the first segment of the current syllable is seen
    for block in s.split(delimit):
        if not block:
            # Empty input, doubled or trailing delimiters produce empty blocks.
            continue
        if any(c.isdigit() for c in block):
            # Tone block: emit the finished syllable so the three lists stay aligned.
            citation.append(block.split('/')[-1])
            initials.append(initial)
            finals.append(final)
            initial, final = "", ""
            at_onset = True
        elif at_onset and block in CONSONANTS:
            # Only the very first segment of a syllable can be its initial.
            initial = block
            at_onset = False
        else:
            final += block
            at_onset = False
    return citation, initials, finals

# Try the parser on one transcription with initials and one without.
for sample_segments in ('s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵', 'e ~ ²¹/³⁵ a u ~ ²¹³'):
    print(get_teochew_parts(sample_segments))

(['213', '55'], ['s', 'p'], ['ɯŋ', 'ũã∼'])
(['35', '213'], ['', ''], ['e~', 'au~'])


In [12]:
# Each row yields a (citation, initials, finals) triple; zip(*...) regroups the
# triples into three column-sized sequences.
(merged_pd['citation_teo'],
 merged_pd['initial_teo'],
 merged_pd['final_teo']) = zip(*merged_pd['citation_teo'].apply(get_teochew_parts))

In [13]:
merged_pd.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE,BENZI_man,pinyin,citation_teo,initial_teo,final_teo
20,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕,三十夜,san1-shi2-ye4,"[33, 4, 55]","[s, ts, m]","[ã∼, ap, e]"
40,我,u a ⁵³,我,我,wo3,[53],[],[ua]
59,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅,裌裘,jia2-qiu2,"[21, 55]","[k, h]","[oiʔ, ĩũ∼]"
91,下,e ¹¹,下(打一下),下,xia4,[11],[],[e]
114,算盤,s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵,算盤,算盘,suan4-pan2,"[213, 55]","[s, p]","[ɯŋ, ũã∼]"


In [14]:
# Pinyin initials (including the two-letter zh/ch/sh and the glides w/y).
INITIALS = set('b p m f d t n l g k h j q x zh ch sh r z c s w y'.split())

# Pinyin finals; 'v' stands in for ü.
FINALS = set(
    'in ao ve en ai ei ie an ong ui eng '
    'e ing u v er i a ang un o iu vn '
    'ia ian iang iao iong ua uai uan uang uo van'.split()
)

initial_count, final_count = len(INITIALS), len(FINALS)
print("Total number of initials is {} and finals is {}".format(initial_count, final_count))

Total number of initials is 23 and finals is 34


In [15]:
def get_parts(x):
    """Split tone-numbered pinyin into per-syllable initials and finals.

    Parameters
    ----------
    x : str
        Syllables joined by '-', each ending in one tone character,
        e.g. ``'suan4-pan2'``.

    Returns
    -------
    (initials, finals) : tuple of two lists of str
        One entry per '-'-separated piece. The initial is "" when the syllable
        does not start with a member of ``INITIALS``. The final is the rest of
        the syllable without its last character (the tone digit). After
        j/q/x/y a leading 'u' in the final is rewritten as 'v'.
    """
    initials = []
    finals = []
    for syllable in x.split('-'):
        # Slices are used instead of indexing so that empty or one-character
        # pieces yield empty parts instead of raising IndexError.
        if len(syllable) >= 2 and syllable[:2] in INITIALS:
            # two-letter initials are tried first
            initials.append(syllable[:2])
            finals.append(syllable[2:-1])

        elif syllable[:1] in INITIALS:
            initials.append(syllable[0])
            # convert u to v for special cases
            if syllable[0] in ['j', 'q', 'x', 'y'] and syllable[1:2] == 'u':
                finals.append(syllable[1:-1].replace('u', 'v'))
            else:
                finals.append(syllable[1:-1])

        else:
            initials.append("")
            finals.append(syllable[:-1])
    return initials, finals
    
# Try the splitter on a multi-syllable example.
sample_pinyin = 'a4-yun4-hui4-yi1-ge4-kuan1-kuo4-de5-jian1-bang3'
print(get_parts(sample_pinyin))

(['', 'y', 'h', 'y', 'g', 'k', 'k', 'd', 'j', 'b'], ['a', 'vn', 'ui', 'i', 'e', 'uan', 'uo', 'e', 'ian', 'ang'])


In [16]:
def filter_non_pinyin(df):
    """Return True when a row's ``pinyin`` value is pure ASCII once spaces are removed.

    ``df`` is a single row (anything indexable by ``'pinyin'`` that yields a str).
    Rows failing this test still contain non-ASCII characters in the pinyin
    column, presumably characters the converter left untransliterated.
    Requires Python 3.7+ for ``str.isascii``.
    """
    return df['pinyin'].replace(" ", "").isascii()

# `reduce=True` was deprecated and later removed from DataFrame.apply;
# result_type='reduce' is its replacement and still returns a boolean Series
# (also for an empty frame), which is what the row mask needs.
merged_pd = merged_pd[merged_pd.apply(filter_non_pinyin, axis=1, result_type='reduce')]

  after removing the cwd from sys.path.


In [17]:
# Mandarin tone of each syllable: the last character of every '-'-separated piece.
merged_pd['citation_man'] = merged_pd['pinyin'].apply(
    lambda numbered: [syllable[-1] for syllable in numbered.split('-')])

# get_parts returns an (initials, finals) pair per row; zip(*...) regroups them by column.
(merged_pd['initial_man'],
 merged_pd['final_man']) = zip(*merged_pd['pinyin'].apply(get_parts))

In [18]:
# Columns that are expanded to one row per character / syllable below.
LIST_COLUMNS = ['BENZI_IN_SOURCE_teo', 'BENZI_man', 'citation_teo',
                'initial_teo', 'final_teo',
                'citation_man', 'initial_man', 'final_man']

merged_pd_copy = pd.DataFrame(merged_pd, columns=LIST_COLUMNS)

# Keep only words whose Teochew and Mandarin readings have the same number of tones.
merged_pd_copy = merged_pd_copy[merged_pd_copy.citation_teo.map(len)==merged_pd_copy.citation_man.map(len)]

# The two character columns hold plain strings; turn them into lists of single
# characters so they can be exploded like the other (already list-valued) columns.
per_word = merged_pd_copy.assign(
    BENZI_IN_SOURCE_teo=merged_pd_copy['BENZI_IN_SOURCE_teo'].apply(lambda x: list(x)),
    BENZI_man=merged_pd_copy['BENZI_man'].apply(lambda x: list(x)),
)

# Each column is exploded on its own and the results are glued side by side, which
# only lines up when every list in a row has the same length. Rows where any of
# the eight lengths disagree are dropped instead of breaking the concat.
list_lengths = pd.DataFrame({column: per_word[column].map(len) for column in LIST_COLUMNS})
is_aligned = list_lengths.eq(list_lengths['citation_teo'], axis=0).all(axis=1)
per_word = per_word[is_aligned]

combined_data = pd.concat([per_word[column].explode() for column in LIST_COLUMNS], axis=1)

# One row per distinct Teochew character (first occurrence wins).
combined_data = combined_data.drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first')

In [19]:
# drop the 52 tone since there are only two cases, probably transcription error
# na=False treats a missing tone as "does not contain 52", so the mask stays boolean.
combined_data = combined_data.loc[~combined_data.citation_teo.str.contains('52', na=False)]

# Show a random sample as the cell output; capped so a small frame does not raise.
combined_data.sample(min(20, len(combined_data)))

In [38]:
# Rows whose Mandarin final is 'iao'.
combined_data[combined_data['final_man'].isin(['iao'])]

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
916,蕉,蕉,33,ts,ie,1,j,iao
1177,骹,骹,33,kʰ,a,1,q,iao
1601,橋,桥,55,k,ie,2,q,iao
1732,小,小,53,s,ie,3,x,iao
1940,膠,胶,33,kʰ,a,1,j,iao
1978,鉸,铰,33,k,a,3,j,iao
2099,叫,叫,213,k,ie,4,j,iao
2480,交,交,33,k,au,1,j,iao
2625,寮,寮,55,l,iəu,2,l,iao
3158,料,料,11,l,iəu,4,l,iao


In [None]:
# How often each Mandarin initial occurs.
combined_data['initial_man'].value_counts()

In [None]:
#model_input = combined_data.to_csv('../data/clean_data/model_input.csv', index = None, header=True)

### Group the Citation Tones by Teochew/Mandarin

In [None]:
teochew_citation = combined_data['citation_teo'].tolist()
mandarin_citation = combined_data['citation_man'].tolist()

# Count how often each (Teochew tone, Mandarin tone) pair occurs.
dic = {}
for tone_pair in zip(teochew_citation, mandarin_citation):
    dic[tone_pair] = dic.get(tone_pair, 0) + 1

# Total number of counted pairs.
print(sum(dic.values()))

In [None]:
# Descriptive label for each Teochew citation tone, keyed by its tone digits.
teochew_mapping = {'33':'mid', '11':'low', '21':'low_checked',
                   '213':'low_rising', '35':'high_rising',
                   '4':'high_checked', '53':'falling', '55':'high'}

# Descriptive label for each Mandarin tone number. '5' is the neutral tone, which
# appears in the numbered pinyin used in this notebook (e.g. 'de5'); without it
# the label lookup would raise KeyError for such syllables.
mandarin_mapping = {'1': 'high', '2': 'rising', '3': 'dipping', '4': 'falling',
                    '5': 'neutral'}

In [None]:
# One record per (Teochew tone, Mandarin tone) pair, labelled as 'name(digits)'.
# The frame is built once from the full list instead of being grown row by row,
# which also gives `frequency` a numeric dtype.
# An unmapped tone raises KeyError here on purpose, so it is not silently mislabelled.
freq_records = [
    [teochew_mapping[teochew_tone]+'({})'.format(teochew_tone),
     mandarin_mapping[mandarin_tone]+'({})'.format(mandarin_tone),
     freq]
    for (teochew_tone, mandarin_tone), freq in dic.items()
]
freq_pd = pd.DataFrame(freq_records, columns=['teochew_tone', 'mandarin_tone', 'frequency'])
freq_pd.head()

### Data Visualization

In [None]:
# Teochew tone (rows) x Mandarin tone (columns) table of frequencies.
# Each tone pair occurs once in freq_pd, so 'sum' returns that pair's frequency;
# unlike an identity lambda it is a real reducer, which is what aggfunc expects.
# Pairs that never occur stay NaN.
pd.crosstab(freq_pd["teochew_tone"], freq_pd['mandarin_tone'], values=freq_pd['frequency'],
            aggfunc='sum')

### Export the dataframe to R for Visualization 

In [None]:
#export_csv = freq_pd.to_csv('../output/r_input.csv', index = None, header=True)