### Correspondence between Beijing Mandarin and Teochew Language 

In [163]:
import pandas as pd 
import sys 
import re

In [164]:
# Editor convenience only: switches on IPython's greedy tab-completion; it does not affect any result below.
%config IPCompleter.greedy=True

In [165]:
# The source .tsv was converted to .csv with an online tool before loading.
# Both locations are relative paths; the lexibank checkout sits under the raw-data folder.
raw_data = '../raw_data/'
clics_raw = raw_data + 'lexibank-beidasinitic-a94870e/raw/'

In [166]:
# Load the converted dialect table (one row per word form and doculect).
dialects_path = raw_data + 'output-dialects.txt'
dialects_pd = pd.read_csv(dialects_path)

In [167]:
# Preview the first rows to check that the file parsed into the expected columns.
dialects_pd.head()

Unnamed: 0,ID,DOCULECT,GLOTTOLOG,ISO,CONCEPT,CONCEPTICON_ID,CHINESE,PINYIN,BENZI,BENZI_IN_SOURCE,...,FORM,SEGMENTS,COGID,COGIDS,NOTE,SOURCE,BEIDA_ID,PAGE,ORDER,ALIGNMENTS
0,16629,Beijing,beij1234,cmn,Chinese New Year's Eve (chú xī 除夕),2188.0,除夕,chú xī,大 年 三 十,大年三十兒,...,ta⁵¹nian³⁵san⁵⁵ʂʅr³⁵,t a ⁵¹ n i a n ³⁵ s a n ⁵⁵ ʂ */ʅ ³⁵,1.0,47 391 392 393,,BeijingDaxue1964,48,24,1,t a - - ⁵¹ + n i - a n - ³⁵ + s a n ⁵⁵ + ʂ */ʅ...
1,16630,Beijing,beij1234,cmn,Chinese New Year's Eve (chú xī 除夕),2188.0,除夕,chú xī,三 十 晚 上,三十兒晚上,...,san⁵⁵ʂʅr³⁵uan²¹⁴₂₁ʂaŋ⁰,s a n ⁵⁵ ʂ */ʅ ³⁵ u a n ²¹/²¹⁴ ʂ a ŋ ⁰,2.0,392 393 394 96,[2],BeijingDaxue1964,48,24,2,s a n ⁵⁵ + ʂ */ʅ - - ³⁵ + - u a n - ²¹/²¹⁴ + ʂ...
2,16631,Jinan,jina1245,cmn,Chinese New Year's Eve (chú xī 除夕),2188.0,除夕,chú xī,三 十,三十兒,...,sæ̃²¹³ʂʅr⁴²,s æ̃ ∼ ²¹³ ʂ */ʅ ⁴²,3.0,392 393,,BeijingDaxue1964,48,24,1,s æ̃ ∼ ²¹³ + ʂ */ʅ - - ⁴²
3,16632,Shenyang,shen1252,cmn,Chinese New Year's Eve (chú xī 除夕),2188.0,除夕,chú xī,三 十,三十兒,...,san³³sɿr³⁵,s a n ³³ s */ɿ ³⁵,3.0,392 393,,BeijingDaxue1964,48,24,1,s a n ³³ + s */ɿ - - ³⁵
4,16633,Xi_an,xian1253,cmn,Chinese New Year's Eve (chú xī 除夕),2188.0,除夕,chú xī,三 十,三十兒〔晚上〕,...,sæ̃²¹ʂʅr²⁴,s æ̃ ∼ ²¹ ʂ */ʅ ²⁴,3.0,392 393,,BeijingDaxue1964,48,24,1,s æ̃ ∼ ²¹ + ʂ */ʅ - - ²⁴


### Extract All Pairs from Mandarin and Teochew 

In [168]:
# Split the table by doculect: 'Chaozhou' rows are the Teochew forms,
# 'Beijing' rows are the Mandarin forms.
is_teochew = dialects_pd['DOCULECT'].eq('Chaozhou')
is_mandarin = dialects_pd['DOCULECT'].eq('Beijing')
teochew_pd = dialects_pd.loc[is_teochew]
mandarin_pd = dialects_pd.loc[is_mandarin]

In [169]:
# List the available columns before choosing which ones to keep after the merge.
teochew_pd.columns

Index(['ID', 'DOCULECT', 'GLOTTOLOG', 'ISO', 'CONCEPT', 'CONCEPTICON_ID',
       'CHINESE', 'PINYIN', 'BENZI', 'BENZI_IN_SOURCE', 'VALUE', 'FORM',
       'SEGMENTS', 'COGID', 'COGIDS', 'NOTE', 'SOURCE', 'BEIDA_ID', 'PAGE',
       'ORDER', 'ALIGNMENTS'],
      dtype='object')

In [170]:
# Pair each Teochew entry with the Beijing entries that share its BEIDA_ID.
# A BEIDA_ID with several entries on either side yields one row per combination;
# columns present in both frames get the '_teo' / '_man' suffixes.
merged_pd = teochew_pd.merge(
    mandarin_pd,
    on="BEIDA_ID",
    how="inner",
    suffixes=('_teo', '_man'),
)

In [171]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
merged_pd = merged_pd[['BENZI_IN_SOURCE_teo','SEGMENTS_teo','CHINESE_man']].copy()

# Characters stripped from BENZI_IN_SOURCE_teo: Latin letters, ASCII digits,
# ASCII and CJK punctuation, and any whitespace. Written as raw strings so that
# `\s` and the escaped brackets reach the regex engine without relying on
# invalid string escapes.
_BENZI_NOISE_RE = re.compile(
    r'[a-zA-Z0-9’!"#$%&\'() '
    r'*+,\-./:;<=>?@，。?★、…【】'
    r'《》？“”‘’！\[\]^_`{|}~\s]+'
)

# removes extra characters and spaces from BENZI_IN_SOURCE
# str(x) turns a missing value into the text "nan", which the pattern then
# strips to "" so that such rows are dropped by the filter below.
merged_pd['BENZI_IN_SOURCE_teo'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(
    lambda x: _BENZI_NOISE_RE.sub("", str(x))
)
# removes rows for which no Chinese characters can be found for the Teochew pronunciation
merged_pd = merged_pd.loc[merged_pd['BENZI_IN_SOURCE_teo'] != ""].copy()

In [172]:
# Preview the cleaned Teochew / Mandarin pairs.
merged_pd.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man
0,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕
1,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕
2,我,u a ⁵³,我
3,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅
5,下,e ¹¹,下(打一下)


### Convert Traditional Chinese to Simplified Chinese/Pinyin

In [173]:
from hanziconv import HanziConv
from xpinyin import Pinyin

In [174]:
# Convert the Teochew benzi to simplified characters, then look up the Mandarin
# reading of those same characters as tone-numbered pinyin
# (syllables are joined with '-', e.g. "suan4-pan2").
merged_pd['BENZI_man'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(HanziConv.toSimplified)
p = Pinyin()
merged_pd['pinyin'] = merged_pd['BENZI_man'].apply(
    lambda hanzi: p.get_pinyin(hanzi, tone_marks='numbers')
)

In [175]:
# Keep the first row for every distinct Teochew segment string. Reassigning
# (instead of inplace=True) avoids mutating a filtered slice in place and keeps
# the cell safe to re-run.
# NOTE(review): duplicates are detected on SEGMENTS_teo alone, so different
# characters that share the same Teochew segments collapse into one row --
# confirm this is intended.
merged_pd = merged_pd.drop_duplicates(subset="SEGMENTS_teo", keep='first')
merged_pd.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man,BENZI_man,pinyin
0,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕,三十夜,san1-shi2-ye4
2,我,u a ⁵³,我,我,wo3
3,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅,裌裘,jia2-qiu2
5,下,e ¹¹,下(打一下),下,xia4
6,算盤,s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵,算盤,算盘,suan4-pan2


### Separate Citation Tones from Sandhi Tones for Teochew

In [176]:
# Translation tables that map Unicode subscript / superscript digits to plain ASCII digits.
_ASCII_DIGITS = "0123456789"
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", _ASCII_DIGITS)
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", _ASCII_DIGITS)

# Rewrite the superscript tone digits in the Teochew segments as ASCII digits.
merged_pd['citation_teo'] = merged_pd['SEGMENTS_teo'].map(
    lambda segments: segments.translate(SUP)
)

def extract_citation(s, delimit=' '):
    """Collect the tone tokens of a segment string.

    The string is split on ``delimit``; only tokens containing at least one
    digit are kept. For a token written as ``a/b`` only the part after the last
    '/' is retained (the notebook treats that part as the citation tone).

    Returns the retained parts joined by single spaces ("" if there are none).
    """
    tone_tokens = (
        token for token in s.split(delimit)
        if any(ch.isdigit() for ch in token)
    )
    return " ".join(token.rsplit('/', 1)[-1] for token in tone_tokens)

# Reduce each digit-normalised segment string to its space-separated tone values.
merged_pd['citation_teo'] = merged_pd['citation_teo'].apply(extract_citation)

In [177]:
# The Mandarin tone of a syllable is taken as the last character of its
# tone-numbered pinyin; per-syllable values are joined with spaces
# ("suan4-pan2" -> "4 2").
merged_pd['citation_man'] = merged_pd['pinyin'].apply(
    lambda numbered: " ".join(syllable[-1] for syllable in numbered.split('-'))
)

def filter_row(row):
    """Return True when the row's 'citation_man' text consists solely of digits.

    Strings containing anything else (letters, spaces, or nothing at all)
    yield False.
    """
    tones = row['citation_man']
    return tones.isdigit()

# Keep only rows whose Mandarin tone string is made up purely of digits.
# The vectorised string test replaces DataFrame.apply(..., reduce=True):
# `reduce` was removed from DataFrame.apply in pandas 1.0, where the stray
# keyword is forwarded to the callback and raises TypeError.
# NOTE(review): multi-syllable entries are stored space-joined (e.g. "4 2"),
# and isdigit() is False for strings containing spaces, so they are dropped
# here too (unchanged from the previous behaviour) -- confirm that limiting
# the analysis to single-syllable entries is intended.
merged_pd = merged_pd[merged_pd['citation_man'].str.isdigit()]

merged_pd.head()

  


Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man,BENZI_man,pinyin,citation_teo,citation_man
2,我,u a ⁵³,我,我,wo3,53,3
5,下,e ¹¹,下(打一下),下,xia4,11,4
7,伴,pʰ ũ ã ∼ ³⁵,陪,伴,ban4,35,4
8,陪,p u e ⁵⁵,陪,陪,pei2,55,2
9,疊,tʰ i ə p ⁴,疊(堆疊),叠,die2,4,2


### Group the Citation Tones by Teochew/Mandarin

In [181]:
# Tally how often each (Teochew citation tone, Mandarin tone) pair occurs.
teochew_citation = merged_pd['citation_teo'].values.tolist()
mandarin_citation = merged_pd['citation_man'].values.tolist()
dic = {}
for teo_tones, man_tones in zip(teochew_citation, mandarin_citation):
    # Tones are paired position by position; zip stops at the shorter side,
    # so surplus tones on either side are ignored.
    for tone_pair in zip(teo_tones.split(' '), man_tones.split(' ')):
        dic[tone_pair] = dic.get(tone_pair, 0) + 1
print(dic.keys())

dict_keys([('53', '3'), ('11', '4'), ('35', '4'), ('55', '2'), ('4', '2'), ('213', '4'), ('21', '3'), ('21', '4'), ('21', '2'), ('21', '1'), ('33', '1'), ('35', '3'), ('213', '3'), ('33', '4'), ('55', '1'), ('213', '1'), ('11', '3'), ('33', '3'), ('4', '4'), ('53', '4'), ('4', '3'), ('53', '1'), ('33', '2'), ('53', '2'), ('11', '2'), ('35', '2'), ('213', '2'), ('55', '3'), ('55', '4'), ('4', '1')])
