### Correspondence between Beijing Mandarin and Teochew Language 

In [1]:
import pandas as pd 
import sys 
import re

In [2]:
%config IPCompleter.greedy=True

In [3]:
# Relative paths to the input data.
# The dialect table read below was converted from .tsv to .csv beforehand with an online tool.
# NOTE(review): clics_raw is not referenced by any other cell visible in this notebook.
clics_raw = '../raw_data/lexibank-beidasinitic-a94870e/raw/'
raw_data = '../raw_data/'

In [4]:
# Load the converted dialect table from the raw-data folder.
dialects_pd = pd.read_csv(raw_data + 'output-dialects.txt')

### Extract All Pairs from Mandarin and Teochew 

In [5]:
# Split the table by doculect: 'Chaozhou' rows for Teochew, 'Beijing' rows for Mandarin.
teochew_pd = dialects_pd.loc[dialects_pd['DOCULECT'] == 'Chaozhou']
mandarin_pd = dialects_pd.loc[dialects_pd['DOCULECT'] == 'Beijing']

In [6]:
# List the available columns; mandarin_pd is filtered from the same table, so it shares them.
teochew_pd.columns

Index(['ID', 'DOCULECT', 'GLOTTOLOG', 'ISO', 'CONCEPT', 'CONCEPTICON_ID',
       'CHINESE', 'PINYIN', 'BENZI', 'BENZI_IN_SOURCE', 'VALUE', 'FORM',
       'SEGMENTS', 'COGID', 'COGIDS', 'NOTE', 'SOURCE', 'BEIDA_ID', 'PAGE',
       'ORDER', 'ALIGNMENTS'],
      dtype='object')

In [7]:
# Pair every Chaozhou row with the Beijing row(s) that share its BEIDA_ID.
# Columns present on both sides get the '_teo' / '_man' suffixes.
merged_pd = teochew_pd.merge(
    mandarin_pd,
    how="inner",
    on="BEIDA_ID",
    suffixes=('_teo', '_man'),
)

### Data Wrangling

In [8]:
# Everything that is stripped from BENZI_IN_SOURCE_teo: ASCII letters and digits,
# ASCII and full-width punctuation, placeholder symbols (★, □, …) and whitespace.
# Written as a raw string so that `\s`, `\[`, `\]` and `\-` reach the regex engine
# as intended instead of relying on Python keeping unknown string escapes intact.
_NON_HANZI_RE = re.compile(
    r"""[a-zA-Z0-9’!"#$%&'() *+,\-./:;<=>?@，。?★、…【】□《》？“”‘’！\[\]^_`{|}~\s]+"""
)

# Keep the Teochew characters, the Teochew segments and the Mandarin word, and only
# the first row for each distinct (still uncleaned) Teochew source form.
# The explicit copy avoids modifying a slice of the merged frame in place.
merged_pd = (
    merged_pd[['BENZI_IN_SOURCE_teo', 'SEGMENTS_teo', 'CHINESE_man']]
    .drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first')
    .copy()
)

# Remove extra characters and spaces from BENZI_IN_SOURCE_teo.
# str(x) turns a missing value into 'nan', which the pattern then reduces to "".
merged_pd['BENZI_IN_SOURCE_teo'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(
    lambda x: _NON_HANZI_RE.sub("", str(x))
)

# Remove rows where no Chinese characters are left for the Teochew pronunciation.
merged_pd = merged_pd.loc[merged_pd['BENZI_IN_SOURCE_teo'] != ""]
merged_pd.describe()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man
count,890,890,890
unique,879,868,810
top,擔,k ɯ ŋ ³³,剛才
freq,2,3,3


### Convert Traditional Chinese to Simplified Chinese/Pinyin

In [9]:
from hanziconv import HanziConv
from xpinyin import Pinyin

In [10]:
# Simplified-character form of the cleaned Teochew characters, stored as BENZI_man.
merged_pd['BENZI_man'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(
    lambda chars: HanziConv.toSimplified(chars)
)

# Pinyin for the simplified characters, requested with tone_marks='numbers'.
p = Pinyin()
merged_pd['pinyin'] = merged_pd['BENZI_man'].apply(
    lambda chars: p.get_pinyin(chars, tone_marks='numbers')
)

In [11]:
# Summary of the table after the BENZI_man and pinyin columns were added.
merged_pd.describe()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man,BENZI_man,pinyin
count,890,890,890,890,890
unique,879,868,810,879,816
top,擔,k ɯ ŋ ³³,剛才,咀,shi4
freq,2,3,3,2,5


### Separate Citation Tones from Sandhi Tones for Teochew

In [12]:
# Translation tables from Unicode subscript / superscript digits to ASCII digits.
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")

# Rewrite the superscript digits in the Teochew segments as plain digits.
merged_pd['citation_teo'] = merged_pd['SEGMENTS_teo'].map(
    lambda segments: segments.translate(SUP)
)

def extract_citation(s, delimit=' '):
    """Collect the tone part of every digit-bearing block in ``s``.

    ``s`` is split on ``delimit``. Blocks that contain no digit character are
    ignored. From each remaining block only the text after its last ``'/'`` is
    kept (the whole block when it has no ``'/'``). The kept pieces are returned
    in their original order, joined by single spaces.
    """
    digit_blocks = (
        block for block in s.split(delimit)
        if any(ch.isdigit() for ch in block)
    )
    return " ".join(block.rsplit('/', 1)[-1] for block in digit_blocks)

# Reduce each converted segment string to its space-separated tone values.
merged_pd['citation_teo'] = merged_pd['citation_teo'].apply(extract_citation)

In [13]:
# Mandarin tone of each syllable: split the pinyin string on '-' and keep the last
# character of every piece, joined by single spaces.
merged_pd['citation_man'] = merged_pd['pinyin'].apply(
    lambda x: " ".join([t[-1] for t in x.split('-')])
)


def _is_numeric_citation(citation):
    """Return True when ``citation`` is non-empty and all digits once spaces are removed."""
    return citation.replace(" ", "").isdigit()


def filter_row(row):
    """Return True when ``row['citation_man']`` holds only digits (spaces ignored)."""
    return _is_numeric_citation(row['citation_man'])


# Column-wise boolean mask instead of ``DataFrame.apply(filter_row, axis=1, reduce=True)``:
# the ``reduce`` keyword was deprecated in pandas 0.23 and removed in 1.0, where an
# unknown keyword is forwarded to the applied function and raises TypeError.
# ``astype(bool)`` keeps the mask boolean even when the frame is empty.
keep_rows = merged_pd['citation_man'].map(_is_numeric_citation).astype(bool)
merged_pd = merged_pd[keep_rows]

  


In [17]:
# Work on the four columns needed for the per-character comparison and turn the
# space-separated tone strings into lists.
merged_pd_copy = pd.DataFrame(merged_pd, columns=['BENZI_IN_SOURCE_teo', 'BENZI_man', 'citation_teo', 'citation_man'])
merged_pd_copy['citation_teo'] = merged_pd_copy['citation_teo'].apply(lambda x: x.split(' '))
merged_pd_copy['citation_man'] = merged_pd_copy['citation_man'].apply(lambda x: x.split(' '))

# Keep a word only when all four columns will explode to the same number of rows:
# equal tone counts on both sides AND one tone per character. Without the character
# check, a mismatching word gives the exploded frames different (duplicated) indexes
# and the column-wise concat below cannot align them.
n_teo_tones = merged_pd_copy['citation_teo'].map(len)
same_length = (
    (merged_pd_copy['citation_man'].map(len) == n_teo_tones)
    & (merged_pd_copy['BENZI_IN_SOURCE_teo'].map(len) == n_teo_tones)
    & (merged_pd_copy['BENZI_man'].map(len) == n_teo_tones)
)
merged_pd_copy = merged_pd_copy[same_length]

# One row per character: split the words into character lists, then explode every
# column separately. The frames keep the (repeated) original row label as index.
pd1 = pd.DataFrame(merged_pd_copy['BENZI_IN_SOURCE_teo'].apply(lambda chars: list(chars)))
pd2 = pd.DataFrame(merged_pd_copy['BENZI_man'].apply(lambda chars: list(chars)))
pd3 = pd.DataFrame(merged_pd_copy['citation_teo'])
pd4 = pd.DataFrame(merged_pd_copy['citation_man'])

pd1 = pd1.explode('BENZI_IN_SOURCE_teo')
pd2 = pd2.explode('BENZI_man')
pd3 = pd3.explode('citation_teo')
pd4 = pd4.explode('citation_man')

# Put the four exploded columns side by side and keep the first occurrence of each
# Teochew character.
combined_data = pd.concat([pd1, pd2, pd3, pd4], axis=1)
combined_data = combined_data.drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first')
combined_data.describe()

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,citation_man
count,803,803,803,803
unique,803,803,9,4
top,妖,妖,33,4
freq,1,1,187,232


### Group the Citation Tones by Teochew/Mandarin

In [29]:
# Drop the 52 tone since there are only two cases, probably a transcription error.
# Exact comparison: a substring/regex match would also drop any tone value that
# merely contains "52".
combined_data = combined_data.loc[combined_data['citation_teo'] != '52']

In [35]:
# Tone of every character, in row order, for both languages.
teochew_citation = combined_data['citation_teo'].values.tolist()
mandarin_citation = combined_data['citation_man'].values.tolist()

# Count how many characters fall into each (Teochew tone, Mandarin tone) pair.
dic = {}
for tone_pair in zip(teochew_citation, mandarin_citation):
    dic[tone_pair] = dic.get(tone_pair, 0) + 1

# Total number of counted characters.
print(sum(dic.values()))

801


In [31]:
# Teochew tone value (as a string) -> descriptive label.
teochew_mapping = {
    '33': 'mid',
    '11': 'low',
    '21': 'low_checked',
    '213': 'low_rising',
    '35': 'high_rising',
    '4': 'high_checked',
    '53': 'falling',
    '55': 'high',
}

# Mandarin tone number (as a string) -> descriptive label.
mandarin_mapping = {
    '1': 'high',
    '2': 'rising',
    '3': 'dipping',
    '4': 'falling',
}

In [32]:
# One record per (Teochew tone, Mandarin tone) pair, labelled "<name>(<tone>)".
# The records are collected first and the frame is created once, instead of
# growing it row by row through .loc; the frequency column also gets an integer
# dtype this way. A tone missing from either mapping raises KeyError.
freq_records = [
    (
        teochew_mapping[teochew_tone] + '({})'.format(teochew_tone),
        mandarin_mapping[mandarin_tone] + '({})'.format(mandarin_tone),
        freq,
    )
    for (teochew_tone, mandarin_tone), freq in dic.items()
]
freq_pd = pd.DataFrame(freq_records, columns=['teochew_tone', 'mandarin_tone', 'frequency'])
freq_pd.head()

Unnamed: 0,teochew_tone,mandarin_tone,frequency
0,mid(33),high(1),168
1,high_checked(4),rising(2),32
2,high(55),falling(4),10
3,falling(53),dipping(3),111
4,low_checked(21),rising(2),18


### Data Visualization

In [33]:
# Pivot the long frequency table into a Teochew-tone x Mandarin-tone grid.
# The identity aggfunc does not combine anything: it passes the single frequency
# of each pair through. Each (teochew_tone, mandarin_tone) pair occurs on exactly
# one row of freq_pd, because the rows were built from unique dictionary keys.
# NOTE(review): pairs that never occur show up as NaN; a real reducer such as
# 'sum' would be less dependent on how pandas treats a non-reducing aggfunc.
pd.crosstab(freq_pd["teochew_tone"], freq_pd['mandarin_tone'], values=freq_pd['frequency'],
           aggfunc=lambda x: x)

mandarin_tone,dipping(3),falling(4),high(1),rising(2)
teochew_tone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
falling(53),111.0,7.0,4.0,3.0
high(55),4.0,10.0,7.0,135.0
high_checked(4),1.0,24.0,3.0,32.0
high_rising(35),12.0,40.0,,3.0
low(11),5.0,32.0,,4.0
low_checked(21),9.0,18.0,23.0,18.0
low_rising(213),9.0,92.0,5.0,3.0
mid(33),3.0,9.0,168.0,7.0


### Export the dataframe to R for Visualization 

In [37]:
# Write the tone-pair frequency table for the R visualisation, with a header row and
# without the index column. to_csv returns None when given a path, so the result is
# not kept in a variable.
freq_pd.to_csv('../output/r_input.csv', index=False, header=True)