### Correspondence between Beijing Mandarin and Teochew Language 

In [1]:
import pandas as pd 
import sys 
import re

In [2]:
# Switch on IPython's greedy tab completion (IPCompleter.greedy). This is an
# interactive-editing setting; nothing below reads it.
%config IPCompleter.greedy=True

In [3]:
# Input locations, relative to this notebook.
# An online tool was used to convert the source .tsv data to .csv.
raw_data = '../raw_data/'
clics_raw = raw_data + 'lexibank-beidasinitic-a94870e/raw/'

In [4]:
# Load the dialect table; the file carries a .txt suffix but is parsed with
# read_csv's default settings.
dialects_path = raw_data + 'output-dialects.txt'
dialects_pd = pd.read_csv(dialects_path)

### Extract All Pairs from Mandarin and Teochew 

In [5]:
# One sub-frame per doculect of interest: 'Chaozhou' rows are used for Teochew
# and 'Beijing' rows for Mandarin.
is_chaozhou = dialects_pd['DOCULECT'] == 'Chaozhou'
is_beijing = dialects_pd['DOCULECT'] == 'Beijing'
teochew_pd = dialects_pd[is_chaozhou]
mandarin_pd = dialects_pd[is_beijing]

In [6]:
# Display the column names of the Chaozhou sub-frame.
teochew_pd.columns

Index(['ID', 'DOCULECT', 'GLOTTOLOG', 'ISO', 'CONCEPT', 'CONCEPTICON_ID',
       'CHINESE', 'PINYIN', 'BENZI', 'BENZI_IN_SOURCE', 'VALUE', 'FORM',
       'SEGMENTS', 'COGID', 'COGIDS', 'NOTE', 'SOURCE', 'BEIDA_ID', 'PAGE',
       'ORDER', 'ALIGNMENTS'],
      dtype='object')

In [7]:
# Inner-join the Chaozhou rows with the Beijing rows on BEIDA_ID; columns that
# exist on both sides are disambiguated with the '_teo' / '_man' suffixes.
merged_pd = teochew_pd.merge(
    mandarin_pd,
    how="inner",
    on="BEIDA_ID",
    suffixes=('_teo', '_man'),
)

### Data Wrangling

In [8]:
# Characters stripped from BENZI_IN_SOURCE_teo: ASCII letters and digits,
# whitespace, ASCII punctuation (every mark except the backslash) and the CJK
# punctuation / placeholder symbols listed on the last line.
# The pattern is written as raw strings with explicit escapes so that it no
# longer depends on an invalid "\s" string escape or on an unescaped "[" inside
# the character class; the set of characters it matches is unchanged.
non_benzi_re = re.compile(
    r'[a-zA-Z0-9\s'
    r"!#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~"
    r'"'
    r'’，。★、…【】□《》？“”‘！]+'
)

# Keep only the columns used downstream and the first row per raw Teochew entry.
# Building a new frame (no inplace=True) and taking an explicit copy means the
# column assignment below writes to an independent frame, not to a slice.
# NOTE(review): de-duplication runs before the cleaning step, so entries that
# only become equal after cleaning stay duplicated (describe() below reports
# fewer unique values than rows) -- confirm this is intended.
merged_pd = (
    merged_pd[['BENZI_IN_SOURCE_teo', 'SEGMENTS_teo', 'CHINESE_man']]
    .drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first')
    .copy()
)

# removes extra characters and spaces from BENZI_IN_SOURCE_teo
# (str() maps any missing value to 'nan', which is stripped as ASCII letters)
merged_pd['BENZI_IN_SOURCE_teo'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(
    lambda x: non_benzi_re.sub("", str(x))
)
# removes rows for which no Chinese characters are left for the Teochew pronunciation
merged_pd = merged_pd.loc[merged_pd['BENZI_IN_SOURCE_teo'] != ""].copy()
merged_pd.describe()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man
count,890,890,890
unique,879,868,810
top,擔,k ɯ ŋ ³³,爸爸
freq,2,3,3


### Convert Traditional Chinese to Simplified Chinese/Pinyin

In [9]:
from hanziconv import HanziConv
from xpinyin import Pinyin

In [10]:
# Convert each cleaned entry with HanziConv.toSimplified, then ask xpinyin for
# its pinyin with tone_marks='numbers'.
p = Pinyin()


def to_numbered_pinyin(hanzi):
    """Return p.get_pinyin(hanzi) rendered with tone_marks='numbers'."""
    return p.get_pinyin(hanzi, tone_marks='numbers')


merged_pd['BENZI_man'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(HanziConv.toSimplified)
merged_pd['pinyin'] = merged_pd['BENZI_man'].apply(to_numbered_pinyin)

In [11]:
# Summarise the frame again now that BENZI_man and pinyin have been added.
merged_pd.describe()

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man,BENZI_man,pinyin
count,890,890,890,890,890
unique,879,868,810,879,816
top,擔,k ɯ ŋ ³³,爸爸,裤,shi4
freq,2,3,3,2,5


### Separate Citation Tones from Sandhi Tones for Teochew

In [12]:
# Translation tables (code point -> code point) that turn subscript and
# superscript digits into the plain ASCII digits 0-9.
_ASCII_DIGITS = "0123456789"
SUB = {ord(src): ord(dst) for src, dst in zip("₀₁₂₃₄₅₆₇₈₉", _ASCII_DIGITS)}
SUP = {ord(src): ord(dst) for src, dst in zip("⁰¹²³⁴⁵⁶⁷⁸⁹", _ASCII_DIGITS)}

# Rewrite the superscript digits in every segment string as ASCII digits.
merged_pd['citation_teo'] = merged_pd['SEGMENTS_teo'].map(
    lambda segments: segments.translate(SUP)
)

def extract_citation(s, delimit=' '):
    """Collect the digit-bearing blocks of `s`, reduced to their last '/' part.

    `s` is split on `delimit`. Blocks without any digit character are
    discarded; for each remaining block only the text after its final '/'
    is kept (the whole block when it has no '/').

    Returns the kept pieces joined by single spaces, or '' when no block
    contains a digit.
    """
    digit_blocks = (
        block for block in s.split(delimit) if any(map(str.isdigit, block))
    )
    return " ".join(block.rsplit('/', 1)[-1] for block in digit_blocks)

# Reduce each translated segment string to its digit-bearing blocks.
merged_pd['citation_teo'] = merged_pd['citation_teo'].apply(extract_citation)

In [13]:
def last_char_per_syllable(pinyin):
    """Join the final character of each '-'-separated part of `pinyin` with spaces."""
    return " ".join(syllable[-1] for syllable in pinyin.split('-'))


# With numbered pinyin the final character of a syllable is presumably its
# tone digit; rows where it is not are dropped by the digit filter that follows.
merged_pd['citation_man'] = merged_pd['pinyin'].apply(last_char_per_syllable)

def filter_row(row):
    """Tell whether the row's 'citation_man' value consists solely of digits.

    `row` is anything indexable by the key 'citation_man' whose value is a
    string. Returns the result of str.isdigit(), so an empty string or a
    value containing a space yields False.
    """
    mandarin_tones = row['citation_man']
    return mandarin_tones.isdigit()

# Keep only the rows that filter_row accepts.
# DataFrame.apply no longer takes the `reduce` keyword (deprecated in pandas
# 0.23, removed in 1.0); result_type='reduce' is its documented replacement and
# likewise asks for a Series result, here one boolean per row.
keep_row = merged_pd.apply(filter_row, axis=1, result_type='reduce')
merged_pd = merged_pd[keep_row]

merged_pd.head()

  


Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE_man,BENZI_man,pinyin,citation_teo,citation_man
2,我,u a ⁵³,我,我,wo3,53,3
5,下,e ¹¹,下(打一下),下,xia4,11,4
7,伴,pʰ ũ ã ∼ ³⁵,陪,伴,ban4,35,4
8,陪,p u e ⁵⁵,陪,陪,pei2,55,2
9,疊,tʰ i ə p ⁴,疊(堆疊),叠,die2,4,2


### Group the Citation Tones by Teochew/Mandarin

In [14]:
# Count how often each (Teochew tone, Mandarin tone) pair occurs. The tone
# strings of a row are split on spaces and paired position by position; zip
# stops at the shorter side, so unmatched trailing tones are ignored.
teochew_citation = merged_pd['citation_teo'].tolist()
mandarin_citation = merged_pd['citation_man'].tolist()
dic = {}
for teo_tones, man_tones in zip(teochew_citation, mandarin_citation):
    for tone_pair in zip(teo_tones.split(' '), man_tones.split(' ')):
        dic[tone_pair] = dic.get(tone_pair, 0) + 1
print(dic.items())

dict_items([(('53', '3'), 50), (('11', '4'), 18), (('35', '4'), 20), (('55', '2'), 57), (('4', '2'), 19), (('213', '4'), 51), (('21', '3'), 7), (('21', '2'), 6), (('21', '1'), 13), (('33', '1'), 71), (('35', '3'), 10), (('21', '4'), 7), (('213', '3'), 7), (('33', '4'), 6), (('55', '1'), 6), (('213', '1'), 5), (('11', '3'), 1), (('33', '3'), 3), (('4', '4'), 13), (('53', '4'), 5), (('4', '3'), 2), (('33', '2'), 3), (('53', '1'), 3), (('53', '2'), 1), (('11', '2'), 1), (('35', '2'), 2), (('213', '2'), 1), (('55', '3'), 1), (('55', '4'), 2), (('4', '1'), 1)])


In [15]:
# Human-readable labels for the tone strings used as keys of the pair counts.
teochew_mapping = dict([
    ('33', 'mid'),
    ('11', 'low'),
    ('21', 'low_checked'),
    ('213', 'low_rising'),
    ('35', 'high_rising'),
    ('4', 'high_checked'),
    ('53', 'falling'),
    ('55', 'high'),
])

mandarin_mapping = dict([
    ('1', 'high'),
    ('2', 'rising'),
    ('3', 'dipping'),
    ('4', 'falling'),
])

In [16]:
# Build the frequency table in a single DataFrame construction instead of
# appending one row at a time with .loc: the frame is allocated once and the
# 'frequency' column gets an integer dtype rather than object.
# Each label has the form '<name>(<tone>)', e.g. 'falling(53)'. A tone that is
# missing from its mapping raises KeyError, as it did before.
freq_records = [
    (
        '{}({})'.format(teochew_mapping[teochew_tone], teochew_tone),
        '{}({})'.format(mandarin_mapping[mandarin_tone], mandarin_tone),
        freq,
    )
    for (teochew_tone, mandarin_tone), freq in dic.items()
]
freq_pd = pd.DataFrame(
    freq_records, columns=['teochew_tone', 'mandarin_tone', 'frequency']
)
freq_pd.head()

Unnamed: 0,teochew_tone,mandarin_tone,frequency
0,falling(53),dipping(3),50
1,low(11),falling(4),18
2,high_rising(35),falling(4),20
3,high(55),rising(2),57
4,high_checked(4),rising(2),19


### Data Visualization

In [17]:
# Teochew tone (rows) x Mandarin tone (columns) table of pair frequencies.
# freq_pd is built from dictionary keys, so every (teochew_tone, mandarin_tone)
# pair appears at most once and 'sum' returns exactly the stored count.
# Unlike the former identity lambda, 'sum' is a genuine reduction, so it does
# not depend on each group holding a single row. Combinations that never occur
# stay NaN.
# pd.to_numeric keeps the result numeric even if 'frequency' is object dtype.
pd.crosstab(
    freq_pd['teochew_tone'],
    freq_pd['mandarin_tone'],
    values=pd.to_numeric(freq_pd['frequency']),
    aggfunc='sum',
)

mandarin_tone,dipping(3),falling(4),high(1),rising(2)
teochew_tone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
falling(53),50.0,5.0,3.0,1.0
high(55),1.0,2.0,6.0,57.0
high_checked(4),2.0,13.0,1.0,19.0
high_rising(35),10.0,20.0,,2.0
low(11),1.0,18.0,,1.0
low_checked(21),7.0,7.0,13.0,6.0
low_rising(213),7.0,51.0,5.0,1.0
mid(33),3.0,6.0,71.0,3.0


### Export the dataframe to R for Visualization 

In [18]:
# Write the frequency table, with a header row and without the index column,
# for the R visualisation step. export_csv keeps whatever to_csv returns.
r_input_path = '../output/r_input.csv'
export_csv = freq_pd.to_csv(r_input_path, index=False, header=True)