### Correspondence between Standard Mandarin and Teochew Language 

In [1]:
import pandas as pd 
import numpy as np 
import sys 
sys.path.append('../src/')
import preprocess_script 

In [2]:
raw_input = '../data/raw_data/output-dialects.csv'
clean_data = '../data/clean_data/'
src_lan = 'Guangzhou'
dst_lan = 'Chaozhou'

In [3]:
merged_pd = preprocess_script.get_dst_only(raw_input)
merged_pd.sample(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE
12314,粗,tsʰ o u ³³,粗
14317,古怪,k o u ²⁴/⁵³ k u a i ²¹³,奇怪
1482,一樽酒,ts e k ⁵/²¹ ts u ŋ ²³/³³ ts i u ⁵³,一瓶酒
8369,舐,ts i ³⁵,舔
7883,洋油,ĩ ẽ ∼ ²¹³/⁵⁵ i u ⁵⁵,煤油
16153,瞞日,m u a ⁵³/²¹³ z i k ⁴,明天
7037,鋤頭,t ɯ ²¹³/⁵⁵ tʰ a u ⁵⁵,鋤頭
1524,盒,a p ⁴,盒子
10108,唔,m̩ ³⁵,不
5445,偷走,tʰ a u ²³/³³ ts a u ⁵³,逃跑


In [4]:
merged_pd = preprocess_script.process_teochew(merged_pd)
merged_pd.head() 

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE,citation_teo,initial_teo,final_teo
20,三十夜,s ã ∼ ²³/³³ ts a p ²¹/⁴ m e ⁵⁵,除夕,"[33, 4, 55]","[s, ts, m]","[ã∼, ap, e]"
40,我,u a ⁵³,我,[53],[],[ua]
59,裌裘,k o i ʔ ⁴/²¹ h ĩ ũ ∼ ⁵⁵,夾祅,"[21, 55]","[k, h]","[oiʔ, ĩũ∼]"
91,下,e ¹¹,下(打一下),[11],[],[e]
114,算盤,s ɯ ŋ ⁵³/²¹³ p ũ ã ∼ ⁵⁵,算盤,"[213, 55]","[s, p]","[ɯŋ, ũã∼]"


### Convert Chinese to Cantonese 

In [5]:
from pyjyutping import jyutping

merged_pd['jyutping'] = merged_pd['BENZI_IN_SOURCE_teo'].apply(lambda x: jyutping.convert(x))

In [6]:
merged_pd.sample(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE,citation_teo,initial_teo,final_teo,jyutping
1483,一酒,ts e k ²¹/⁴ t a ŋ ²¹/³⁵ ts i u ⁵³,一瓶酒,"[4, 35, 53]","[ts, t, ts]","[ek, aŋ, iu]",jat1 zau2
5520,折,ts i ʔ ²¹,折(折疊),[21],[ts],[iʔ],zit3
11351,豬舌,t ɯ ²³/³³ ts i ʔ ⁴,豬舌頭,"[33, 4]","[t, ts]","[ɯ, iʔ]",zyu1 sit3
3047,撞,ts u a ŋ ³⁵,碰,[35],[ts],[uaŋ],zong6
13728,湯匙,tʰ ɯ ŋ ²³/³³ s i ⁵⁵,調羹,"[33, 55]","[tʰ, s]","[ɯŋ, i]",tong1 ci4
7332,穧,z i e ʔ ²¹/⁴ ts o i ¹¹,多少,"[4, 11]","[z, ts]","[ieʔ, oi]",zai6
17414,哋畔,t i ¹²/¹¹ p õ ĩ ∼ ⁵⁵,哪邊,"[11, 55]","[t, p]","[i, õĩ∼]",dei6 bun6
16122,番茄,h u e ŋ ²³/³³ k i e ⁵⁵,番茄,"[33, 55]","[h, k]","[ueŋ, ie]",faan1 ke2
2017,分,p u ŋ ³³,被,[33],[p],[uŋ],fan1
6570,鉄錘,tʰ i ʔ ⁴/²¹ tʰ u i ⁵⁵,錘子,"[21, 55]","[tʰ, tʰ]","[iʔ, ui]",tit3 ceoi4


In [7]:
def get_canto_parts(x):
    initials, finals, tones = [], [], []
    words = x.split(' ')
    for word in words:
        last_idx = len(word) - 1 
        # tone mark  
        tones.append(word[last_idx])
        # initials 
        first_idx = 0 
        second_idx = 1
        
        if word[first_idx]+word[second_idx] in preprocess_script.CANTON_INITIALS:
            initials.append(word[first_idx]+word[second_idx])
            finals.append(word[second_idx+1:last_idx])
            
        elif word[first_idx] in preprocess_script.CANTON_INITIALS: 
            initials.append(word[first_idx])
            finals.append(word[second_idx:last_idx])
            
        else: 
            # no initials are presented 
            initials.append('')
            finals.append(word[first_idx:last_idx])
            
    return tones, initials, finals


def process_cantonese(df):
    df = df[df.apply(preprocess_script.filter_non_jyutping, 
                     axis=1, reduce=True)]
    df['citation_can'], df['initial_can'], df['final_can'] = \
        zip(*df['jyutping'].apply(lambda x: get_canto_parts(x)))
    return df

# test 
print(get_canto_parts('hoeng1 gong2 jan4 hok6 zaap6 ping3 jam1'))

(['1', '2', '4', '6', '6', '3', '1'], ['h', 'g', 'j', 'h', 'z', 'p', 'j'], ['oeng', 'ong', 'an', 'ok', 'aap', 'ing', 'am'])


In [8]:
merged_pd = process_cantonese(merged_pd)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
merged_pd[merged_pd['BENZI_IN_SOURCE_teo'].str.contains('五')]

Unnamed: 0,BENZI_IN_SOURCE_teo,SEGMENTS_teo,CHINESE,citation_teo,initial_teo,final_teo,jyutping,citation_can,initial_can,final_can
13024,五骹砌,ŋ o u ²¹/³⁵ kʰ a ²³/³³ k i ʔ ²¹,人行道,"[35, 33, 21]","[ŋ, kʰ, k]","[ou, a, iʔ]",ng5 haau1 cai3,"[5, 1, 3]","[ng, h, c]","[, aau, ai]"
15110,五月節,ŋ o u ²¹/³⁵ g u e ʔ ²¹/⁴ ts o i ʔ ²¹,端午節,"[35, 4, 21]","[ŋ, g, ts]","[ou, ueʔ, oiʔ]",ng5 jyut6 zit3,"[5, 6, 3]","[ng, j, z]","[, yut, it]"


In [10]:
merged_pd_copy = pd.DataFrame(merged_pd,columns=['BENZI_IN_SOURCE_teo','citation_teo',
                                                 'initial_teo','final_teo', 
                                                 'citation_can','initial_can','final_can'])

merged_pd_copy = merged_pd_copy[merged_pd_copy.citation_teo.map(len)==merged_pd_copy.citation_can.map(len)]

In [11]:
pd1 = pd.DataFrame(merged_pd_copy['BENZI_IN_SOURCE_teo'].apply(lambda x: list(x)))
# pd2 = pd.DataFrame(merged_pd_copy['BENZI_can'].apply(lambda x: list(x)))
pd3 = pd.DataFrame(merged_pd_copy['citation_teo'])
pd4 = pd.DataFrame(merged_pd_copy['initial_teo'])
pd5 = pd.DataFrame(merged_pd_copy['final_teo'])
pd6 = pd.DataFrame(merged_pd_copy['citation_can'])
pd7 = pd.DataFrame(merged_pd_copy['initial_can'])
pd8 = pd.DataFrame(merged_pd_copy['final_can'])

pd1 = pd1.explode('BENZI_IN_SOURCE_teo')
# pd2 = pd2.explode('BENZI_can')
pd3 = pd3.explode('citation_teo')
pd4 = pd4.explode('initial_teo')
pd5 = pd5.explode('final_teo')
pd6 = pd6.explode('citation_can')
pd7 = pd7.explode('initial_can')
pd8 = pd8.explode('final_can')
combined_data = pd.concat([pd1, pd3, pd4, pd5, pd6, pd7, pd8], axis=1)
combined_data.drop_duplicates(subset="BENZI_IN_SOURCE_teo", keep='first', inplace=True)

In [19]:
combined_data.final_can.nunique() # there are 59 unique finals in Jyutping system
combined_data.columns

Index(['BENZI_IN_SOURCE_teo', 'citation_teo', 'initial_teo', 'final_teo',
       'citation_can', 'initial_can', 'final_can'],
      dtype='object')

In [13]:
combined_data.loc[combined_data.final_can.isin([''])]
# drop tone 52 as it appears to be an outlier
combined_data = combined_data[~combined_data.citation_teo.isin(['52'])]  

In [14]:
#model_input = combined_data.to_csv('../data/clean_data/canto_teo.csv', index = None, header=True)

### Group the Citation Tones by Teochew/Mandarin

In [15]:
freq_pd = preprocess_script.plot_tones('can', combined_data)

### Data Visualization

pd.crosstab(freq_pd["teo_tone"], freq_pd['can_tone'], values=freq_pd['frequency'],
           aggfunc=lambda x: x)

can_tone,dark_departing(3),dark_flat(1),dark_rising(2),light_departing(6),light_flat(4),light_rising(5)
teo_tone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
falling(53),1.0,4.0,81.0,5.0,3.0,31.0
high(55),2.0,3.0,2.0,7.0,140.0,3.0
high_checked(4),6.0,1.0,2.0,50.0,1.0,
high_rising(35),1.0,,4.0,28.0,3.0,19.0
low(11),,,3.0,35.0,2.0,1.0
low_checked(21),37.0,26.0,,5.0,,
low_rising(213),90.0,4.0,4.0,9.0,1.0,1.0
mid(33),3.0,172.0,2.0,4.0,5.0,1.0


In [18]:
freq_pd.frequency.sum()

802

### Export the dataframe to R for Visualization 

In [16]:
#export_csv = freq_pd.to_csv('../output/r_input.csv', index = None, header=True)