In [1]:
import pandas as pd
import numpy as np 
%matplotlib inline

In [2]:
# Relative path to the directory containing the cleaned input CSV.
# The trailing slash matters: file names are appended by plain string concatenation.
data_dir = '../data/clean_data/'

In [3]:
# Load the model input table and preview its first rows.
input_path = data_dir + 'model_input.csv'
data = pd.read_csv(input_path)
data.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
0,三,三,33,s,"['ã', '∼']",1,s,an
1,十,十,4,ts,"['a', 'p']",2,sh,i
2,夜,夜,55,m,['e'],4,y,e
3,我,我,53,u,['a'],3,w,o
4,裌,裌,21,k,"['o', 'i', 'ʔ']",2,j,ia


### Explore the Sparsity in the Data

In [4]:
# The first 30 finals each occur more than 10 times;
# the last 21 finals each appear only once in the dataset.
final_counts = data['final_teo'].value_counts()
final_counts

['i']              58
['a', 'ŋ']         35
['o']              33
['ɯ', 'ŋ']         32
['o', 'u']         30
                   ..
['i', 'e', 'm']     1
['õ', 'ũ', '∼']     1
['ũ', '∼']          1
['p']               1
['ə', 'm']          1
Name: final_teo, Length: 83, dtype: int64

import ast


def _split_final(raw, width=3):
    """Parse a stringified list of segments and right-pad it to `width` entries.

    Parameters
    ----------
    raw : str
        Python list literal such as "['a', 'ŋ']".
    width : int
        Number of output slots; shorter lists are padded with "".

    Returns
    -------
    pd.Series
        The parsed segments followed by "" padding.
    """
    segments = ast.literal_eval(raw)
    # Pad by the length of the *parsed list*, not of the raw string: the raw
    # string is always longer than `width`, which would yield no padding at all.
    return pd.Series(segments + [""] * (width - len(segments)))


# One column per segment of the Teochew final.
# Assumes no final has more than three segments — TODO confirm against the data.
data[['final_teo1', 'final_teo2', 'final_teo3']] = \
    data['final_teo'].apply(_split_final)

In [5]:
import ast

# Collapse each stringified segment list (e.g. "['a', 'ŋ']") into one string ("aŋ").
# NOTE: this overwrites data['final_teo'], so the cell is not idempotent; a second
# run would pass the already-joined strings to ast.literal_eval.
parsed_finals = data['final_teo'].map(ast.literal_eval)
data['final_teo'] = parsed_finals.map("".join)

# Replace missing values with a single space.
df = data.fillna(' ', axis=1)

# Show any rows whose Mandarin initial is still null after filling.
missing_initial = df['initial_man'].isnull()
df[missing_initial]

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man


In [6]:
# Eyeball ten randomly chosen rows of the cleaned table.
df.sample(n=10)

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
765,洞,洞,35,t,oŋ,4,d,ong
385,斲,斲,21,t,ok,2,zh,uo
623,粗,粗,33,tsʰ,ou,1,c,u
627,灠,灠,35,n,ua,3,l,an
114,伯,伯,21,p,eʔ,2,b,o
386,雹,雹,4,pʰ,ak,2,b,ao
204,服,服,4,h,uk,2,f,u
544,欖,榄,53,n,a,3,l,an
611,勸,劝,213,kʰ,ɯŋ,4,q,van
530,免,免,53,m,iŋ,3,m,ian


In [7]:
# Tone (citation) values are category labels, not quantities: store them as strings.
for tone_col in ('citation_teo', 'citation_man'):
    df[tone_col] = df[tone_col].astype(str)
df.dtypes

BENZI_IN_SOURCE_teo    object
BENZI_man              object
citation_teo           object
initial_teo            object
final_teo              object
citation_man           object
initial_man            object
final_man              object
dtype: object

### Classification Analysis (Classifier Chains)

In [54]:
# Split into train and test sets.
from sklearn.model_selection import train_test_split

# Features: Mandarin tone / initial / final. Targets: the Teochew counterparts.
mandarin_df = df[['citation_man', 'initial_man', 'final_man']].copy()
teochew_df = df[['citation_teo', 'initial_teo', 'final_teo']].copy()

# Split X and y in a single call so both are partitioned with the same row
# indices by construction, instead of relying on two separate calls that
# happen to share a seed and an input length.
train_x, test_x, train_y, test_y = train_test_split(
    mandarin_df, teochew_df, test_size=0.2, random_state=42)

In [69]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical inputs. The encoder is fitted on the training
# rows only; with handle_unknown='ignore', categories that were not seen during
# fitting are encoded as all-zero columns instead of raising.
encX = OneHotEncoder(handle_unknown='ignore').fit(train_x)
train_x = encX.transform(train_x).toarray()
test_x = encX.transform(test_x).toarray()

# Same treatment for the targets, with their own encoder.
encY = OneHotEncoder(handle_unknown='ignore').fit(train_y)
train_y = encY.transform(train_y).toarray()
test_y = encY.transform(test_y).toarray()

In [56]:
# NOTE(review): in top-to-bottom order train_y has already been replaced by its
# one-hot array in the preceding cell; the saved output (a DataFrame) comes from
# running this cell before the encoding step — confirm where this inspection belongs.
train_y

Unnamed: 0,citation_teo,initial_teo,final_teo
432,52,l,iŋ
582,55,tʰ,o
442,21,u,k
643,55,l,iəm
334,21,ts,iʔ
...,...,...,...
71,33,k,eŋ
106,55,m,uŋ
270,55,s,ĩã∼
435,55,s,i


In [11]:
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

# Train an ensemble of ten classifier chains; each uses a linear SVC as the
# base estimator and its own random label order (seeded 0..9).
chains = []
for seed in range(10):
    chain = ClassifierChain(SVC(kernel='linear'), order='random', random_state=seed)
    chain.fit(train_x, train_y)
    chains.append(chain)

# Stack the per-chain predictions along a new leading axis.
per_chain_predictions = [fitted.predict(test_x) for fitted in chains]
Y_pred_chains = np.array(per_chain_predictions)

# Ensemble output: the average of the chains' predictions for every label.
pred = np.mean(Y_pred_chains, axis=0)

In [12]:
from sklearn.metrics import jaccard_score

# Sample-averaged Jaccard score of every individual chain on the test set.
chain_jaccard_scores = []
for Y_pred_chain in Y_pred_chains:
    binarized = Y_pred_chain >= .5
    chain_jaccard_scores.append(
        jaccard_score(test_y, binarized, average='samples'))
chain_jaccard_scores

[0.38881987577639754,
 0.38716356107660455,
 0.4005175983436852,
 0.4061076604554866,
 0.38374741200828155,
 0.4042443064182195,
 0.4191511387163561,
 0.38457556935817805,
 0.44078674948240165,
 0.38768115942028986]

In [13]:
# Map the one-hot matrices (ensemble prediction and ground truth) back to
# their original category labels, one column each for tone, initial and final.
pred_df, test_df = (encY.inverse_transform(onehot) for onehot in (pred, test_y))

In [14]:
# Number of test rows predicted correctly, counted separately per output column.
results = (pred_df == test_df).sum(axis=0)
print(*results[:3])

95 97 47


In [15]:
# Accuracy for Teochew tones, initials, and finals (in that column order).
n_test = pred.shape[0]
report_templates = (
    'accuracy for tones prediction: {}%',
    'accuracy for initials prediction: {}%',
    'accuracy for finals prediction: {}%',
)
for template, n_correct in zip(report_templates, results[:3]):
    print(template.format(n_correct / n_test * 100))

accuracy for tones prediction: 59.006211180124225%
accuracy for initials prediction: 60.24844720496895%
accuracy for finals prediction: 29.19254658385093%


In [16]:
# Side-by-side table of true labels (first three columns) and predicted labels
# (last three columns) for every test row.
# `cols` must be a flat list: a nested list ([[...]]) makes pandas build
# MultiIndex columns, which breaks plain column selection such as
# result_df['citation'].
cols = ['citation', 'initial', 'final', 'pred_citation', 'pred_initial', 'pred_final']
result_df = pd.DataFrame(np.concatenate([test_df, pred_df], axis=1), columns=cols)

In [17]:
# Inspect thirty randomly chosen true-vs-predicted rows.
result_df.sample(n=30)

Unnamed: 0,citation,initial,final,pred_citation,pred_initial,pred_final
48,35,h,ɯŋ,213,h,
109,53,s,ɯŋ,53,s,
148,33,h,ia,33,s,
129,55,h,ueŋ,55,h,ueŋ
86,33,kʰ,iəm,33,kʰ,ieŋ
102,21,n,iəp,11,n,ie
82,53,k,eŋ,53,k,
138,55,l,ou,55,l,ou
99,11,l,iəu,11,l,iəu
98,53,tsʰ,ɿ,53,s,u


In [18]:
# Assign flat, single-level column labels so columns can be selected by plain name.
flat_columns = [
    'citation', 'initial', 'final',
    'pred_citation', 'pred_initial', 'pred_final',
]
result_df.columns = flat_columns
result_df.columns

Index(['citation', 'initial', 'final', 'pred_citation', 'pred_initial',
       'pred_final'],
      dtype='object')

In [19]:
# Rows where tone, initial and final are all predicted correctly.
tone_ok = result_df['citation'] == result_df['pred_citation']
initial_ok = result_df['initial'] == result_df['pred_initial']
final_ok = result_df['final'] == result_df['pred_final']
result_df[tone_ok & initial_ok & final_ok]

Unnamed: 0,citation,initial,final,pred_citation,pred_initial,pred_final
0,213,kʰ,ou,213,kʰ,ou
6,55,i,u,55,i,u
8,213,s,au,213,s,au
15,213,tʰ,ɯŋ,213,tʰ,ɯŋ
17,33,k,ɯŋ,33,k,ɯŋ
18,55,tʰ,ou,55,tʰ,ou
21,55,k,iu,55,k,iu
30,55,s,ui,55,s,ui
33,33,tʰ,au,33,tʰ,au
34,213,tsʰ,ou,213,tsʰ,ou


In [20]:
# Harsh metric: percentage of test rows where tone, initial and final are all
# predicted correctly. The count is computed from result_df rather than typed
# in by hand, so it stays correct when the data, split or model changes.
exact_match = (
    (result_df['citation'] == result_df['pred_citation'])
    & (result_df['initial'] == result_df['pred_initial'])
    & (result_df['final'] == result_df['pred_final'])
)
exact_match.sum() / pred.shape[0] * 100

18.012422360248447