In [1]:
import pandas as pd
import numpy as np 
%matplotlib inline

In [2]:
data_dir = '../data/clean_data/'

In [3]:
data = pd.read_csv(data_dir+'model_input.csv')
data.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
0,三,三,33,s,"['ã', '∼']",1,s,an
1,十,十,4,ts,"['a', 'p']",2,sh,i
2,夜,夜,55,m,['e'],4,y,e
3,我,我,53,u,['a'],3,w,o
4,裌,裌,21,k,"['o', 'i', 'ʔ']",2,j,ia


### Explore the Sparsity in Data 

In [4]:
# the first 30 items has frequency greater than 10 
# the last 21 items has only appear once in the dataset
data['final_teo'].value_counts()

['i']              57
['a', 'ŋ']         35
['o']              33
['ɯ', 'ŋ']         32
['o', 'u']         30
                   ..
['i', 'p']          1
['ũ', 'ĩ', '∼']     1
['u', 'n']          1
['i', 'a', 'k']     1
['i', 'a', 'p']     1
Name: final_teo, Length: 83, dtype: int64

import ast
data[['final_teo1', 'final_teo2', 'final_teo3']] = \
data['final_teo'].apply(lambda x: pd.Series(ast.literal_eval(x)+[""]*(3-len(x))))

In [5]:
import ast 
data['final_teo'] = data['final_teo'].apply(lambda x: "".join(ast.literal_eval(x)))
df = data.fillna(value=' ', axis=1)
df[df['initial_man'].isnull()]

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man


In [6]:
df.sample(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
463,年,年,55,n,i,2,n,ian
76,唔,唔,11,,m̩,2,w,u
360,揲,揲,4,t,iap,2,sh,e
25,經,经,33,k,iŋ,1,j,ing
775,戴,戴,213,t,i,4,d,ai
767,浴,浴,4,e,k,4,y,v
490,仁,仁,55,z,iŋ,2,r,en
215,銅,铜,55,t,aŋ,2,t,ong
613,粟,粟,21,tʰ,ek,4,s,u
206,活,活,4,u,aʔ,2,h,uo


In [7]:
df['citation_teo'] = df['citation_teo'].astype(str)
df['citation_man'] = df['citation_man'].astype(str)
df.dtypes

BENZI_IN_SOURCE_teo    object
BENZI_man              object
citation_teo           object
initial_teo            object
final_teo              object
citation_man           object
initial_man            object
final_man              object
dtype: object

### Regression Analysis

In [8]:
# split into train test set 
from sklearn.model_selection import train_test_split

mandarin_df = df[['citation_man', 'initial_man', 'final_man']].copy()
teochew_df = df[['citation_teo', 'initial_teo', 'final_teo']].copy()

train_x, test_x = train_test_split(mandarin_df, test_size=0.2, random_state=42)
train_y, test_y = train_test_split(teochew_df, test_size=0.2, random_state=42)

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# convert categorical values to one-hot encoding 
encX = OneHotEncoder(handle_unknown='ignore')
X_train = encX.fit_transform(train_x).toarray()
X_test = encX.transform(test_x).toarray()

# LabelEncoder for Y 
encT = LabelEncoder()
encI = LabelEncoder()
encF = LabelEncoder()

In [10]:
encT.fit(teochew_df['citation_teo'].values.tolist())
encI.fit(teochew_df['initial_teo'].values.tolist())
encF.fit(teochew_df['final_teo'].values.tolist())

LabelEncoder()

In [11]:
Y_train_tone = encT.transform(train_y['citation_teo'].values.tolist())
Y_train_initial = encI.transform(train_y['initial_teo'].values.tolist())
Y_train_final = encF.transform(train_y['final_teo'].values.tolist())

Y_test_tone = encT.transform(test_y['citation_teo'].values.tolist())
Y_test_initial = encI.transform(test_y['initial_teo'].values.tolist())
Y_test_final = encF.transform(test_y['final_teo'].values.tolist())

In [12]:
Y_train_tone.shape, Y_test_tone.shape

((640,), (161,))

In [14]:
# handles train
man_tone_unique = mandarin_df['citation_man'].nunique()

man_initial_unique = mandarin_df['initial_man'].nunique()

man_final_unique = mandarin_df['final_man'].nunique()

man_unique = mandarin_df['citation_man'].nunique() + \
mandarin_df['initial_man'].nunique() + mandarin_df['final_man'].nunique()

In [15]:
def slice_idx(model, start, end):
    return model[:, start:end]

# handles train
X_train_tone = slice_idx(X_train, 0, man_tone_unique)
X_train_initial = slice_idx(X_train, man_tone_unique, man_tone_unique+man_initial_unique)
X_train_final = slice_idx(X_train, man_tone_unique+man_initial_unique, man_unique)

# handles test 
X_test_tone = slice_idx(X_test, 0, man_tone_unique)
X_test_initial = slice_idx(X_test, man_tone_unique, man_tone_unique+man_initial_unique)
X_test_final = slice_idx(X_test, man_tone_unique+man_initial_unique, man_unique)

In [16]:
#from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

chains = [RandomForestClassifier(n_estimators=100, max_samples=0.6) for i in range(3)]

train_models = [X_train_tone,Y_train_tone,
                X_train_initial,Y_train_initial,
                X_train_final,Y_train_final]

for idx, chain in enumerate(chains):
    chain.fit(train_models[2*idx], train_models[2*idx+1])

pred_tone = np.array(chains[0].predict(X_test_tone))
pred_initial = np.array(chains[1].predict(X_test_initial))
pred_final = np.array(chains[2].predict(X_test_final))

In [17]:
pred_tone.shape, pred_initial.shape, pred_final.shape

((161,), (161,), (161,))

In [18]:
pred_tone_df = encT.inverse_transform(pred_tone)
pred_initial_df = encI.inverse_transform(pred_initial)
pred_final_df = encF.inverse_transform(pred_final)

In [19]:
pred_df = np.column_stack((pred_tone_df.T, pred_initial_df.T, pred_final_df.T))
#len(pred_tone_df), len(pred_initial_df), len(pred_final_df), pred_tone.shape

In [20]:
test_tone_df = encT.inverse_transform(Y_test_tone)
test_initial_df = encI.inverse_transform(Y_test_initial)
test_final_df = encF.inverse_transform(Y_test_final)

test_df = np.column_stack((test_tone_df.T, test_initial_df.T, test_final_df.T))

In [21]:
results = np.sum(pred_df==test_df, axis=0)
print(results[0], results[1], results[2])

91 99 49


In [22]:
# Accuracy for teochew_tones, teochew_initials, and teochew_finals 
print('accuracy for tones prediction: {}%'.format(results[0]/pred_tone.shape[0]*100))
print('accuracy for initials prediction: {}%'.format(results[1]/pred_initial.shape[0]*100))
print('accuracy for finals prediction: {}%'.format(results[2]/pred_final.shape[0]*100))

accuracy for tones prediction: 56.52173913043478%
accuracy for initials prediction: 61.49068322981367%
accuracy for finals prediction: 30.434782608695656%


In [23]:
cols = [['citation', 'initial', 'final', 'pred_citation', 'pred_initial', 'pred_final']]
result_df = pd.DataFrame(np.concatenate([test_df, pred_df], axis=1), columns=cols)

In [29]:
result_df.sample(30)

Unnamed: 0,citation,initial,final,pred_citation,pred_initial,pred_final
53,213,tʰ,o,213,tʰ,ui
114,33,t,oŋ,33,ts,aŋ
27,213,h,am,53,h,ũã∼
86,55,l,aŋ,55,l,ɯŋ
139,213,k,ai,213,k,ai
8,33,p,aŋ,33,p,ɯŋ
96,213,ts,iu,213,ts,au
70,35,kʰ,ia,53,kʰ,i
67,55,s,ai,55,tsʰ,ai
23,53,h,ɯ,53,s,ɯ


In [25]:
result_df.columns = ['citation','initial','final',
                     'pred_citation','pred_initial','pred_final']
result_df.columns

Index(['citation', 'initial', 'final', 'pred_citation', 'pred_initial',
       'pred_final'],
      dtype='object')

In [26]:
result_df[(result_df['citation']==result_df['pred_citation'])
          &(result_df['initial']==result_df['pred_initial'])
          &(result_df['final']==result_df['pred_final'])].count()

citation         25
initial          25
final            25
pred_citation    25
pred_initial     25
pred_final       25
dtype: int64

In [27]:
# harsh metrics, when all three predictions are equal 
25/pred_tone.shape[0]*100

15.527950310559005