In [1]:
import pandas as pd
import numpy as np 
%matplotlib inline

In [2]:
data_dir = '../data/clean_data/'

In [3]:
data = pd.read_csv(data_dir+'model_input.csv')
data.head()

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
0,三,三,33,s,"['ã', '∼']",1,s,an
1,十,十,4,ts,"['a', 'p']",2,sh,i
2,夜,夜,55,m,['e'],4,y,e
3,我,我,53,u,['a'],3,w,o
4,裌,裌,21,k,"['o', 'i', 'ʔ']",2,j,ia


### Explore the Sparsity in Data 

In [4]:
# the first 30 items has frequency greater than 10 
# the last 21 items has only appear once in the dataset
data['final_teo'].value_counts()

['i']              57
['a', 'ŋ']         35
['o']              33
['ɯ', 'ŋ']         32
['o', 'u']         30
                   ..
['p']               1
['i', 'e', 'm']     1
['i', 'e', 'k']     1
['ɯ', 'ŋ̍']         1
['ũ', 'ĩ', '∼']     1
Name: final_teo, Length: 83, dtype: int64

import ast
data[['final_teo1', 'final_teo2', 'final_teo3']] = \
data['final_teo'].apply(lambda x: pd.Series(ast.literal_eval(x)+[""]*(3-len(x))))

In [5]:
import ast 
data['final_teo'] = data['final_teo'].apply(lambda x: "".join(ast.literal_eval(x)))
df = data.fillna(value=' ', axis=1)
df[df['initial_man'].isnull()]

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man


In [6]:
df.sample(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man
231,熱,热,4,z,iek,4,r,e
736,漸,渐,53,ts,iəm,4,j,ian
538,簿,簿,35,pʰ,ou,4,b,u
140,叫,叫,213,k,ie,4,j,iao
786,拭,拭,21,tsʰ,ik,4,sh,i
548,雙,双,33,s,aŋ,1,sh,uang
639,蝦,虾,55,h,e,1,x,ia
157,牲,牲,33,s,ẽ∼,1,sh,eng
287,茄,茄,55,k,ie,2,q,ie
729,飮,飮,53,a,m,3,y,in


In [7]:
df['citation_teo'] = df['citation_teo'].astype(str)
df['citation_man'] = df['citation_man'].astype(str)
df.dtypes

BENZI_IN_SOURCE_teo    object
BENZI_man              object
citation_teo           object
initial_teo            object
final_teo              object
citation_man           object
initial_man            object
final_man              object
dtype: object

### Regression Analysis

In [8]:
# split into train test set 
from sklearn.model_selection import train_test_split

mandarin_df = df[['citation_man', 'initial_man', 'final_man']].copy()
teochew_df = df[['citation_teo', 'initial_teo', 'final_teo']].copy()

train_x, test_x = train_test_split(mandarin_df, test_size=0.2, random_state=42)
train_y, test_y = train_test_split(teochew_df, test_size=0.2, random_state=42)

In [9]:
from sklearn.preprocessing import OneHotEncoder

# convert categorical values to one-hot encoding 
encX = OneHotEncoder(handle_unknown='ignore')
X_train = encX.fit_transform(train_x).toarray()
X_test = encX.transform(test_x).toarray()

encY = OneHotEncoder(handle_unknown='ignore')
Y_train = encY.fit_transform(train_y).toarray()
Y_test = encY.transform(test_y).toarray()

In [10]:
Y_train.shape, Y_test.shape

((640, 107), (161, 107))

In [11]:
# handles train
man_tone_unique = mandarin_df['citation_man'].nunique()
teo_tone_unique = teochew_df['citation_teo'].nunique()

man_initial_unique = mandarin_df['initial_man'].nunique()
teo_initial_unique = teochew_df['initial_teo'].nunique()

man_final_unique = mandarin_df['final_man'].nunique()
teo_final_unique = teochew_df['final_teo'].nunique()

man_unique = mandarin_df['citation_man'].nunique() + \
mandarin_df['initial_man'].nunique() + mandarin_df['final_man'].nunique()

# teo_unique = teochew_df['citation_teo'].nunique()+ \
# teochew_df['initial_teo'].nunique()+teochew_df['final_teo'].nunique()
teo_unique = train_y['citation_teo'].nunique()+ \
train_y['initial_teo'].nunique()+train_y['final_teo'].nunique()

In [12]:
def slice_idx(model, start, end):
    return model[:, start:end]

X_train_tone = slice_idx(X_train, 0, man_tone_unique)
Y_train_tone = slice_idx(Y_train, 0, teo_tone_unique)

X_train_initial = slice_idx(X_train, man_tone_unique, man_tone_unique+man_initial_unique)
Y_train_initial = slice_idx(Y_train, teo_tone_unique, teo_tone_unique+teo_initial_unique)

X_train_final = slice_idx(X_train, man_tone_unique+man_initial_unique, man_unique)
Y_train_final = slice_idx(Y_train, teo_tone_unique+teo_initial_unique, teo_unique)

# handles test 
X_test_tone = slice_idx(X_test, 0, man_tone_unique)
Y_test_tone = slice_idx(Y_test, 0, teo_tone_unique)

X_test_initial = slice_idx(X_test, man_tone_unique, man_tone_unique+man_initial_unique)
Y_test_initial = slice_idx(Y_test, teo_tone_unique, teo_tone_unique+teo_initial_unique)

X_test_final = slice_idx(X_test, man_tone_unique+man_initial_unique, man_unique)
Y_test_final = slice_idx(Y_test, teo_tone_unique+teo_initial_unique, teo_unique)

In [13]:
teo_unique, Y_train.shape, Y_test.shape, Y_train_tone.shape

(107, (640, 107), (161, 107), (640, 8))

In [14]:
from sklearn.svm import LinearSVC

chains = [LinearSVC(random_state=i) for i in range(3)]

train_models = [X_train_tone,Y_train_tone,
                X_train_initial,Y_train_initial,
                X_train_final,Y_train_final]

for idx, chain in enumerate(chains):
    chain.fit(train_models[2*idx], train_models[2*idx+1])

pred_tone = np.array(chains[0].predict(X_test_tone))
pred_initial = np.array(chains[1].predict(X_test_initial))
pred_final = np.array(chains[2].predict(X_test_final))

Y_pred = np.concatenate((pred_tone, pred_initial, pred_final), axis=1)

ValueError: bad input shape (640, 8)

In [None]:
pred_tone[0]

In [None]:
pred_tone.shape, pred_initial.shape

In [None]:
pred_df = encY.inverse_transform(Y_pred)
test_df = encY.inverse_transform(Y_test)

In [None]:
results = np.sum(pred_df==test_df, axis=0)
print(results[0], results[1], results[2])

In [None]:
# Accuracy for teochew_tones, teochew_initials, and teochew_finals 
print('accuracy for tones prediction: {}%'.format(results[0]/pred_tone.shape[0]*100))
print('accuracy for initials prediction: {}%'.format(results[1]/pred_initial.shape[0]*100))
print('accuracy for finals prediction: {}%'.format(results[2]/pred_final.shape[0]*100))

In [None]:
cols = [['citation', 'initial', 'final', 'pred_citation', 'pred_initial', 'pred_final']]
result_df = pd.DataFrame(np.concatenate([test_df, pred_df], axis=1), columns=cols)

In [None]:
result_df.sample(30)

In [None]:
result_df.columns = ['citation','initial','final',
                     'pred_citation','pred_initial','pred_final']
result_df.columns

In [None]:
result_df[(result_df['citation']==result_df['pred_citation'])
          &(result_df['initial']==result_df['pred_initial'])
          &(result_df['final']==result_df['pred_final'])]

In [None]:
# harsh metrics, when all three predictions are equal 
5/pred_tone.shape[0]*100