In [1]:
# Run this notebook to explore the results of baseline model
import pandas as pd 
import numpy as np 

data1 = pd.read_csv('../data/clean_data/baseline_input.csv')
data2 = pd.read_csv('../data/clean_data/canto_teo.csv')
data = pd.merge(data1, data2, how='inner', on=['BENZI_IN_SOURCE_teo', 'citation_teo', \
                                               'initial_teo', 'final_teo'])

In [2]:
data.sample(10)

Unnamed: 0,BENZI_IN_SOURCE_teo,BENZI_man,citation_teo,initial_teo,final_teo,citation_man,initial_man,final_man,citation_can,initial_can,final_can
412,鋤,锄,55,t,ɯ,2,ch,u,4,c,o
419,蹄,蹄,55,t,oi,2,t,i,4,t,ai
81,木,木,4,b,ak,4,m,u,6,m,uk
510,謙,谦,33,kʰ,iəm,1,q,ian,1,h,im
756,遮,遮,33,ts,ia,1,zh,e,1,z,e
96,濛,蒙,55,m,oŋ,2,m,eng,4,m,ung
323,公,公,33,k,oŋ,1,g,ong,1,g,ung
169,交,交,33,k,au,1,j,iao,1,g,aau
376,金,金,33,k,im,1,j,in,1,g,am
134,油,油,55,,iu,2,y,ou,4,j,au


In [3]:
data = data.fillna(value=' ', axis=1)

from sklearn.model_selection import train_test_split
X = data[['BENZI_IN_SOURCE_teo', 'citation_man', 'initial_man', 'final_man', \
         'citation_can', 'initial_can', 'final_can']].copy()
Y = data[['BENZI_IN_SOURCE_teo', 'citation_teo', 'initial_teo', 'final_teo']].copy()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [4]:
# return the first item of two lists that have the highest rank (smallest number)
def predict_most(pred1, pred2):
    most_pred = 'DNE'
    most_rank = -1 
    for i, val in enumerate(pred1):
        if val in pred2:
            rank = i + pred2.index(val)
            if rank < most_rank or most_rank == -1: 
                most_rank = rank 
                most_pred = val 
    return most_pred 

In [5]:
# predict the majority teochew from each mandarin tone/initial/final 
n = 3
pred = [] 
X_test = X_test.reset_index()
Y_test = Y_test.reset_index()
for i in range(len(X_test)):
    most_tone1 = Y_train[X_train['citation_man'] == X_test.iloc[i]['citation_man']]\
    .citation_teo.value_counts()[:n].index.tolist()
    most_tone2 = Y_train[X_train['citation_can'] == X_test.iloc[i]['citation_can']]\
    .citation_teo.value_counts()[:n].index.tolist()
    
#     print(most_tone1)
#     print(most_tone2) 
    most_tone = predict_most(most_tone1, most_tone2)
    
    #most_tone =  " " if most_tone.empty else most_tone[0]
    most_initial1 = Y_train[X_train['initial_man'] == X_test.iloc[i]['initial_man']].\
    initial_teo.value_counts()[:n].index.tolist()
    most_initial2 = Y_train[X_train['initial_can'] == X_test.iloc[i]['initial_can']].\
    initial_teo.value_counts()[:n].index.tolist()
    
#     print(most_initial1)
#     print(most_initial2)
    most_initial = predict_most(most_initial1, most_initial2)
    
    #most_initial = " " if most_initial.empty else most_initial[0]
    most_final1 = Y_train[X_train['final_man'] == X_test.iloc[i]['final_man']].\
    final_teo.value_counts()[:n].index.tolist()
    most_final2 = Y_train[X_train['final_can'] == X_test.iloc[i]['final_can']].\
    final_teo.value_counts()[:n].index.tolist()
    
#     print(most_final1)
#     print(most_final2)
    most_final = predict_most(most_final1, most_final2)
    
#     most_final = " " if most_final.empty else most_final[0]
#     if most_final == " ":
#         print(most_final)
    pred.append([most_tone, most_initial, most_final])
pred_pd = pd.DataFrame(pred, columns=['pred_tone', 'pred_initial', 'pred_final'])

In [6]:
merged_pd = pd.concat([X_test, Y_test, pred_pd], axis=1)

In [7]:
merged_pd.sample(30)

Unnamed: 0,index,BENZI_IN_SOURCE_teo,citation_man,initial_man,final_man,citation_can,initial_can,final_can,index.1,BENZI_IN_SOURCE_teo.1,citation_teo,initial_teo,final_teo,pred_tone,pred_initial,pred_final
79,569,欉,4,c,ong,4,c,ung,569,欉,55,ts,aŋ,DNE,tsʰ,aŋ
66,49,姆,3,m,u,5,m,ou,49,姆,53,,m̩,53,m,ou
147,326,曬,4,sh,ai,3,s,aai,326,曬,213,s,ai,213,s,i
78,595,蘭,2,l,an,4,l,aan,595,蘭,33,l,aŋ,55,l,ũã∼
35,23,巷,4,x,iang,6,h,ong,23,巷,11,h,aŋ,35,h,DNE
18,611,悔,3,h,ui,3,f,ui,611,悔,213,h,ue,21,h,DNE
98,54,斧,3,f,u,2,f,u,54,斧,53,p,ou,53,h,ou
68,357,收,1,sh,ou,1,s,au,357,收,33,s,iu,33,s,au
133,689,嬲,3,n,iao,5,n,iu,689,嬲,55,h,iəu,53,n,iəu
49,535,壁,4,b,i,1,b,ik,535,壁,21,p,iaʔ,213,p,ik


### Calculates the accuracy for tone, initial, and final 

In [8]:
tone_acc = len(merged_pd[merged_pd.citation_teo==merged_pd.pred_tone])
initial_acc = len(merged_pd[merged_pd.initial_teo==merged_pd.pred_initial])
final_acc = len(merged_pd[merged_pd.final_teo==merged_pd.pred_final])
tone_acc, initial_acc, final_acc

(111, 113, 63)

In [9]:
100*tone_acc/161, 100*initial_acc/161, 100*final_acc/161

(68.94409937888199, 70.1863354037267, 39.130434782608695)

In [None]:
reduced_df = merged_pd[['BENZI_IN_SOURCE_teo', 'citation_man', 'initial_man', 'final_man', \
                        'citation_teo', 'initial_teo', 'final_teo', 'pred_tone', \
                        'pred_initial', 'pred_final']]
#reduced_df.to_csv('../output/baseline.csv', index = None, header=True)
reduced_df.head()

In [None]:
reduced_df[(reduced_df.citation_teo==reduced_df.pred_tone)&
          (reduced_df.initial_teo==reduced_df.pred_initial)&
          (reduced_df.final_teo==reduced_df.pred_final)].drop_duplicates('initial_man')