In [1]:
import sys
import pandas as pd 
import numpy as np 
sys.path.append('../src/')
import load_transform_data

transformed = load_transform_data.TransformXY()
transformed.transform_data('LabelEncoder')

In [2]:
#from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

RFs = [RandomForestClassifier(n_estimators=100, max_samples=0.6) for i in range(3)]

train_models = [transformed.X_train_tone,transformed.Y_train_tone,
                transformed.X_train_initial,transformed.Y_train_initial,
                transformed.X_train_final,transformed.Y_train_final]

for idx, rf in enumerate(RFs):
    rf.fit(train_models[2*idx], train_models[2*idx+1])

pred_tone = np.array(RFs[0].predict(transformed.X_test_tone))
pred_initial = np.array(RFs[1].predict(transformed.X_test_initial))
pred_final = np.array(RFs[2].predict(transformed.X_test_final))

In [3]:
pred_tone.shape, pred_initial.shape, pred_final.shape

((161,), (161,), (161,))

In [4]:
pred_tone_df = transformed.encT.inverse_transform(pred_tone)
pred_initial_df = transformed.encI.inverse_transform(pred_initial)
pred_final_df = transformed.encF.inverse_transform(pred_final)

In [5]:
pred_df = np.column_stack((pred_tone_df.T, pred_initial_df.T, pred_final_df.T))
#len(pred_tone_df), len(pred_initial_df), len(pred_final_df), pred_tone.shape

In [6]:
test_tone_df = transformed.encT.inverse_transform(transformed.Y_test_tone)
test_initial_df = transformed.encI.inverse_transform(transformed.Y_test_initial)
test_final_df = transformed.encF.inverse_transform(transformed.Y_test_final)

test_df = np.column_stack((test_tone_df.T, test_initial_df.T, test_final_df.T))

In [7]:
results = np.sum(pred_df==test_df, axis=0)
print(results[0], results[1], results[2])

91 110 52


In [8]:
# Accuracy for teochew_tones, teochew_initials, and teochew_finals 
print('accuracy for tones prediction: {}%'.format(results[0]/pred_tone.shape[0]*100))
print('accuracy for initials prediction: {}%'.format(results[1]/pred_initial.shape[0]*100))
print('accuracy for finals prediction: {}%'.format(results[2]/pred_final.shape[0]*100))

accuracy for tones prediction: 56.52173913043478%
accuracy for initials prediction: 68.32298136645963%
accuracy for finals prediction: 32.298136645962735%


In [9]:
cols = [['citation', 'initial', 'final', 'pred_citation', 'pred_initial', 'pred_final']]
result_df = pd.DataFrame(np.concatenate([test_df, pred_df], axis=1), columns=cols)

In [10]:
result_df.sample(30)

Unnamed: 0,citation,initial,final,pred_citation,pred_initial,pred_final
92,33,ts,ɯŋ,33,ts,ɯŋ
56,55,,ĩẽ∼,55,,ɯŋ
132,4,b,ak,213,m,ou
10,53,ts,ui,53,s,ui
108,55,h,aŋ,55,s,eŋ
83,55,,ẽ∼,55,,eŋ
124,213,pʰ,ue,213,pʰ,ue
118,35,l,ai,213,n,ue
98,53,p,ou,53,h,ou
151,55,m,e,213,,o


In [11]:
result_df.columns = ['citation','initial','final',
                     'pred_citation','pred_initial','pred_final']
result_df.columns

Index(['citation', 'initial', 'final', 'pred_citation', 'pred_initial',
       'pred_final'],
      dtype='object')

In [12]:
result_df[(result_df['citation']==result_df['pred_citation'])
          &(result_df['initial']==result_df['pred_initial'])
          &(result_df['final']==result_df['pred_final'])].count()

citation         26
initial          26
final            26
pred_citation    26
pred_initial     26
pred_final       26
dtype: int64

In [13]:
# harsh metrics, when all three predictions are equal 
26/pred_tone.shape[0]*100

16.149068322981368