In [17]:
# Standard library.
import json
import pickle
import importlib
# Third-party: hanzi -> zhuyin conversion (used when turning a predicted
# character into a tone number) and the train/test splitter.
from dragonmapper import hanzi

from sklearn.model_selection import train_test_split

# Project-local helper modules. Each is reloaded right after import so that
# re-running this cell re-executes the module source instead of reusing the
# copy already cached in the running kernel.
import transform_data
importlib.reload(transform_data)
import calculate_frequencies
# Last expression of the cell: its return value (the module object) is what
# the notebook echoes as this cell's output.
importlib.reload(calculate_frequencies)

<module 'calculate_frequencies' from 'd:\\Research\\tonal-adaptation\\calculate_frequencies.py'>

In [18]:
# import models
#
# confident_onset_zh_list[k] holds the hanzi for which the prediction of the
# (k+1)-th onset classifier is accepted by the cascade in the prediction cell;
# one pickled classifier is loaded per entry, so the two lists stay parallel.
# The commented-out rows are earlier groupings kept for reference.
confident_onset_zh_list = [
    # ["艾", "安", "伊", "埃"], #1
    # ["埃", "奥"], #2
    # ["阿"], #3
    # ["亚", "以"], #4
    # ["欧"] #5
    # ["艾", "安", "伊", "埃"], # 1
    # ["埃", "奥"], # 2
    # ["阿"], # 3
    # ["亚"], # 4
    # ["爱"], # 5
    ["英", "乌", "厄", "安"], # 1 (1.00, 1.00, 1.00, 0.96)  <- presumably per-character scores; TODO confirm
    ["尤", "恩"], # 2
    ["奥", "阿"], # 3
    ["伊"], # 4
    ["埃"], # 5
    ["欧", "艾"], # 6
    ["亚"] # 7
]

# Directory (relative to the notebook) that holds the pickled classifiers.
model_set = "models_expanded"


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can run arbitrary code; only point this at model files
    # that were produced locally by this project.
    with open(path, "rb") as pickled_file:
        return pickle.load(pickled_file)


# Step classifiers are stored 1-based: clf_onset_zh_step_1.pkl, _2.pkl, ...
clf_onset_zh_set = [
    _unpickle(f"./{model_set}/clf_onset_zh_step_{step}.pkl")
    for step in range(1, len(confident_onset_zh_list) + 1)
]

# Single-model alternative to the step classifiers, currently disabled:
# clf_onset_zh = _unpickle(f"./{model_set}/clf_onset_zh.pkl")

# Classifier that predicts the onset tone number directly; used as the
# fallback when no step classifier gives a confident hanzi.
clf_onset_tone_num = _unpickle(f"./{model_set}/clf_onset_tone_num.pkl")

In [19]:
# import data
#
# Load the per-word records (presumably keyed by the English word, judging by
# how later loops name the key). A missing file, malformed JSON or an absent
# "words" key is allowed to raise here: printing the error and carrying on
# with an empty dict would make every later cell run silently on no data.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; confirm it matches how output_data.json was written.
with open("./data/output_data.json", "r") as output_data_file:
    data = json.load(output_data_file)["words"]

# Occurrence counts over all words, keyed by:
#   onset_frequencies     - the English onset in IPA ("onset_en_ipa")
#   onset_zh_frequencies  - the first character of the Chinese word ("word_zh")
#   tone_frequencies      - the onset tone number ("onset_tone_num")
onset_frequencies = {}
onset_zh_frequencies = {}
tone_frequencies = {}

for word_data in data.values():
    onset = word_data["onset_en_ipa"]
    onset_frequencies[onset] = onset_frequencies.get(onset, 0) + 1

    onset_zh = word_data["word_zh"][0]
    onset_zh_frequencies[onset_zh] = onset_zh_frequencies.get(onset_zh, 0) + 1

    tone = word_data["onset_tone_num"]
    tone_frequencies[tone] = tone_frequencies.get(tone, 0) + 1

# Frequency and conditional-probability tables computed by the project helper;
# cond_probs is indexed as nested dicts when the feature vectors are built.
frequencies, cond_probs = calculate_frequencies.get_data()

# Distinct onsets / first hanzi in first-seen order. The order of
# onset_zh_list fixes the order of the per-hanzi feature columns.
onset_en_ipa_list = list(onset_frequencies.keys())
onset_zh_list = list(onset_zh_frequencies.keys())

In [38]:
# collect specific features used by models
#
# Builds three parallel lists with one entry per word in `data`:
#   X               - feature vector
#   X_onset_en_ipa  - the word's English onset (IPA)
#   y               - the gold onset tone number
X = []
X_onset_en_ipa = []
y = []

# Column names for the feature vectors, in the order the values are appended
# below. The first twelve names are assumed to match the layout returned by
# transform_data.vowel_qualities -- TODO confirm against that helper.
X_labels = ["Onset N(V)", "Onset Length"]
X_labels += ["close", "near-close", "close-mid", "mid", "open-mid", "near-open", "open"]
X_labels += ["front", "central", "back"]
X_labels += ["Onset N(C)", "initial_stress_en", "first_stress_en", "F(Onset)"]
X_labels += [f"P(Onset|T={tone})" for tone in range(1,5)]
X_labels += [f"P(T={tone}|Onset)" for tone in range(1,5)]
X_labels += [f"P(Onset|Hanzi={onset_zh})" for onset_zh in onset_zh_list]
X_labels += [f"P(Hanzi={onset_zh}|Onset)" for onset_zh in onset_zh_list]
# Disabled candidate features (their values are likewise disabled below):
# X_labels += ["N(Syllables)"]
# X_labels += ["Onset%Word"]
X_labels += ["Bisyllabic"]

# Hard-coded per-feature scores, read in parallel with X_labels (fi[i] belongs
# to X_labels[i]). Presumably feature importances pasted from an earlier
# metrics run -- TODO confirm provenance and regenerate when the models change.
fi = [0.0, 0.0, 0.0, 0.0, 0.08181818181818189, 0.0, 0.0, 0.0, 0.0007419483553654977, 0.1359848706751721, 0.0, 0.0, 0.0, 0.2561784987256287, 0.16955825368643618, 0.941439534976628, 0.008829507540517737, 0.015128754047228186, 0.17935639837214187, 0.1639042750205036, 0.0, 0.0, 0.0, 0.0, 0.011281571229738593, 0.11268820893904659, 0.00525303959973211, 0.0509898514618308, 0.0, 0.0253313907668532, 0.0052023176962245395, 0.09556565328061264, 0.01953270633600672, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.028974709114303887, 0.0, 0.0, 0.0, 0.0010749687076382307, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8348883619771026, 0.7799252113983322, 0.885939983819047, 0.0029386957948273826, 0.6386752767923135, 0.5403100322077609, 1.0253362341914363, 0.37065466973781924, 0.07411245456308242, 0.24002916162010465, 0.0, 0.0, 0.19008926483469737, 0.0015050322158815536, 0.0, 0.0, 0.002067981302886859, 0.030953291140342665, 0.023251827823199664, 0.012945859704890548, 0.01674164228006951, 0.014332916101843117, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0004341783941961828, 0.0, 0.0, 0.0, 0.0008579280865531144, 0.005175355663822426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
# Threshold for the debug dump below: mean + 1 population standard deviation.
avg = sum(fi)/len(fi)
stdev = (sum([(fii - avg)**2 for fii in fi])/(len(fi)))**0.5
criterion = avg+1*stdev

for word_en, word_data in data.items():
    onset = word_data["onset_en_ipa"]
    stresses = word_data["stresses_en"]

    # Vowel-quality features for the onset; x[0] is treated as the vowel count.
    x = transform_data.vowel_qualities(onset)

    # Onset N(C): onset length minus the vowel count.
    x.append(len(onset) - x[0])

    # initial_stress_en: stress value of the first syllable.
    x.append(stresses[0])

    # first_stress_en: index of the first syllable whose stress value is 1
    # (raises ValueError if the word has no such syllable).
    x.append(stresses.index(1))

    # F(Onset): how many words share this onset.
    x.append(onset_frequencies[onset])

    # Conditional probabilities; a missing (or falsy) table entry counts as 0.
    for tone in [1,2,3,4]:
        x.append(cond_probs["onset_tone_num"][str(tone)].get(onset) or 0)

    for tone in [1,2,3,4]:
        x.append(cond_probs["onset_en_ipa"][onset].get(str(tone)) or 0)

    for onset_zh in onset_zh_list:
        x.append(cond_probs["onset_zh"][onset_zh].get(str(onset)) or 0)

    for onset_zh in onset_zh_list:
        x.append(cond_probs["onset_en_ipa-onset_zh"][onset].get(str(onset_zh)) or 0)

    # "Bisyllabic" is True for any word with two or more syllables.
    x.append(len(word_data["syllables_en_arpa"]) >= 2)

    # Disabled candidate feature (Onset%Word):
    # x.append(len(word_data["onset_en_ipa"])/len("".join(word_data["syllables_en_ipa"])))

    X.append(x)
    X_onset_en_ipa.append(onset)
    y.append(word_data["onset_tone_num"])

    # Debug dump: for one sample word, show the features whose score in `fi`
    # exceeds the threshold.
    if word_en == "Olivia":
        print("{")
        for i in range(len(X_labels)):
            if fi[i] > criterion:
                print(f"    \"{X_labels[i]}\": {x[i]}")
        print("}")


{
    "F(Onset)": 63
    "P(Hanzi=奥|Onset)": 0.8888888888888888
    "P(Hanzi=艾|Onset)": 0
    "P(Hanzi=阿|Onset)": 0
    "P(Hanzi=伊|Onset)": 0
    "P(Hanzi=亚|Onset)": 0
    "P(Hanzi=埃|Onset)": 0
    "P(Hanzi=欧|Onset)": 0.07936507936507936
}


In [21]:
# Hold out 33% of the words for evaluation (fixed seed, no stratification).
# The onset strings are split together with X and y so that each test row can
# be matched back to its onset in the prediction cell.
# The training half of the onsets gets a real name instead of `_`: in IPython
# `_` is the "last output" variable, and assigning to it shadows that feature.
(
    X_train, X_test,
    X_onset_en_ipa_train, X_onset_en_ipa_test,
    y_train, y_test,
) = train_test_split(X, X_onset_en_ipa, y, test_size=0.33, random_state=100, stratify=None)

# List every word (train and test alike) whose gold onset tone is 2 or 3.
# X, X_onset_en_ipa and y were built by iterating `data` in order, so zipping
# them with the dict's keys lines each row up with its word without rebuilding
# the key list on every iteration.
for word_en, onset, tone_num, features in zip(data, X_onset_en_ipa, y, X):
    if tone_num in (2, 3):
        print(word_en, onset, tone_num, features)

Elijah e 3 [1, False, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 116, 0.13701067615658363, 0, 0.12, 0.1157556270096463, 0, 0, 0, 0, 0, 0.25862068965517243, 0.026923076923076925, 0.375, 0.03389830508474576, 0.043478260869565216, 0.5447154471544715, 0, 0, 0.5555555555555556, 0, 0, 0, 0, 0, 0, 0, 0.05555555555555555, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25862068965517243, 0.0603448275862069, 0.02586206896551724, 0.017241379310344827, 0.008620689655172414, 0.5775862068965517, 0, 0, 0.04310344827586207, 0, 0, 0, 0, 0, 0, 0, 0.008620689655172414, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, True]
Ezekiel e 3 [1, False, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 116, 0.13701067615658363, 0, 0.12, 0.1157556270096463, 0, 0, 0, 0, 0, 0.25862068965517243, 0.026923076923076925, 0.375, 0.03389830508474576, 0.043478260869565216, 0.5447154471544715, 0, 0, 0.5555555555555556, 0, 0, 0, 0, 0, 0, 0, 0.05555555555555555, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
# first make a prediction using the onset_zh model
#
# Cascade over the test rows:
#   1. Ask each step classifier in turn for a hanzi. The first prediction that
#      falls inside that step's confident set wins, and its tone is read off
#      the zhuyin transcription of the character.
#   2. If no step is confident, the onset "ju" is hard-wired to tone 2;
#      every other onset goes to the direct tone classifier.

# Tone number for each zhuyin tone mark; any other final character -> tone 1.
ZHUYIN_TONE_MARKS = {"ˊ": 2, "ˇ": 3, "ˋ": 4, "˙": 5}

predictions = []

for x, onset_en_ipa in zip(X_test, X_onset_en_ipa_test):
    for clf_onset_zh_step_n, confident_hanzi in zip(clf_onset_zh_set, confident_onset_zh_list):
        prediction_onset_zh = clf_onset_zh_step_n.predict([x])[0]
        if prediction_onset_zh not in confident_hanzi:
            continue

        # Last character of the first zhuyin syllable carries the tone mark.
        onset_tone = hanzi.to_zhuyin(prediction_onset_zh).split(" ")[0][-1]
        onset_tone_num = ZHUYIN_TONE_MARKS.get(onset_tone, 1)

        predictions.append(onset_tone_num)
        break
    else:
        # No step classifier produced a confident hanzi for this row.
        if onset_en_ipa == "ju":
            predictions.append(2)
        else:
            predictions.append(int(clf_onset_tone_num.predict([x])[0]))

0.7866666666666666


In [26]:
# Project-local reporting helper; reloaded so that re-running this cell
# re-executes the module source.
import print_metrics
importlib.reload(print_metrics)

# Report metrics for the cascade's predictions on the held-out rows. The
# classifier list is the step classifiers followed by the direct tone model.
# NOTE(review): `y_labels` is not assigned in any cell visible here; unless it
# is defined elsewhere, this call raises NameError on a fresh kernel
# (Restart & Run All). Define it explicitly before this call.
print_metrics.print_metrics(
    X_test,
    y_test,
    predictions,
    X_labels,
    y_labels,
    clf_list=[*clf_onset_zh_set, clf_onset_tone_num],
    print_probas=False,
    print_sandhi_effects=False,
)

DecisionTreeClassifier #0:
DecisionTreeClassifier #1:
DecisionTreeClassifier #2:
DecisionTreeClassifier #3:
DecisionTreeClassifier #4:
DecisionTreeClassifier #5:
DecisionTreeClassifier #6:
DecisionTreeClassifier #7:
{'1': 184, '2': 3, '3': 7, '4': 106} {'1': 173, '2': 2, '3': 0, '4': 61}
1 accuracy: 173/184 (94.02173913043478%)
2 accuracy: 2/3 (66.66666666666666%)
3 accuracy: 0/7 (0.0%)
4 accuracy: 61/106 (57.54716981132076%)
[[173   1   0  10]
 [  1   2   0   0]
 [  5   0   0   2]
 [ 45   0   0  61]]
0.5552522645753399
DecisionTreeClassifier #0:
Feature initial_stress_en w/ importance 0.01713227991967963
Feature first_stress_en w/ importance 0.0324625285401114
Feature F(Onset) w/ importance 1.2925012909881808e-05
Feature P(Onset|Hanzi=奥) w/ importance 0.001408146143338982
Feature P(Onset|Hanzi=艾) w/ importance 0.00265381388552345
Feature P(Onset|Hanzi=亚) w/ importance 0.009159372219857053
Feature P(Onset|Hanzi=埃) w/ importance 0.0052023176962245395
Feature P(Onset|Hanzi=欧) w/ importan

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
