In [6]:
import json
import pickle
import importlib
import numpy as np
import pandas as pd
import scipy.stats as stats
from dragonmapper import hanzi
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, _tree
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score, classification_report, matthews_corrcoef

import transform_data
importlib.reload(transform_data)
import calculate_frequencies
importlib.reload(calculate_frequencies)

<module 'calculate_frequencies' from 'd:\\Research\\tonal-adaptation\\calculate_frequencies.py'>

In [7]:
# import data
data = {}

with open("./data/output_data.json", "r") as output_data_file:
    try:
        data = json.load(output_data_file)["words"]
    except Exception as e:
        print(e)

onset_frequencies = { }
onset_zh_frequencies = { }
tone_frequencies = { }

for word_en, word_data in data.items():
    onset = word_data["onset_en_ipa"]
    onset_frequencies[onset] = onset_frequencies[onset] + 1 if onset in onset_frequencies else 1

    onset_zh = word_data["word_zh"][0]
    onset_zh_frequencies[onset_zh] = onset_zh_frequencies[onset_zh] + 1 if onset_zh in onset_zh_frequencies else 1

    tone = word_data["onset_tone_num"]
    tone_frequencies[tone] = tone_frequencies[tone] + 1 if tone in tone_frequencies else 1

frequencies, cond_probs = calculate_frequencies.get_data()

onset_en_ipa_list = list(onset_frequencies.keys())
onset_zh_list = list(onset_zh_frequencies.keys())

In [8]:
# handle data
X = []
y = []

X_labels = []
X_labels.append("Onset N(V)")
X_labels.append("Onset Length")
X_labels += ["close", "near-close", "close-mid", "mid", "open-mid", "near-open", "open"]
X_labels += ["front", "central", "back"]
X_labels.append("Onset N(C)")
X_labels.append("initial_stress_en")
X_labels.append("first_stress_en")
X_labels.append(f"F(Onset)")
X_labels += [f"P(Onset|T={tone})" for tone in range(1,5)]
X_labels += [f"P(T={tone}|Onset)" for tone in range(1,5)]
X_labels += [f"P(Onset|Hanzi={onset_zh})" for onset_zh in onset_zh_list]
X_labels += [f"P(Hanzi={onset_zh}|Onset)" for onset_zh in onset_zh_list]
# X_labels += ["N(Syllables)"]
# X_labels += ["Onset%Word"]
X_labels += ["Bisyllabic"]

y_labels = ["1", "2", "3", "4"]
# y_labels = ["Tone 1", "Tone 2", "Tone 3", "Tone 4"]
# y_labels = ["Tone 1/2", "Tone 3", "Tone 4"]
# y_labels = ["Tone 1/2", "Tone 3/4"]
# y_labels = ["Tone 1", "Tone 2/3/4"]
# y_labels = ["Tone 1/3", "Tone 2/4"]

confident_onset_zh_list = [
    # "伊", "安", # 1
    # "艾", # 2
    # "埃", # 3
    # "阿", # 4
    # "亚", "以", "奥" # 5
    ###
    # "艾", # 1
    # "安", # 1
    # "伊", # 1
    # "亚", # 2
    # "埃", # 2
    # "奥", # 2
    # "阿", # 3
    # "以", # 4
    # "爱" # 5
    ###
    "英",
    "乌",
    "厄",
    "安",
    "尤",
    "恩",
    "奥",
    "阿",
    "伊",
    "埃",
    "欧",
    "艾",
    "亚"
]

for word_en, word_data in data.items():
    if word_data["word_zh"][0] in confident_onset_zh_list:
        continue

    x = transform_data.vowel_qualities(word_data["onset_en_ipa"])
    x.append(len(onset)-x[0])

    x.append(word_data["stresses_en"][0])

    x.append(word_data["stresses_en"].index(1))

    x.append(onset_frequencies[word_data["onset_en_ipa"]])

    for tone in [1,2,3,4]:
        x.append(cond_probs["onset_tone_num"][str(tone)].get(onset, False) or 0)

    for tone in [1,2,3,4]:
        x.append(cond_probs["onset_en_ipa"][onset].get(str(tone), False) or 0)

    for onset_zh in onset_zh_list:
        x.append(cond_probs["onset_zh"][onset_zh].get(str(onset), False) or 0)

    for onset_zh in onset_zh_list:
        x.append(cond_probs["onset_en_ipa-onset_zh"][onset].get(str(onset_zh), False) or 0)

    x.append(len(word_data["syllables_en_arpa"]) >= 2)

    # x.append(len(word_data["onset_en_ipa"])/len("".join(word_data["syllables_en_ipa"])))

    X.append(x)
    y.append(str(word_data["onset_tone_num"]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100, stratify=y)

In [9]:
# create, train, and test model
clf = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=100, min_samples_split=2, min_samples_leaf=5)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

with open("./models_expanded/clf_onset_tone_num.pkl", "wb") as clf_onset_tone_num_file:
    pickle.dump(clf, clf_onset_tone_num_file)

In [10]:
import print_metrics
importlib.reload(print_metrics)

print_metrics.print_metrics(X_test, y_test, y_pred, X_labels, y_labels, clf=clf, print_probas=False, print_sandhi_effects=False)

([1, False, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0.10854092526690391, 0, 0, 0.09646302250803858, 0, 0, 0, 0, 0.14935064935064934, 0.05172413793103448, 0.2230769230769231, 0, 0, 0.043478260869565216, 0, 0.14285714285714285, 0.01818181818181818, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25274725274725274, 0.06593406593406594, 0.6373626373626373, 0, 0, 0.01098901098901099, 0, 0.02197802197802198, 0.01098901098901099, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, True], '4', '1')
([1, False, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 116, 0.10854092526690391, 0, 0, 0.09646302250803858, 0, 0, 0, 0, 0.14935064935064934, 0.05172413793103448, 0.2230769230769231, 0, 0, 0.043478260869565216, 0, 0.14285714285714285, 0.01818181818181818, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25274725274725274, 0.06593406593406594, 0.6373626373626373, 0, 0, 0.010989010

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
