In [81]:
import json
import importlib
import numpy as np
import pandas as pd
import scipy.stats as stats
from dragonmapper import hanzi
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, _tree
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score, classification_report, matthews_corrcoef

import transform_data
importlib.reload(transform_data)
import calculate_frequencies
importlib.reload(calculate_frequencies)

130 5 12 98


<module 'calculate_frequencies' from 'd:\\Research\\tonal-adaptation\\calculate_frequencies.py'>

In [82]:
# import data
data = {}

with open("./data/output_data.json", "r") as output_data_file:
    try:
        data = json.load(output_data_file)["words"]
    except Exception as e:
        print(e)

onset_frequencies = { }
tone_frequencies = { }

for word_en, word_data in data.items():
    onset = word_data["onset_en_ipa"]
    onset_frequencies[onset] = onset_frequencies[onset] + 1 if onset in onset_frequencies else 1

    tone = word_data["onset_tone_num"]
    tone_frequencies[tone] = tone_frequencies[tone] + 1 if tone in tone_frequencies else 1

frequencies, cond_probs = calculate_frequencies.get_data()

In [83]:
# handle data
X = []
y = []

X_labels = []
X_labels.append("Onset N(V)")
X_labels.append("Onset Length")
X_labels += ["close", "near-close", "close-mid", "mid", "open-mid", "near-open", "open"]
X_labels += ["front", "central", "back"]
X_labels.append("Onset N(C)")
X_labels.append("initial_stress_en")
X_labels.append("first_stress_en")
X_labels.append(f"F(Onset)")
X_labels += [f"P(Onset|T={tone})" for tone in range(1,5)]
X_labels += [f"P(T={tone}|Onset)" for tone in range(1,5)]

# y_labels = ["Tone 1", "Tone 2", "Tone 3", "Tone 4"]
# y_labels = ["Tone 1/2", "Tone 3", "Tone 4"]
y_labels = ["Tone 1/2", "Tone 3/4"]

for word_en, word_data in data.items():
    onset = word_data["onset_en_ipa"]

    x = transform_data.vowel_qualities(word_data["onset_en_ipa"])
    x.append(len(onset)-x[0])

    x.append(word_data["stresses_en"][0])

    x.append(word_data["stresses_en"].index(1))

    x.append(onset_frequencies[word_data["onset_en_ipa"]])

    for tone in [1,2,3,4]:
        x.append(cond_probs["onset_tone_num"][str(tone)].get(onset, False) or 0)
        x.append(cond_probs["onset_en_ipa"][onset].get(str(tone), False) or 0)

    X.append(x)
    y.append(transform_data.map_tone(word_data["onset_tone_num"]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100, stratify=y)

In [84]:
# create, train, and test model
clf = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=100, min_samples_split=10, min_samples_leaf=5)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [85]:
set_config(print_changed_only=False)
print(clf)

# display tree
fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (16,16), dpi=300)
tree = plot_tree(clf, feature_names=X_labels, class_names=y_labels, ax=axes, fontsize=6)
plt.savefig("./data/tree.png")

tree_text_raw = export_text(clf, feature_names=X_labels[:len(X[0])]).split("\n")
tree_text_numbered = []

l = 0
for line in tree_text_raw:
    if "class: " in line and "-" in line:
        prefix = "   "
    else:
        l += 1
        prefix = str(l).ljust(3)

    if "|" in line:
        tree_text_numbered.append(prefix + line)

tree_text_numbered = "\n".join(tree_text_numbered)
print(tree_text_numbered)

# print metrics
tone_comparisons = list(zip(X_test, y_test, y_pred))

tone_counts = { }
for y_label in y_labels:
    tone_counts[y_label] = 0

tone_corrects = { }
for y_label in y_labels:
    tone_corrects[y_label] = 0

print(tone_counts, tone_corrects)

# tone_2_probas = []
# tone_3_probas = []

decision_paths = clf.decision_path(X_test)
leaves = clf.apply(X_test)

for tc in range(len(tone_comparisons)):
    tone_comparison = tone_comparisons[tc]

    word_zh = list(data.values())[tc]["word_zh"]
    pinyin_with_sandhi = hanzi.to_pinyin(word_zh).lower()
    pinyin_without_sandhi = ''.join(hanzi.to_pinyin(syllable_zh) for syllable_zh in word_zh).lower()
    if pinyin_with_sandhi != pinyin_without_sandhi:
        print(f"{word_zh}: {pinyin_with_sandhi} w/ sandhi, {pinyin_without_sandhi} w/o sandhi gives: {tone_comparison[1]} vs. {tone_comparison[2]}")

    tone = tone_comparison[1]
    tone_counts[tone] = tone_counts[tone] + 1

    # proba = clf.predict_proba([tone_comparison[0]])[0]
    # tone_2_probas.append(proba[1])
    # tone_3_probas.append(proba[2])

    if tone_comparison[1] == tone_comparison[2]:
        tone_corrects[tone] = tone_corrects[tone] + 1
        # print(f"{leaves[t]} correctly predicted tone {tone+1}")
    else:
        if "2" in tone_comparison[1] or "3" in tone_comparison[1]:
            print(f"{leaves[tc]} incorrectly predicted tone {tone} (guessed tone {tone_comparison[2]} w/ decision path {decision_paths.indices[decision_paths.indptr[tc]:decision_paths.indptr[tc+1]]})")

# tone_2_probas = pd.Series(tone_2_probas)
# tone_3_probas = pd.Series(tone_3_probas)
# sig_tests_2 = pd.Series([(point - tone_2_probas.mean())/tone_2_probas.std() for point in tone_2_probas])
# sig_tests_3 = pd.Series([(point - tone_3_probas.mean())/tone_3_probas.std() for point in tone_3_probas])

# ts = 0
# for test_stat_2, test_stat_3 in zip(sig_tests_2, sig_tests_3):
#     if test_stat_2 >= 3 or test_stat_2 <= -3:
#         print(f"test_stat_2 ({test_stat_2}) significant for ({tone_comparisons[ts]})")

#     if test_stat_3 >= 3 or test_stat_3 <= -3:
#         print(f"test_stat_3 ({test_stat_3}) significant for ({tone_comparisons[ts]})")

#     ts += 1

for y_label, count in tone_counts.items():
    if count != 0:
        print(f"{y_label} accuracy: {tone_corrects[y_label]}/{tone_counts[y_label]} ({tone_corrects[y_label]/tone_counts[y_label] * 100}%)")

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred) * 100)
print(matthews_corrcoef(y_test, y_pred))
print(clf.feature_importances_)
print(classification_report(y_test, y_pred))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=5,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       random_state=100, splitter='best')
1  |--- Onset N(V) <= 1.50
2  |   |--- P(T=3|Onset) <= 0.01
3  |   |   |--- P(Onset|T=1) <= 0.00
   |   |   |   |--- class: Tone 3/4
4  |   |   |--- P(Onset|T=1) >  0.00
5  |   |   |   |--- P(T=1|Onset) <= 0.03
6  |   |   |   |   |--- open <= 0.50
   |   |   |   |   |   |--- class: Tone 1/2
7  |   |   |   |   |--- open >  0.50
   |   |   |   |   |   |--- class: Tone 1/2
8  |   |   |   |--- P(T=1|Onset) >  0.03
   |   |   |   |   |--- class: Tone 1/2
9  |   |--- P(T=3|Onset) >  0.01
10 |   |   |--- P(Onset|T=1) <= 0.01
   |   |   |   |--- class: Tone 3/4
11 |   |   |--- P(Onset|T=1) >  0.01
12 |   |   |   |--- open <= 0.50
13 |   |   | 