In [None]:
import json
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss
import seaborn as sns
import statistics
from matplotlib.lines import Line2D
from sklearn.decomposition import PCA

from utils import read_data_into_named_tuple, qualities

# Apply seaborn's default theme to every matplotlib figure. color_codes=True
# additionally remaps matplotlib's shorthand colour codes ("b", "g", "r", ...)
# to the seaborn palette. Note that the plotting cell calls sns.set() again
# without this argument.
sns.set(color_codes=True)

In [None]:
# Load all score tables through the project helper. The analysis below reads
# the attributes experts, blanc, estime, js, berts, bleu and rouge from it.
data = read_data_into_named_tuple()

In [None]:
# Keys of the form "<code>^<code>": "en^en" first, then every other language
# paired with English in both directions.
langs = [
    "en^en",
    "en^de", "de^en",
    "en^es", "es^en",
    "en^fr", "fr^en",
    "en^it", "it^en",
    "en^af", "af^en",
    "en^hi", "hi^en",
    "en^ru", "ru^en",
]

# One row per language: (code, display name, plot colour). The three
# index-aligned lists below are all derived from this single table, so they
# cannot drift out of step with each other.
_LANGUAGE_TABLE = (
    ("en", "English", "red"),
    ("af", "Afrikaans", "pink"),
    ("de", "German", "orange"),
    ("es", "Spanish", "yellow"),
    ("fr", "French", "green"),
    ("hi", "Hindi", "blue"),
    ("it", "Italian", "cyan"),
    ("ru", "Russian", "lime"),
)
languages = [code for code, _, _ in _LANGUAGE_TABLE]
language_names = [name for _, name, _ in _LANGUAGE_TABLE]
lang_colors = [color for _, _, color in _LANGUAGE_TABLE]

# Short identifiers of the automatic measures.
scores_names_for_plot = [
    "blanc", "estime", "js", "berts", "bleu",
    "rougeL", "rouge1", "rouge2", "rouge3",
]

In [None]:
def negate_scores(scores):
    """Return a new list containing the negation of every value in `scores`.

    The input sequence itself is not modified.
    """
    flipped = []
    for value in scores:
        flipped.append(-value)
    return flipped



# Four parallel, index-aligned lists: entry i of each one describes the same
# score series (one row of the matrix that is later fed to PCA).
scores_all = []  # the score sequence itself
scores_annot = []  # short label of the measure / expert quality
scores_colors = []  # matplotlib colour of the plotted point
scores_shapes = []  # default matplotlib marker of the plotted point

# Experts: one series per quality, drawn as black squares.
for i, quality in enumerate(qualities):
    scores_all.append(data.experts[i])
    scores_annot.append(quality)
    scores_colors.append("black")
    scores_shapes.append("s")

# Automatic measures: (label, function mapping an "en^<lang>" key to a series).
# ESTIME and Jensen-Shannon are stored negated.
# NOTE(review): presumably because lower raw values are better for these two
# measures, so that all series point the same way; confirm.
_MEASURE_GETTERS = [
    ("b", lambda key: data.blanc[key]),  # BLANC
    ("e", lambda key: negate_scores(data.estime[key])),  # ESTIME
    ("js", lambda key: negate_scores(data.js[key])),  # J-Shannon
    ("bs", lambda key: data.berts[key]["bertscores_F"]),  # BERTScore
    ("bl", lambda key: data.bleu[key]),  # BLEU
    ("rL", lambda key: data.rouge[key]["rougeLsum"]),  # ROUGE-L
    ("r2", lambda key: data.rouge[key]["rouge2"]),  # ROUGE-2
    ("r3", lambda key: data.rouge[key]["rouge3"]),  # ROUGE-3
]

# For every measure append one series per language, in `languages` order,
# coloured by the matching entry of `lang_colors`.
for annot, get_scores in _MEASURE_GETTERS:
    for language in languages:
        key_lang = "en^" + language
        scores_all.append(get_scores(key_lang))
        scores_annot.append(annot)
        scores_shapes.append("o")
    scores_colors.extend(lang_colors)

# The plotting code indexes all four lists with the same position, so a
# length mismatch (e.g. len(lang_colors) != len(languages)) must fail loudly.
assert (
    len(scores_all) == len(scores_annot) == len(scores_colors) == len(scores_shapes)
), "score lists are out of step"


# Z-score every score series: subtract its mean and divide by its sample
# standard deviation (statistics.stdev), one row per series.
normed_rows = []
for score in scores_all:
    score_avg = statistics.mean(score)
    score_dev = statistics.stdev(score)
    normed_rows.append(np.array([(s - score_avg) / score_dev for s in score]))
scores_all_normed = np.array(normed_rows)

# 2-D PCA of the z-scored rows. The result is kept under its own names because
# the rank-based PCA later in this cell assigns `pca` and `scores_all_D2`;
# reusing those names here would only produce values that are thrown away.
pca_normed = PCA(n_components=2, svd_solver="full")
scores_all_normed_D2 = pca_normed.fit_transform(scores_all_normed)


# Replace every score series by the ranks of its values (scipy.stats.rankdata),
# one row per series.
scores_all_ranked = np.array([ss.rankdata(score) for score in scores_all])

# Project the rank rows onto their first two principal components. These are
# the coordinates drawn by the scatter plot.
pca = PCA(n_components=2, svd_solver="full")
scores_all_D2 = pca.fit_transform(scores_all_ranked)
# Exactly one 2-D point per score series.
assert scores_all_D2.shape == (len(scores_all), 2)


print(f"PCA explained variance: {sum(pca.explained_variance_ratio_)}")


# Marker per label. Start from the default markers recorded in scores_shapes,
# then give the automatic measures listed here their own glyph. Any label not
# listed keeps its recorded default.
shape_dict = dict(zip(scores_annot, scores_shapes))
shape_dict.update(
    {
        "b": "^",
        "e": "X",
        "js": "P",
        "bs": "d",
        "bl": ">",
        "rL": "v",
        "r2": "o",
    }
)


# Re-apply the seaborn default theme (this call does not pass color_codes).
sns.set()
size_x = 10
# NOTE(review): size_y is computed but never used; the figure below is square
# (size_x by size_x). Confirm whether (size_x, size_y) was intended.
size_y = int(size_x * 38 / 50)

plt.figure(figsize=(size_x, size_x))
alp = 0.5  # marker transparency, also reused for the legend swatches
size = 200  # scatter marker area
rot = 45  # NOTE(review): not used anywhere in this cell

# One point per score series; ROUGE-3 ("r3") is deliberately left off the plot.
for point, score_name, color in zip(scores_all_D2, scores_annot, scores_colors):
    if score_name == "r3":
        continue
    plt.scatter(
        x=point[0],
        y=point[1],
        s=size,
        color=color,
        alpha=alp,
        marker=shape_dict[score_name],
    )

# Text labels for the expert points only. The expert series are the first
# len(qualities) rows because they are appended before any automatic measure.
# The offsets are in PCA coordinate units (presumably hand-tuned); "coherence"
# gets no vertical shift.
for txt, point in zip(scores_annot[: len(qualities)], scores_all_D2):
    if txt == "coherence":
        plt.annotate(txt, (point[0] + 750, point[1]), rotation=0, size=14)
    else:
        plt.annotate(txt, (point[0] + 750, point[1] - 230), rotation=0, size=14)

marksize = 15

# Colour legend: one semi-transparent square swatch per language, in the same
# order as language_names / lang_colors.
legend_elements = [
    Line2D(
        [0],
        [0],
        marker="s",
        color=color,
        label=name,
        markerfacecolor=color,
        markersize=marksize,
        linestyle="None",
        alpha=alp,
    )
    for name, color in zip(language_names, lang_colors)
]

# Marker legend: (legend label, marker glyph) in display order. The automatic
# measures take their glyph from shape_dict; experts use a plain square.
_MEASURE_LEGEND = [
    ("BLANC", shape_dict["b"]),
    ("ESTIME", shape_dict["e"]),
    ("Jensen-Shannon", shape_dict["js"]),
    ("BERTScore", shape_dict["bs"]),
    ("BLEU", shape_dict["bl"]),
    ("ROUGE-L", shape_dict["rL"]),
    ("ROUGE-2", shape_dict["r2"]),
    ("Experts", "s"),
]
legend_measures = [
    Line2D(
        [0],
        [0],
        color="black",
        marker=marker,
        markersize=marksize,
        label=label,
        linestyle="None",
    )
    for label, marker in _MEASURE_LEGEND
]

# Keyword arguments shared by both legend boxes.
_LEGEND_STYLE = {"loc": "right", "fontsize": 14, "facecolor": "white"}
_LEGEND_TITLE_SIZE = 14

# Language (colour) legend; its title is then replaced by set_title.
legend1 = plt.legend(
    handles=legend_elements,
    bbox_to_anchor=(0.72, 0.80),
    title="Languages:",
    **_LEGEND_STYLE,
)
legend1.set_title("Languages", prop={"size": _LEGEND_TITLE_SIZE})

# Measure (marker) legend. Creating it makes it the axes' legend in place of
# legend1, so legend1 is added back as a plain artist to show both boxes.
legend2 = plt.legend(
    handles=legend_measures,
    bbox_to_anchor=(1.01, 0.8),
    title="Measures:",
    **_LEGEND_STYLE,
)
legend2.set_title("Measures", prop={"size": _LEGEND_TITLE_SIZE})
plt.gca().add_artist(legend1)

# Use the same data scale on both axes.
plt.axis("equal")

# Switch off every tick mark and tick label on all four sides.
_HIDDEN_TICKS = dict.fromkeys(
    ("bottom", "top", "labelbottom", "right", "left", "labelleft"), False
)
plt.tick_params(axis="both", which="both", **_HIDDEN_TICKS)

# Write the finished figure to disk as a tightly cropped PDF.
plt.savefig("pca.pdf", bbox_inches="tight", format="pdf")