# Setup

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `modules.metric`) are reloaded before each cell runs and
# local code edits take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys

# Walk up from the current working directory until the project root folder is
# found, then put it first on sys.path so the project's `modules` package can
# be imported from any sub-folder of the repository.
root_folder_name = "bertaphore"
p = os.getcwd()
while os.path.basename(p) != root_folder_name:
    parent = os.path.dirname(p)
    if parent == p:
        # Reached the filesystem root: dirname() no longer changes the path, so
        # without this guard the loop would spin forever.
        raise RuntimeError(
            f"Could not find a parent folder named {root_folder_name!r} "
            f"above {os.getcwd()!r}; start the notebook from inside the project."
        )
    p = parent
sys.path.insert(0, p)

import torch
import datetime
from conllu import parse
import pandas as pd
import numpy as np
import plotly.express as px
from modules import metric
from transformers import AutoTokenizer, AutoModel, utils
utils.logging.set_verbosity_error()  # Suppress standard warnings
from bertviz import model_view, head_view
from scipy.linalg import toeplitz
import seaborn as sns
import matplotlib.pyplot as plt
import scipy


# Checkpoint identifier on the HuggingFace hub (browse models at https://huggingface.co/models).
model_name = "microsoft/xtremedistil-l12-h384-uncased"
# output_attentions=True configures the model to also return its attention values.
model = AutoModel.from_pretrained(
    model_name,
    output_attentions=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Data processing

In [None]:
# Attentional-burden scorer built on the loaded model and tokenizer; it is
# reused by the later cells of this notebook.
burdenizer = metric.AttentionalBurden(
    model,
    tokenizer,
    prune_cls_sep=True,
    normalize_attention=True,
)
# Quick check on a two-sentence example; the returned value is the cell output.
burdenizer.compute_burden("Alice is eating pizza. I like Bob.")

In [None]:
# Order in which the universal dependency relations are laid out on plot axes.
ORDERING_DEPREL_UNIVERSAL = [
    'punct', 'case', 'nsubj', 'det', 'nmod', 'root', 'obl', 'obj', 'amod',
    'advmod', 'conj', 'compound', 'aux', 'cc', 'mark', 'cop', 'acl',
    'advcl', 'xcomp', 'flat', 'nummod', 'ccomp', 'appos', 'parataxis',
    'discourse', 'ref', 'iobj', 'fixed', 'expl', 'csubj', 'dep', 'list',
    'reparandum', 'vocative', 'dislocated', 'goeswith', 'orphan',
]

# English UD treebanks processed by this notebook ("Atis" is currently disabled).
datasets_ud_english = [
    "GUM",
    "EWT",
    "LinES",
    "ParTUT",
    # "Atis",
]

In [None]:
def parse_sentence_dependencies(data):
    """Flatten parsed CoNLL-U sentences into basic and enhanced dependency rows.

    Parameters
    ----------
    data : iterable
        Parsed sentences; each sentence is an iterable of token mappings
        exposing the keys "id", "form", "deprel", "head" and "deps".

    Returns
    -------
    tuple[list, list]
        ``(l_head, l_deprel)``. Both lists contain tuples
        ``(sentence_id, token_id, id, form, deprel, head)``:
        ``l_head`` has one row per kept token (basic dependency) and
        ``l_deprel`` one row per entry of the token's "deps" list.
        ``sentence_id`` is the 0-based index of the sentence in ``data`` and
        ``token_id`` the 1-based position of the token among the kept tokens.

    Notes
    -----
    Multiword-token range lines (ids of the form ``(start, "-", end)``) are
    skipped. They do not consume a position: counting them would shift every
    following token by one and inflate both the relation lengths and the
    sentence length derived from ``token_id``.
    """
    l_head, l_deprel = [], []
    for sentence_id, sentence in enumerate(data):
        position = 0  # 1-based position among the tokens actually kept
        for token in sentence:
            id_ud = token["id"]
            if isinstance(id_ud, tuple) and len(id_ud) == 3 and id_ud[1] == "-":
                continue
            position += 1
            l_head.append((sentence_id, position, id_ud, token["form"], token["deprel"], token["head"]))
            deps = token["deps"]
            if isinstance(deps, list):
                # Each dep is expected to be a (deprel, head) tuple.
                for dep in deps:
                    l_deprel.append((sentence_id, position, id_ud, token["form"],) + dep)
    return l_head, l_deprel

def build_dfUD(l_head, l_deprel):
    """Build the per-relation DataFrame from the flattened dependency rows.

    Parameters
    ----------
    l_head, l_deprel : list of tuples
        Rows ``(sentence_id, token_id, id, form, deprel, head)`` as produced by
        ``parse_sentence_dependencies`` (basic and enhanced dependencies).

    Returns
    -------
    pandas.DataFrame
        De-duplicated union of both inputs with columns ``sentence_id``,
        ``form``, ``token_id``, ``token_head``, ``deprel_universal``,
        ``length_relation`` (``token_head - token_id``) and
        ``abs_length_relation``.

    Raises
    ------
    AssertionError
        If the two inputs do not describe the same tokens, or if a missing
        value remains after dropping the rows without head.
    """
    raw_columns = ["sentence_id", "token_id", "id", "form", "deprel", "head"]
    token_key = ["sentence_id", "token_id", "form"]
    kept_columns = ["sentence_id", "form", "token_id", "token_head", "deprel_universal"]

    def _unique_tokens(frame):
        # Distinct tokens of a frame, re-indexed so two frames compare row by row.
        return frame[token_key].drop_duplicates().reset_index(drop=True)

    def _resolve_heads(frame):
        # Translate the UD id stored in "head" into the position ("token_id")
        # of the head token inside the same sentence.
        lookup = frame[["sentence_id", "id", "token_id"]].drop_duplicates()
        lookup = lookup.rename(columns={"token_id": "token_head"})
        resolved = frame.merge(
            lookup,
            how="left",
            left_on=["sentence_id", "head"],
            right_on=["sentence_id", "id"],
            suffixes=("", "_head"),
        )
        resolved = resolved.drop(columns=["id_head"])
        # Keep only the universal part of the label ("nmod:poss" -> "nmod").
        resolved["deprel_universal"] = resolved["deprel"].apply(lambda rel: rel.split(":")[0])
        # Root relations are given the conventional head position 0.
        resolved.loc[resolved["deprel"] == "root", "token_head"] = 0
        resolved["token_head"] = resolved["token_head"].astype(int)
        return resolved

    df_head = pd.DataFrame(l_head, columns=raw_columns)
    df_deprel = pd.DataFrame(l_deprel, columns=raw_columns)

    # Sanity checks: same tokens on both sides (when enhanced deps exist) and
    # no missing value once head-less rows are removed.
    if len(df_deprel) != 0:
        assert (_unique_tokens(df_head) == _unique_tokens(df_deprel)).all(axis=None)
    df_head = df_head.dropna(subset=["head"])
    assert not df_head.isna().any().any()
    assert not df_deprel.isna().any().any()

    df_deprel = _resolve_heads(df_deprel)
    df_head = _resolve_heads(df_head)

    df_ud = pd.concat([df_deprel[kept_columns], df_head[kept_columns]]).drop_duplicates()
    df_ud["length_relation"] = df_ud["token_head"] - df_ud["token_id"]
    df_ud["abs_length_relation"] = df_ud["length_relation"].abs()
    return df_ud

def process_data(data):
    """Turn parsed CoNLL-U sentences into the per-relation DataFrame.

    Chains ``parse_sentence_dependencies`` and ``build_dfUD`` and returns the
    DataFrame produced by the latter.
    """
    return build_dfUD(*parse_sentence_dependencies(data))

def calculate_attentionalScores(data, verbose=False):
    """Score every sentence of ``data`` with the notebook-level ``burdenizer``.

    Parameters
    ----------
    data : iterable
        Parsed sentences exposing their raw text as ``sentence.metadata["text"]``.
    verbose : bool, default False
        When True, print a progress line every 100 sentences.

    Returns
    -------
    dict
        ``{"score_attentional": [...], "length_attentional": [...]}`` with one
        entry per sentence: the value returned by ``burdenizer.compute_burden``
        and the length of ``burdenizer.tokens`` read right after that call.
    """
    scores, lengths = [], []
    for idx, sentence in enumerate(data):
        if verbose and idx % 100 == 0:
            print(f"Processing sentence {idx}...")
        scores.append(burdenizer.compute_burden(sentence.metadata["text"]))
        # `tokens` is read after compute_burden so it refers to the current sentence.
        lengths.append(len(burdenizer.tokens))
    return {"score_attentional": scores, "length_attentional": lengths}

In [None]:
# Local checkout of the Universal Dependencies 2.14 treebanks.
ud_root = "/home/pierrick/datasets/Universal Dependencies 2.14/ud-treebanks-v2.14"

l_df = []
l_df_attentional = []
for dataset in datasets_ud_english:
    print(f"Processing {dataset}...")

    # Parse the three splits. `with` closes each file handle, and the explicit
    # encoding avoids depending on the platform default (CoNLL-U files are UTF-8).
    data_by_set = {}
    for set_label in ["train", "test", "dev"]:
        path = f"{ud_root}/UD_English-{dataset}/en_{dataset.lower()}-ud-{set_label}.conllu"
        with open(path, "r", encoding="utf-8") as f:
            data_by_set[set_label] = parse(f.read())

    # One relation table per split, tagged with its split and dataset.
    l_df_set = []
    for set_label, data in data_by_set.items():
        df_set = process_data(data)
        df_set["set"] = set_label
        l_df_set.append(df_set)
    df = pd.concat(l_df_set)
    df["dataset"] = dataset
    l_df.append(df)

    # attentional burden
    # for set_label, data in data_by_set.items():
    #     print(f"Processing attentional burden for {dataset} {set_label}...")
    #     d = calculate_attentionalScores(data, verbose=True)
    #     df_attentional = pd.DataFrame(d).reset_index().rename(columns={"index":"sentence_id"})
    #     df_attentional["dataset"] = dataset
    #     df_attentional["set"] = set_label
    #     l_df_attentional.append(df_attentional)

df_ud_english = pd.concat(l_df)
# df_attentional = pd.concat(l_df_attentional)
# df_attentional = df_attentional.merge(df_ud_english.groupby(["sentence_id", "dataset", "set"]).agg({"token_id":"max"}).reset_index()
                    #  , how="left", on=["sentence_id", "dataset", "set"]).rename(columns={"token_id":"length_linguistics"})
# display(df_ud_english, df_attentional)

# save (timestamped so earlier exports are never overwritten)
now = datetime.datetime.now()
ts = now.strftime('%Y%m%d_%H%M')
df_ud_english.to_csv(f"../data/df_ud_english_{ts}.csv", index=False)
# df_attentional.to_csv(f"../data/df_attentional_{ts}.csv", index=False)

In [None]:
# Reload previously exported snapshots (note that the two files carry different timestamps).
df_ud_english = pd.read_csv("../data/df_ud_english_20241006_2219.csv")
df_attentional = pd.read_csv("../data/df_attentional_20241004_1208.csv")
# A single display() call shows both frames without echoing a `(None, None)` tuple.
display(df_ud_english, df_attentional)

# Visualization

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Attach to every relation the number of rows sharing its universal label; it
# drives the hue of the boxes.
df = df_ud_english.copy()
df["size"] = df_ud_english.groupby("deprel_universal").transform("size")
ax = sns.boxplot(
    data=df,
    x="length_relation",
    y="deprel_universal",
    hue="size",
    order=ORDERING_DEPREL_UNIVERSAL,
    showfliers=False,
    ax=ax,
)
# Dashed line at 0 separates negative from positive relation lengths.
ax.axvline(x=0, color='gray', linestyle='--')
plt.title("Length of relations")

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Attach to every relation the number of rows sharing its universal label; it
# drives the hue of the boxes.
s = pd.Series(df_ud_english.groupby("deprel_universal").transform("size"), name="size")
df = pd.concat([df_ud_english, s], axis=1)
ax = sns.boxplot(data=df, y="deprel_universal", hue="size", x="abs_length_relation", showfliers=False
                 , order=ORDERING_DEPREL_UNIVERSAL, ax=ax)
ax.axvline(x=0, color='gray', linestyle='--')
# The x axis shows absolute lengths, so the title says so (it used to repeat the
# title of the signed-length figure).
plt.title("Absolute length of relations")

In [None]:
fig, ax = plt.subplots(figsize=(8, 20))

# Boxes are coloured by dataset, so the per-relation "size" column computed for
# the earlier figures is not needed here and the frame is plotted directly.
ax = sns.boxplot(data=df_ud_english, y="deprel_universal", hue="dataset", x="length_relation", showfliers=False
                 , order=ORDERING_DEPREL_UNIVERSAL, ax=ax)
# Dashed line at 0 separates negative from positive relation lengths.
ax.axvline(x=0, color='gray', linestyle='--')
plt.title("Length of relations")

In [None]:
fig, ax = plt.subplots(figsize=(8, 20))

# Boxes are coloured by dataset, so the per-relation "size" column computed for
# the earlier figures is not needed here and the frame is plotted directly.
ax = sns.boxplot(data=df_ud_english, y="deprel_universal", hue="dataset", x="abs_length_relation", showfliers=False
                 , order=ORDERING_DEPREL_UNIVERSAL, ax=ax)
ax.axvline(x=0, color='gray', linestyle='--')
# The x axis shows absolute lengths, so the title says so (it used to repeat the
# title of the signed-length figure).
plt.title("Absolute length of relations")

In [None]:
# One bar per dataset: number of dependency relations it contributes.
ax = sns.countplot(
    data=df_ud_english,
    x="dataset",
    order=datasets_ud_english,
)
ax.set_title("Number of relations by dataset")
ax.set_ylabel("Number of relations")

# comment: there are two big datasets (EWT and LinES) and three smaller datasets (GUM, ParTUT, Atis)

In [None]:
# One row per distinct (sentence, dataset, split) so that bars count sentences,
# not relations.
sentences = df_ud_english[["sentence_id", "dataset", "set"]].drop_duplicates()
ax = sns.countplot(
    data=sentences,
    x="dataset",
    hue="set",
    order=datasets_ud_english,
)
ax.set_title("Number of sentences by dataset and set")
ax.set_ylabel("Number of sentences")

In [None]:
# Sentence length is taken as the largest token position seen in the sentence.
df = df_ud_english.groupby(["dataset", "set", "sentence_id"], as_index=False)["token_id"].max()
ax = sns.boxplot(
    data=df,
    x="dataset",
    y="token_id",
    hue="set",
    order=datasets_ud_english,
    showfliers=False,
)
ax.set_title("Length of sentences")
ax.set_ylabel("Tokens")

# comment: ATIS is not like the others, less variability in the length of sentences and shorter sentences

In [None]:
# Total number of tokens per dataset: sum over sentences of the largest token position.
tokens_per_dataset = (
    df_ud_english.groupby(["sentence_id", "dataset", "set"]).agg({"token_id": "max"})
    .reset_index()
    .groupby(["dataset"]).agg({"token_id": "sum"})
)
# Number of occurrences of every universal relation in every dataset.
relations_per_dataset = (
    df_ud_english.groupby(["deprel_universal", "dataset"], as_index=False).size()
    .set_index(["dataset"])
)
df = pd.merge(tokens_per_dataset, relations_per_dataset,
              left_index=True, right_index=True, how="inner")
df["freq"] = df["size"] / df["token_id"]

# Sort the rows in the reference relation order before plotting.
deprel_rank = {deprel: rank for rank, deprel in enumerate(ORDERING_DEPREL_UNIVERSAL)}
df_sorted = df.sort_values(by="deprel_universal", key=lambda col: col.map(deprel_rank))

fig, ax = plt.subplots(figsize=(10, 8))
ax.grid(True)
# Title and x label are set before the seaborn call, as in the original statement order.
ax.set_title("Frequency of relations by dataset")
ax.set_xlabel("Frequency (relation by token)")
sns.scatterplot(data=df_sorted, y="deprel_universal", x="freq", hue="dataset", ax=ax)

# comment: ATIS is not like the others, often its point is an outlier and has no punct for example (speech dataset)

In [None]:
# Total number of tokens per dataset: sum over sentences of the largest token position.
tokens_per_dataset = (
    df_ud_english.groupby(["sentence_id", "dataset", "set"]).agg({"token_id": "max"})
    .reset_index()
    .groupby(["dataset"]).agg({"token_id": "sum"})
)
# Per (relation, dataset): number of occurrences ("size") and mean absolute length.
relation_stats = (
    df_ud_english.groupby(["deprel_universal", "dataset"], as_index=False)
    .agg({"token_id": "count", "abs_length_relation": "mean"})
    .set_index(["dataset"])
    .rename(columns={"token_id": "size"})
)
df = pd.merge(tokens_per_dataset, relation_stats,
              left_index=True, right_index=True, how="inner").reset_index()
df["freq"] = df["size"] / df["token_id"]

# One facet per dataset: relation frequency against mean absolute relation length.
px.scatter(
    df,
    x="freq",
    y="abs_length_relation",
    color="deprel_universal",
    facet_col="dataset",
    hover_data=["deprel_universal"],
    category_orders={"dataset": datasets_ud_english},
)

# comment: again, ATIS seems different; a Wasserstein/bottleneck distance could be used to quantify this

### Attentional score and sentence lengths

In [None]:
# Number of sentences per length value, separately for the two length measures.
df = (
    df_attentional[["length_linguistics", "length_attentional"]]
    .melt()
    .groupby(["variable", "value"], as_index=False)
    .size()
    .rename(columns={"size": "count", "variable": "type", "value": "length"})
)
fig = px.line(df, x="length", y="count", color="type")
fig.show()

# comment: something is weird for short sentences, better to start at 4 tokens

In [None]:
# Compare the two sentence-length measures, one facet per dataset.
px.scatter(
    df_attentional,
    x="length_linguistics",
    y="length_attentional",
    facet_col="dataset",
)

In [None]:
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade, rasterize, shade, dynspread, spread
from holoviews.operation.resample import ResampleOperation2D
from holoviews.operation import decimate
from holoviews.operation.datashader import datashade
hv.extension('bokeh', 'matplotlib', width=100)
# Rasterized view of the two length measures against each other.
points = hv.Points(df_attentional[['length_linguistics', 'length_attentional']])
# points = decimate(points, dynamic=False, max_samples=3000)
raster = rasterize(points, width=400, height=400)
spread(raster, px=1, shape='square').relabel("Rasterized")

# comment: need to filter out sentences that have a big difference between the two lengths

In [None]:
# modify df_attentional and df_ud_english
# Keep sentences of 4 to 50 model tokens whose two length measures agree: the
# smaller of the two length ratios must exceed 0.75.
df = df_attentional.loc[(df_attentional["length_attentional"]>=4)
                   &(df_attentional["length_attentional"]<=50)]
mask = np.minimum((df["length_attentional"]/df["length_linguistics"]).values,(df["length_linguistics"]/df["length_attentional"]).values)>0.75
df = df.loc[mask]
df_attentional = df.reset_index(drop=True)

# Attach the attentional columns (and the row "index" of df_attentional) to the
# relations. The guard keeps the cell idempotent: merging a second time would
# create `_x`/`_y` duplicates and break the later groupby("index").
if "index" not in df_ud_english.columns:
    df_ud_english = df_ud_english.merge(df_attentional.reset_index(),
                        how="inner",on=["sentence_id","dataset","set"])

# A single display() call avoids echoing a `(None, None)` tuple.
display(df_attentional, df_ud_english)

In [None]:
hv.extension('bokeh', 'matplotlib', width=100)
# Same rasterized view as before, drawn from the current df_attentional.
points = hv.Points(df_attentional[['length_linguistics', 'length_attentional']])
# points = decimate(points, dynamic=False, max_samples=3000)
raster = rasterize(points, width=400, height=400)
spread(raster, px=1, shape='square').relabel("Rasterized")


In [None]:
# Mean attentional score for every sentence length (in model tokens).
df = df_attentional.groupby("length_attentional", as_index=False)["score_attentional"].mean()

# Reference curve: k-hop burden with k = ceil(n / 2) for a text of n tokens.
# Other baselines kept for reference:
# df["max_burden"] = df["length_attentional"].apply(lambda n: metric.compute_maxAttentionalBurden(burdenizer.w_forward, burdenizer.w_backward, n)).astype(float)
# df["uniform_burden"] = df["length_attentional"].apply(lambda n: metric.compute_uniformAttentionalBurden(burdenizer.w_forward, burdenizer.w_backward, n))
# df["max_burden_equal_vertices"] = df["length_attentional"].apply(lambda n: metric.compute_maxAttentionalBurdenEqualVertices(burdenizer.w_forward, burdenizer.w_backward, n))
# df[f"2_hop_burden"] = df["length_attentional"].apply(lambda n: metric.compute_kHopAttentionalBurden(burdenizer.w_forward, burdenizer.w_backward, n, 2))
# df[f"9_hop_burden"] = df["length_attentional"].apply(lambda n: metric.compute_kHopAttentionalBurden(burdenizer.w_forward, burdenizer.w_backward, n, 9))
df["half_hop_burden"] = df["length_attentional"].apply(
    lambda n: metric.compute_kHopAttentionalBurden(
        burdenizer.w_forward, burdenizer.w_backward, n, int(np.ceil(n/2))
    )
)

# Long format: one row per (length, curve). Renaming columns that are absent is a no-op.
df = (
    df.rename(columns={
        "score_attentional": "norme",
        "max_burden": "max",
        "uniform_burden": "uniform",
    })
    .melt(id_vars=["length_attentional"])
    .dropna()
)

fig = px.line(df, x="length_attentional", y="value", color="variable")
fig.update_layout(
    yaxis_range=[0,15],
    xaxis_title="Longueur du texte",
    yaxis_title="Score attentionnel",
    legend_title="Légende",
)
fig.show()

### Correlation between attentional score and linguistic score

In [None]:
# For every relation type and every sentence length, Spearman correlation
# between the attentional score of a sentence and the mean absolute length of
# its relations of that type.
spearman_columns = ["deprel", "length_attentional", "correlation", "pvalue", "size"]
l_df_spearman = []
for deprel in df_ud_english["deprel_universal"].unique():
    # Linguistic score per sentence, keyed by the df_attentional row "index".
    score_linguistics = (
        df_ud_english.loc[df_ud_english["deprel_universal"]==deprel]
        .groupby("index").agg({"abs_length_relation":"mean"})
        .reset_index()
        .rename(columns={"abs_length_relation":"score_linguistics"})
    )
    df = pd.merge(df_attentional, score_linguistics,
                  how="inner", left_index=True, right_on="index")
    l_spearmanr = []
    for length_attentional in np.sort(df["length_attentional"].unique()):
        a = df.loc[df["length_attentional"]==length_attentional, ["score_attentional", "score_linguistics"]].values
        result = scipy.stats.spearmanr(a[:, 0], a[:, 1])
        l_spearmanr.append((deprel, length_attentional, result.correlation, result.pvalue, len(a)))
    l_df_spearman.append(pd.DataFrame(l_spearmanr, columns=spearman_columns))

In [None]:
df_spearman = pd.concat(l_df_spearman).dropna()
df_spearman["has_significant_pvalue"] = df_spearman["pvalue"]<0.05
# Keep significant correlations computed on more than 10 sentences, and leave
# out the "punct" and "root" relations.
keep = (
    df_spearman["has_significant_pvalue"]
    & (df_spearman["size"]>10)
    & ~df_spearman["deprel"].isin(["punct", "root"])
)
df_spearman = df_spearman.loc[keep]
df_spearman

In [None]:
# One point per (relation, sentence length) that passed the filters.
fig = px.scatter(
    df_spearman,
    x="length_attentional",
    y="correlation",
    color="deprel",
    hover_data=["deprel", "pvalue", "size"],
)
fig.show()
# fig.write_html("../images/universal-dependencies/UD_EWT_spearman_correlation_linguistics_vs_attentional_by_length.html")

In [None]:
# Per relation: mean correlation, mean sample size and number of retained
# sentence lengths, most frequent first.
(
    df_spearman.groupby("deprel")
    .agg({"correlation":"mean", "size":"mean", "deprel":"count"})
    .rename(columns={"deprel":"count_deprel"})
    .sort_values("count_deprel", ascending=False)
)

### Combined relations

In [None]:
# Relations pooled together for the combined correlation below.
l_deprel_selected = [
    'nsubj',
    'aux',
    'advcl',
    'conj',
]

In [None]:
# Same correlation as above, with the selected relations pooled into a single
# linguistic score per sentence.
score_linguistics = (
    df_ud_english.loc[df_ud_english["deprel_universal"].isin(l_deprel_selected)]
    .groupby("index").agg({"abs_length_relation":"mean"})
    .reset_index()
    .rename(columns={"abs_length_relation":"score_linguistics"})
)
df = pd.merge(df_attentional, score_linguistics,
              how="inner", left_index=True, right_on="index")
l_spearmanr = []
for length_attentional in np.sort(df["length_attentional"].unique()):
    a = df.loc[df["length_attentional"]==length_attentional, ["score_attentional", "score_linguistics"]].values
    result = scipy.stats.spearmanr(a[:, 0], a[:, 1])
    l_spearmanr.append(("combined", length_attentional, result.correlation, result.pvalue, len(a)))

In [None]:
import plotly.graph_objects as go

df = pd.DataFrame(l_spearmanr, columns=["deprel", "length_attentional", "correlation", "pvalue", "size"])
df["significant_pvalue"] = df["pvalue"]<0.05

# Line of the correlation by length, overlaid with markers coloured by
# significance and sized by the number of samples.
marker_colors = df["significant_pvalue"].map({True:"Green", False:"Red"})
marker_sizes = df["size"]*0.025
fig = px.line(
    df,
    x="length_attentional",
    y="correlation",
    markers=False,
    hover_data=["pvalue", "size"],
)
fig.add_trace(
    go.Scatter(
        x=df["length_attentional"],
        y=df["correlation"],
        mode='markers',
        marker=dict(color=marker_colors, size=marker_sizes),
    )
)
fig.update_layout(title="Relation used: "+", ".join(l_deprel_selected) + " (red: p-value > 0.05, green: p-value < 0.05, size: number of samples)")
# fig.write_html("../images/universal-dependencies/UD_EWT_spearman_correlation_linguistics_vs_attentional_by_length_combined.html")
fig.update_layout(
    xaxis_title="Longueur du texte",
    yaxis_title="Correlation",
)
fig.show()

# Attention difference between two sentences

In [None]:
# Two texts that differ by a single word ("Donald" vs "him").
input_text1 = "Joe is running for president. I think I will vote for Donald. He is a good candidate."
input_text2 = "Joe is running for president. I think I will vote for him. He is a good candidate."

# Attention averaged over the first two dimensions (kept as size-1 dims).
# NOTE(review): presumably these are the layer and head dimensions; confirm in
# `metric.AttentionalBurden._get_attention`.
t_attention1 = burdenizer._get_attention(input_text1).mean(dim=[0,1], keepdim=True)
# Copy the tokens right away, in case the attribute is reused by the next call.
tokens1 = list(burdenizer.tokens)
t_attention2 = burdenizer._get_attention(input_text2).mean(dim=[0,1], keepdim=True)
tokens2 = list(burdenizer.tokens)

# The token-wise comparison below only makes sense when both texts are split
# into the same number of tokens; fail early with an explicit message instead
# of an IndexError or a tensor shape error further down.
if len(tokens1) != len(tokens2):
    raise ValueError(
        f"Texts must tokenize to the same length to be compared: "
        f"{len(tokens1)} tokens vs {len(tokens2)} tokens"
    )

# Label each position with its token, or "token1/token2" where the texts differ.
tokens_merged = [
    tok1 if tok1 == tok2 else f"{tok1}/{tok2}"
    for tok1, tok2 in zip(tokens1, tokens2)
]

# normalize
t_attention1 = t_attention1/t_attention1.abs().sum()
t_attention2 = t_attention2/t_attention2.abs().sum()

# Positive parts of the difference in each direction, stacked along dim 1:
# first where text 1 receives more attention, then where text 2 does.
# torch.cat is used as the spelling available in every torch version.
t = torch.cat([torch.relu(t_attention1 - t_attention2)
               , torch.relu(t_attention2 - t_attention1)]
              , dim=1)

print(f"Attentional burden 1: {burdenizer.compute_burden(input_text1)}")
print(f"Attentional burden 2: {burdenizer.compute_burden(input_text2)}")
# The 1500 factor rescales the normalized differences before display
# (presumably so they are visible in the viewer).
head_view([t*1500], tokens=tokens_merged)