# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
root_folder_name = "bertaphore"
p = os.getcwd()
while os.path.basename(p) != root_folder_name:
    p = os.path.dirname(p)
sys.path.insert(0, p)

import torch
from conllu import parse
import pandas as pd
import numpy as np
import plotly.express as px
from modules import metric
from transformers import AutoTokenizer, AutoModel, utils
utils.logging.set_verbosity_error()  # Suppress standard warnings
from bertviz import model_view, head_view
from scipy.linalg import toeplitz


# Checkpoint identifier; popular HuggingFace models are listed at https://huggingface.co/models
model_name = "microsoft/xtremedistil-l12-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_attentions=True configures the model to return attention values.
model = AutoModel.from_pretrained(
    model_name,
    output_attentions=True,
)

  from .autonotebook import tqdm as notebook_tqdm


# Experiment

In [3]:
# Attentional-burden scorer wrapping the loaded model and tokenizer.
burdenizer = metric.AttentionalBurden(
    model,
    tokenizer,
    prune_cls_sep=True,
    normalize_attention=True,
)
# Quick check on a short two-sentence example (value shown as the cell output).
burdenizer.compute_burden("Alice is eating pizza. I like Bob.")

2.373994827270508

In [4]:
# Location of the UD English-EWT training split (CoNLL-U file).
# TODO: absolute, machine-specific path; make it configurable before sharing.
CONLLU_PATH = "/home/pierrick/datasets/Universal Dependencies 2.14/ud-treebanks-v2.14/UD_English-EWT/en_ewt-ud-train.conllu"

# Read through a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 instead of relying on the platform default.
with open(CONLLU_PATH, "r", encoding="utf-8") as conllu_file:
    conllu_text = conllu_file.read()

# `data` holds the parsed treebank, one entry per sentence.
# The raw text of a sentence is available as e.g. data[4].metadata["text"].
data = parse(conllu_text)

In [5]:
# Manual selection of dependency relations, keyed by universal relation name
# (the part of `deprel` before any ":").
# True  -> the relation is kept (it contributes to the linguistic score);
# False -> the relation is ignored.
D_RELATIONS = {
    "punct": False,
    "root": False,
    "nsubj": True,
    "case": True,
    "det": True,
    "advmod": True,
    "nmod": True,
    "obl": True,
    "obj": True,
    "amod": True,
    "compound": False,
    "aux": True,
    "conj": False,
    "mark": False,
    "cc": False,
    "cop": True,
    "advcl": False,
    "acl": True,
    "xcomp": True,
    "nummod": True,
    "ccomp": False,
    "flat": True,
    "parataxis": False,
    "appos": False,
    "discourse": False,
    "iobj": False,
    "expl": False,
    "fixed": False,
    "list": False,
    "csubj": False,
    "vocative": False,
    "goeswith": False,
    "reparandum": False,
    "orphan": False,
    "dislocated": False,
    "dep": False,
    "_": False,
}

In [6]:
# For every sentence collect:
#   - l_head:   one (id, deprel, head) triple per token;
#   - l_deprel: for tokens whose "deps" field is a list, each entry of that
#               list prefixed with the token id (other tokens contribute nothing).
l_head, l_deprel = [], []
for sentence in data:
    l_head.append([(tok["id"], tok["deprel"], tok["head"]) for tok in sentence])
    l_deprel.append([
        (tok["id"],) + dep
        for tok in sentence
        if isinstance(tok["deps"], list)
        for dep in tok["deps"]
    ])

## Exploratory data analysis (EDA): dependency length per relation

In [7]:
def process_parsedData(l_parsed_data, d_relations=None):
    """Summarise dependency relations: mean absolute head distance and frequency.

    Args:
        l_parsed_data (list): A list of parsed data. Each element represents a text.
            Each text is composed of a list of tuples. Each tuple has the format (id, deprel, head).
            i.e. (position, relation, position of related)
        d_relations (dict, optional): Maps a universal relation name (the part of
            `deprel` before ":") to a boolean, True meaning the relation is kept
            for the attentional burden. Defaults to the module-level D_RELATIONS.

    Returns:
        pandas.DataFrame: One row per universal relation (index "deprel_universal"),
        sorted by ascending mean distance, with the columns:
            - abs_length_relation: mean of |head - id|; 0.0 when no distance could be computed.
            - count: number of rows whose id is a plain number.
            - keep_attentional_burden: bool; False when the relation is absent from `d_relations`.
            - manual: the value found in `d_relations` (NaN when absent).
    """
    if d_relations is None:
        d_relations = D_RELATIONS

    def _as_position(value):
        # Positions that are not plain numbers (e.g. tuples or None) cannot be
        # subtracted; map them to NaN so they drop out of the mean and the count.
        return float(value) if isinstance(value, (int, float)) else np.nan

    triples = [triple for text in l_parsed_data for triple in text]
    df = pd.DataFrame(triples, columns=["id", "deprel", "head"])
    df["id_raw"] = df["id"]
    df["id"] = df["id_raw"].apply(_as_position)
    df["head_raw"] = df["head"]
    df["head"] = df["head_raw"].apply(_as_position)
    df["length_relation"] = df["head"] - df["id"]
    df["abs_length_relation"] = np.abs(df["length_relation"]).astype(float)
    # Sub-typed relations are folded into their prefix, e.g. "nsubj:pass" -> "nsubj".
    df["deprel_universal"] = df["deprel"].apply(lambda u: u.split(":")[0])

    # Aggregate per universal relation; NaN means sort last and are then set to 0.
    df = (
        df.groupby("deprel_universal")
        .agg({"abs_length_relation": "mean", "id": "count"})
        .sort_values("abs_length_relation")
        .rename(columns={"id": "count"})
    )
    df["abs_length_relation"] = df["abs_length_relation"].fillna(0.)

    # Explicit boolean flag: a relation is kept only when `d_relations` says so.
    # (Avoids the `np.NaN` alias, which no longer exists in NumPy 2.0.)
    df["keep_attentional_burden"] = pd.Series(
        [bool(d_relations.get(rel, False)) for rel in df.index],
        index=df.index,
        dtype=bool,
    )
    df = df.merge(pd.Series(d_relations, name="manual"), how="left", left_index=True, right_index=True)
    return df


# Aggregate the (id, deprel, head) triples of all sentences into the
# per-relation summary table and display it.
df = process_parsedData(l_head)
df

Unnamed: 0_level_0,abs_length_relation,count,keep_attentional_burden,manual
deprel_universal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fixed,1.085455,550,False,False
iobj,1.248074,649,False,False
compound,1.299519,8320,False,False
nummod,1.335391,2430,True,True
goeswith,1.360902,133,False,False
amod,1.371606,9650,True,True
flat,1.478426,1576,True,True
expl,1.561667,600,False,False
aux,1.635496,7860,True,True
det,1.648677,15954,True,True


In [8]:
# One point per universal relation: frequency against mean absolute dependency
# length, coloured by whether the relation is kept for the attentional burden.
px.scatter(
    df,
    x="count",
    y="abs_length_relation",
    color="keep_attentional_burden",
    hover_data=[df.index],
)

## Correlation between linguistic and attentional metrics

In [9]:
def calculate_linguisticsScores(l_parsed_data, d_relations=None):
    """Compute a dependency-length score and a length for every sentence.

    Args:
        l_parsed_data (list): One list per sentence of (id, deprel, head) tuples.
        d_relations (dict, optional): Maps a universal relation name (the part of
            `deprel` before ":") to a boolean telling whether the relation
            contributes to the score. Defaults to the module-level D_RELATIONS.

    Returns:
        dict: Two parallel lists with one entry per sentence:
            - "score_linguistics": sum of |id - head| over the kept relations.
            - "length_linguistics": number of tuples in the sentence.

    Raises:
        KeyError: If a relation is absent from `d_relations`.
    """
    if d_relations is None:
        d_relations = D_RELATIONS
    d = {"score_linguistics": [], "length_linguistics": []}
    for sentence in l_parsed_data:
        # Only relations flagged True contribute their head distance.
        score = sum(
            abs(u - v)
            for (u, rel, v) in sentence
            if d_relations[rel.split(":")[0]]
        )
        d["score_linguistics"].append(score)
        # NOTE(review): counts every tuple of the sentence, including rows whose
        # relation is ignored — confirm this is the intended notion of length.
        d["length_linguistics"].append(len(sentence))
    return d

def calculate_attentionalScores(data, verbose=False, scorer=None):
    """Compute the attentional burden and the token count of every sentence.

    Args:
        data (iterable): Sentences exposing their raw text as `sentence.metadata["text"]`.
        verbose (bool): When True, print a progress line every 100 sentences.
        scorer (optional): Object providing `compute_burden(text)` and a `tokens`
            attribute. Defaults to the module-level `burdenizer`.

    Returns:
        dict: Two parallel lists with one entry per sentence:
            - "score_attentional": value returned by `scorer.compute_burden(text)`.
            - "length_attentional": `len(scorer.tokens)` read right after that call.
    """
    if scorer is None:
        scorer = burdenizer
    d = {"score_attentional": [], "length_attentional": []}
    for i, sentence in enumerate(data):
        if verbose and i % 100 == 0:
            print(f"Processing sentence {i}...")
        burden = scorer.compute_burden(sentence.metadata["text"])
        d["score_attentional"].append(burden)
        # `tokens` is read after compute_burden(); presumably it holds the tokens
        # of the text just processed — confirm in modules.metric.
        d["length_attentional"].append(len(scorer.tokens))
    return d

In [10]:
# Linguistic scores come from the (id, deprel, head) triples; attentional scores
# call the burdenizer on every sentence (progress is printed every 100 sentences).
d_linguistics = calculate_linguisticsScores(l_head)
d_attentional = calculate_attentionalScores(data, verbose=True)

Processing sentence 0...
Processing sentence 100...
Processing sentence 200...
Processing sentence 300...
Processing sentence 400...
Processing sentence 500...
Processing sentence 600...
Processing sentence 700...
Processing sentence 800...
Processing sentence 900...
Processing sentence 1000...
Processing sentence 1100...
Processing sentence 1200...
Processing sentence 1300...
Processing sentence 1400...
Processing sentence 1500...
Processing sentence 1600...
Processing sentence 1700...
Processing sentence 1800...
Processing sentence 1900...
Processing sentence 2000...
Processing sentence 2100...
Processing sentence 2200...
Processing sentence 2300...
Processing sentence 2400...
Processing sentence 2500...
Processing sentence 2600...
Processing sentence 2700...
Processing sentence 2800...
Processing sentence 2900...
Processing sentence 3000...
Processing sentence 3100...
Processing sentence 3200...
Processing sentence 3300...
Processing sentence 3400...
Processing sentence 3500...
Proc

In [328]:
# Merge the two result dicts (dict union `|`) into a single frame with one row
# per sentence, then display it.
df_results = pd.DataFrame(d_linguistics | d_attentional)
df_results

Unnamed: 0,score_linguistics,length_linguistics,score_attentional,length_attentional
0,41,29,7.798654,34
1,30,18,4.516824,18
2,16,17,4.095079,18
3,29,16,3.979782,16
4,85,36,9.871046,41
...,...,...,...,...
12539,78,56,12.820340,57
12540,31,22,5.315275,22
12541,45,24,6.000471,24
12542,37,25,7.513777,30


In [329]:
# Number of sentences per length, one line per length definition
# (linguistic vs attentional).
px.line(
    (
        df_results[["length_linguistics", "length_attentional"]]
        .melt()
        .groupby(["variable", "value"], as_index=False)
        .size()
        .rename(columns={"size": "count", "variable": "type", "value": "length"})
    ),
    x="length",
    y="count",
    color="type",
)

In [344]:
# Filtering thresholds.
MIN_ATTENTIONAL_TOKENS = 5   # shortest sentence kept (attentional tokens)
MAX_ATTENTIONAL_TOKENS = 50  # longest sentence kept (attentional tokens)
MIN_LENGTH_RATIO = 0.75      # minimal agreement between the two length measures

lengths_attentional = df_results["length_attentional"]
lengths_linguistics = df_results["length_linguistics"]
# Symmetric ratio in [0, 1]: the smaller of a/b and b/a.
length_ratio = np.minimum(
    lengths_attentional / lengths_linguistics,
    lengths_linguistics / lengths_attentional,
)
keep = (
    (lengths_attentional >= MIN_ATTENTIONAL_TOKENS)
    & (lengths_attentional <= MAX_ATTENTIONAL_TOKENS)
    & (length_ratio > MIN_LENGTH_RATIO)
)
# .assign() returns a new frame, so the extra column is not written onto a
# slice of the unfiltered frame (no SettingWithCopyWarning).
df_results = df_results.loc[keep].assign(
    score_linguistics_normalized=lambda kept: kept["score_linguistics"] / kept["length_linguistics"]
)
df_results

Unnamed: 0,score_linguistics,length_linguistics,score_attentional,length_attentional,score_linguistics_normalized
0,41,29,7.798654,34,1.413793
1,30,18,4.516824,18,1.666667
2,16,17,4.095079,18,0.941176
3,29,16,3.979782,16,1.812500
4,85,36,9.871046,41,2.361111
...,...,...,...,...,...
12538,26,19,4.691922,19,1.368421
12540,31,22,5.315275,22,1.409091
12541,45,24,6.000471,24,1.875000
12542,37,25,7.513777,30,1.480000


In [345]:
# `import scipy` alone does not guarantee that the `stats` subpackage is loaded;
# import it explicitly.
import scipy.stats

# Spearman rank correlation between the attentional and linguistic scores,
# computed separately for each attentional length (groupby yields the lengths
# in ascending order). Each entry: (length, correlation, p-value, group size).
l_spearmanr = []
for length_attentional, df_same_length in df_results.groupby("length_attentional"):
    spearmanr = scipy.stats.spearmanr(
        df_same_length["score_attentional"].values,
        df_same_length["score_linguistics"].values,
    )
    l_spearmanr.append(
        (length_attentional, spearmanr.correlation, spearmanr.pvalue, len(df_same_length))
    )

In [363]:
# Per-length correlation table; a p-value below 0.05 is flagged as significant.
df = pd.DataFrame(
    l_spearmanr,
    columns=["length_attentional", "correlation", "pvalue", "size"],
)
df["has_significant_pvalue"] = df["pvalue"].lt(0.05)
# Correlation against sentence length; marker size follows the group size.
px.scatter(
    df,
    x="length_attentional",
    y="correlation",
    color="has_significant_pvalue",
    size="size",
    labels={
        "length_attentional": "nb of attentional tokens",
        "correlation": "Spearman correlation linguistics vs attentional",
    },
)

In [16]:
# Display the current contents of `df` as a table.
df

Unnamed: 0,length_attentional,correlation,pvalue,size,has_significant_pvalue
0,5,0.271434,2.615559e-06,291,True
1,6,0.31297,2.049646e-09,351,True
2,7,0.225614,2.680272e-05,340,True
3,8,0.231845,1.527832e-05,341,True
4,9,0.13438,0.01080878,359,True
5,10,0.16459,0.001282505,380,True
6,11,0.119302,0.01466503,418,True
7,12,-0.006273,0.9007165,398,False
8,13,0.072492,0.1361593,424,False
9,14,0.047493,0.3610105,372,False


# <>