# Progetto di Social Computing

a.a. 2022-2023

## Attività preliminari

### Librerie e costanti

In [1]:
# Caricamento delle librerie
import os, json, random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Cartelle di salvataggio
data_folder = "./data"
out_folder = "./out"

### Funzioni

In [3]:
# Salvataggio in locale
def serialize_json(folder, filename, data):
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
    
    with open(f"{folder}/{filename}", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent = 4)
        f.close()
    print(f"Data serialized to path: {folder}/{filename}")

In [4]:
# Lettura da locale
def read_json(path):
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
        print(f"Data read from path: {path}")
        return data
    else:
        print(f"No data found at path: {path}")
        return {}

## Caricamento del dataset

Si carica il dataset fornito, frammento della combinazione dei dataset [FEVER](https://fever.ai/dataset/fever.html) e [e-FEVER](https://doi.org/10.3929/ethz-b-000453826).

In [32]:
# Si carica e si mostra il dataset fornito
df = pd.read_csv("./group_9.csv")
display(df)

Unnamed: 0,id,statement,explanation_human,explanation_model,label
0,51526,Hush (2016 film) was produced by Jason Blum.,Hush (2016 film) was produced by Trevor Macy a...,The evidence says that the film was produced b...,REFUTES
1,77465,Winter's Tale was released in 1987.,Winter's Tale was released in 2014.,The claim is that Winter's Tale was released i...,REFUTES
2,166632,Anne Rice was born in the United States of Ame...,"Anne Rice was born in New Orleans, Louisiana, ...",The claim is that Anne Rice was born in the Un...,SUPPORTS


## Creazione degli HITs

Con il dataset fornito, vogliamo creare creare dodici HITs aventi le seguenti caratteristiche:

1. contiene 3 elementi;
2. ogni elemento è dotato di 4 attributi:
   1. `id`: identificatore dello statement;
   2. `statement`: testo dello statement;
   3. `explanation`: testo della spiegazione;
   4. `label`: etichetta della spiegazione.
3. per due elementi su tre vale che `explanation` = `explanation_model`;
4. per un elemento su tre vale che `explanation` = `explanation_human`;
5. la posizione dei tre elementi deve essere casuale.

In [14]:
# Funzione per l'interpretazione delle righe del dataset
def parse_row(row, isGold = False):
    parsed = {
        "id" : row.id,
        "statement" : row.statement,
        "explanation" : row.explanation_model,
        "label" : row.label,
        "isGold" : isGold
    }

    # Se è una "domanda d'oro", la spiegazione deve essere quella fornita da un essere umano
    if isGold:
        parsed["explanation"] = row.explanation_human
    
    return parsed

In [31]:
# Creiamo tutte le possibili permutazioni dei 3 elementi secondo quanto stabilito dalla consegna
all_HITs = []

# Iteriamo sulle spiegazioni fornite dai modelli di machine learning
for model_exp in df.itertuples():
    model_HITs = []

    # Iteriamo sugli altri "statements"
    for other_stat in df.itertuples():
        if other_stat.id != model_exp.id:
            # Inseriamo la versione con la "domanda d'oro" per la spiegazione del modello (model_exp)
            other_model_HIT = [parse_row(model_exp), parse_row(model_exp, True), parse_row(other_stat)]
            # Inseriamo la versione con la "domanda d'oro" per questo statement (other_stat)
            other_gold_HIT = [parse_row(model_exp), parse_row(other_stat, True), parse_row(other_stat)]

            # Riordiniamo pseudo-casualmente gli elementi
            random.shuffle(other_model_HIT)
            random.shuffle(other_gold_HIT)
            # Aggiungiamo le HIT create a quelle relative a questa spiegazione (model_exp)
            model_HITs.append(other_model_HIT)
            model_HITs.append(other_gold_HIT)
    
    # Si concatena il tutto a tutti gli HITs possibili
    all_HITs += model_HITs

# Riposizioniamo gli elementi della lista in maniera pseudo-causale
random.shuffle(all_HITs)

# Salviamo la lista degli HIT creata
serialize_json(data_folder, "all_HITs.json", all_HITs)

Data serialized to path: ./data/all_HITs.json


### Adattamento degli HITs per Crowd_Frame

In [5]:
# Generazione stringhe casuali
def generate_random_string(n_chars):
    random_string = ""
    for i in range(n_chars):
        # Generiamo un carattere minuscolo
        random_integer = random.randint(97, 97 + 26 - 1)
        flip_bit = random.randint(0, 1)
        # Lo rendiamo casualmente maiuscolo
        random_integer = random_integer - 32 if flip_bit == 1 else random_integer
        # Concateniamo alla stringa casuale
        random_string += chr(random_integer)
    return random_string

In [9]:
# Carichiamo gli HITs generati
raw_HITs = read_json(data_folder+"/all_HITs.json")

HITs = []
id = 0
documents_number = 3

for raw_HIT in raw_HITs:
    # Creiamo l'HIT definitiva
    HIT = {
        "unit_id" : "unit_"+str(id),
        "token_input" : generate_random_string(10),
        "token_output" : generate_random_string(10),
        "documents_number" : documents_number
    }
    # Aggiungiamo i documenti
    documents = []
    for element in raw_HIT:
        # Discriminiamo le domande d'oro
        pre = "G_" if element["isGold"] else "N_"
        # Creiamo il documento con i suoi attributi e lo aggiungiamo
        document = {
            "id" : pre+str(element["id"]),
            "statement" : element["statement"],
            "label" : element["label"],
            "explanation" : element["explanation"]
        }
        documents.append(document)
    # Associamo i documenti riformattati all'HIT
    HIT["documents"] = documents
    # Aggiungiamo l'HIT all'insieme delle HIT
    HITs.append(HIT)
    # Incremetiamo il valore dell'id
    id += 1

# Si esporta il tutto oome file JSON
serialize_json(out_folder, "hits.json", HITs)

Data read from path: ./data/all_HITs.json
Data serialized to path: ./out/hits.json


## Analisi dei risultati

### Percent agreement per le variabili categoriali

In [33]:
# Calcolo del percent-agreement (pa)
def get_pair_pa(df, dimension, w1, w2):
    dim_name = "doc_"+dimension+"_value"
    # Righe con i dati di w1 e w2
    w1_rows = df.loc[df["worker_id"] == w1]
    w2_rows = df.loc[df["worker_id"] == w2]
    # Documenti valutati da w1 e w2
    w1_docs = list(w1_rows["doc_id"])
    w2_docs = list(w2_rows["doc_id"])

    # Documenti valutati sia da w1 che da w2
    docs = [doc for doc in w1_docs if doc in w2_docs]

    if (len(docs) == 0):
        # Valore indefinito se non ci sono documenti in comune
        return -1
    else:
        total = len(docs)
        n_agree = 0
        # Righe di w1 e w2 contenenti documenti in comune
        w1_rows = w1_rows[w1_rows["doc_id"].isin(docs)]
        w2_rows = w2_rows[w2_rows["doc_id"].isin(docs)]

        for doc in docs:
            # Risposta di w1 per dimension
            v1 = list(w1_rows[w1_rows["doc_id"] == doc][dim_name])[0]
            # Risposta di w2 per dimension
            v2 = list(w2_rows[w2_rows["doc_id"] == doc][dim_name])[0]
            if (v1 == v2):
                # Se sono d'accordo sul valore, incremento il numero di accordi
                n_agree += 1
        # Il percent agreement è il numero di accordi su tutti i documenti in comune
        return n_agree / total * 100

In [32]:
# Calcolo del percent-agreement (pa) su una dimensione
def get_dimension_pa(df, dimension):
    # Lista di tutti i worker che hanno risposto
    workers = list(df["worker_id"].drop_duplicates())
    percent_agreement = []
    
    for worker_1 in workers:
        # Si identifica il primo elemento della coppia
        pair_agreement = {
            "worker_x" : worker_1,
        }
        for worker_2 in workers:
            # Si calcola l'accordo percentuale con il secondo elemento della coppia
            pair_agreement[worker_2] = round(get_pair_pa(df, dimension, worker_1, worker_2), 1)
        percent_agreement.append(pair_agreement)
    
    # Si interpretano i dati in un dataframe
    pa = pd.DataFrame.from_dict(percent_agreement, orient="columns")
    pa = pa.set_index("worker_x")
    return pa


In [34]:
# Si carica il csv con le risposte dei worker
answers = pd.read_csv("./result/secondo_progetto_social_computing/Dataframe/workers_answers.csv",
                        usecols=["worker_id", "unit_id", "doc_id", "doc_truthfulness-1_value", "doc_explanation-quality_value", "doc_truthfulness-2_value", "doc_time_elapsed"])
# Si eliminano i duplicati: scegliamo la riga più recente (quella definitiva)
answers = answers.loc[answers.reset_index().groupby(["worker_id", "doc_id"])["doc_time_elapsed"].idxmax()]

# Calcoliamo il percent agreement per tutte le dimensioni categoriali
for dimension in ["truthfulness-1", "truthfulness-2"]:
    pa = get_dimension_pa(answers, dimension)
    pa.to_csv(out_folder+"/"+dimension+"_pa.csv")
    print(dimension+" percent agreement")
    display(pa)

truthfulness-1 percent agreement


Unnamed: 0_level_0,A1TEEFJDPVEK0L,A2N1GA8PJDDA6P,A2Q51AC4E6I5ZB,A2Z4OTGC834F3Y,A348JKD82WQ6Z,A3OAQZM6Q3YJQ1,A3T2NTPGB3KNDS,A3W16X5D0VGU0E,AYKZJHEV29ZHL,AYUF9OHXQK2YT
worker_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A1TEEFJDPVEK0L,100.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,50.0,0.0
A2N1GA8PJDDA6P,0.0,100.0,0.0,0.0,0.0,100.0,-1.0,0.0,33.3,0.0
A2Q51AC4E6I5ZB,0.0,0.0,100.0,100.0,0.0,0.0,100.0,0.0,0.0,0.0
A2Z4OTGC834F3Y,0.0,0.0,100.0,100.0,100.0,100.0,-1.0,100.0,0.0,50.0
A348JKD82WQ6Z,0.0,0.0,0.0,100.0,100.0,66.7,-1.0,66.7,0.0,100.0
A3OAQZM6Q3YJQ1,0.0,100.0,0.0,100.0,66.7,100.0,-1.0,33.3,100.0,100.0
A3T2NTPGB3KNDS,-1.0,-1.0,100.0,-1.0,-1.0,-1.0,100.0,-1.0,-1.0,-1.0
A3W16X5D0VGU0E,0.0,0.0,0.0,100.0,66.7,33.3,-1.0,100.0,0.0,50.0
AYKZJHEV29ZHL,50.0,33.3,0.0,0.0,0.0,100.0,-1.0,0.0,100.0,0.0
AYUF9OHXQK2YT,0.0,0.0,0.0,50.0,100.0,100.0,-1.0,50.0,0.0,100.0


truthfulness-2 percent agreement


Unnamed: 0_level_0,A1TEEFJDPVEK0L,A2N1GA8PJDDA6P,A2Q51AC4E6I5ZB,A2Z4OTGC834F3Y,A348JKD82WQ6Z,A3OAQZM6Q3YJQ1,A3T2NTPGB3KNDS,A3W16X5D0VGU0E,AYKZJHEV29ZHL,AYUF9OHXQK2YT
worker_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A1TEEFJDPVEK0L,100.0,0.0,100.0,66.7,100.0,100.0,-1.0,100.0,0.0,100.0
A2N1GA8PJDDA6P,0.0,100.0,0.0,50.0,0.0,0.0,-1.0,0.0,66.7,0.0
A2Q51AC4E6I5ZB,100.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
A2Z4OTGC834F3Y,66.7,50.0,0.0,100.0,100.0,100.0,-1.0,100.0,50.0,50.0
A348JKD82WQ6Z,100.0,0.0,0.0,100.0,100.0,66.7,-1.0,66.7,0.0,100.0
A3OAQZM6Q3YJQ1,100.0,0.0,0.0,100.0,66.7,100.0,-1.0,33.3,0.0,100.0
A3T2NTPGB3KNDS,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,100.0,-1.0,-1.0,-1.0
A3W16X5D0VGU0E,100.0,0.0,0.0,100.0,66.7,33.3,-1.0,100.0,0.0,50.0
AYKZJHEV29ZHL,0.0,66.7,0.0,50.0,0.0,0.0,-1.0,0.0,100.0,0.0
AYUF9OHXQK2YT,100.0,0.0,100.0,50.0,100.0,100.0,-1.0,50.0,0.0,100.0


In [35]:
# Un lavoratore ha quasi tutti i valori di accordo indefiniti, lo escludo da qui in poi
bad_worker = "A3T2NTPGB3KNDS"

### Percentuale media di testo annotato

In [3]:
# Conta le parole in una stringa
def count_words(string):
    return len(string.strip('.').split(" "))

In [4]:
# Unisce le tre stringhe
def bind_strings(left, center, right):
    # Rimuovo gli spazi iniziali o finali le stringhe
    left = left.strip(' ')
    center = center.strip(' ')
    right = right.strip(' ')

    # Creo gli spazi tra left-center e center-right se è necessario
    sep1 = " " if center[0].isalnum() else ""
    if right != "":
        sep2 = " " if right[0].isalnum() else ""
    else:
        sep2 = ""

    # Concateno le stringhe ed i separatori
    return (left+sep1+center+sep2+right).strip()

In [36]:
# Carico il file contenente le annotazioni
annotations = pd.read_csv("./result/secondo_progetto_social_computing/Dataframe/workers_notes.csv",
                        usecols=["worker_id", "document_index", "note_timestamp_created", "note_text_left", "note_text_current", "note_text_right"])
# Rimuovo i duplicati: considero il record più recente
annotations = annotations.loc[annotations.reset_index().groupby(["worker_id", "document_index"])["note_timestamp_created"].idxmax()]
# Rimpiazzo eventuali valori indefiniti con la stringa vuota
annotations[["note_text_left", "note_text_current", "note_text_right"]] = annotations[["note_text_left", "note_text_current", "note_text_right"]].replace(np.nan, "")
# Rimuovo il lavoratore di infima qualità
annotations = annotations.loc[annotations["worker_id"] != bad_worker]
avg_annotations_len = {}

for index, row in annotations.iterrows():
    # Compongo l'intera spiegazione
    explanation = bind_strings(row["note_text_left"], row["note_text_current"], row["note_text_right"])
    # Conto le parole evidenziate e quelle totali
    ann_len = count_words(row["note_text_current"])
    exp_len = count_words(explanation)

    # Calcolo la percentuale di parole annotate
    ann_perc = ann_len / exp_len * 100

    if explanation in avg_annotations_len:
        # Se la spiegazione è nel vocabolario, aggiungo il valore percentuale di questa annotazione...
        avg_annotations_len[explanation].append(ann_perc)
    else:
        # ... altrimenti creo l'entry per la spiegazione con il valore che ho calcolato
        avg_annotations_len[explanation] = [ann_perc]

# Calcolo la media delle liste di valori di ogni annotazione
for annotation in avg_annotations_len:
    percs = avg_annotations_len[annotation]
    avg_annotations_len[annotation] = sum(percs) / len(percs)

# Ristrutturo tutto in un dataframe ed ordino in ordine decrescente in base al valore percentuale
avg_annotations_len = pd.DataFrame.from_dict(avg_annotations_len, orient="index", columns=["annotation_percentage"])
avg_annotations_len.sort_values(by=["annotation_percentage"], inplace=True, ascending=False)
avg_annotations_len.to_csv(out_folder+"/avarage_annotations_length.csv")
display(avg_annotations_len)

Unnamed: 0,annotation_percentage
Winter's Tale was released in 2014.,88.888889
Hush (2016 film) was produced by Trevor Macy and Jason Blum.,59.090909
"Anne Rice was born in New Orleans, Louisiana, which is in the United States of America.",51.5625
"The claim is that Winter's Tale was released in 1987. The evidence states that Winter's Tale is a 1983 novel by Mark Helprin. This is a novel, so it wasn't released in 1987. Therefore, the claim is false.",32.017544
"The claim is that Anne Rice was born in the United States of America. The evidence states that she was born in New Orleans and that New Orleans is a major United States port. Therefore, the claim is true.",30.769231
"The evidence says that the film was produced by Trevor Macy and Jason Blum. The claim says that the film was produced by Jason Blum. So, the answer must be false because the film was produced by Trevor Macy, not just Jason Blum.",30.232558


### Numero di volte in cui una spiegazione è stata aggiornata

In [37]:
# Carico il file csv di annotazioni fatte dai worker
annotations = pd.read_csv("./result/secondo_progetto_social_computing/Dataframe/workers_notes.csv",
                        usecols=["worker_id", "document_index", "note_timestamp_created", "note_text_left", "note_text_current", "note_text_right"])
# Sostituisco i valori non definiti con la stringa vuota
annotations[["note_text_left", "note_text_current", "note_text_right"]] = annotations[["note_text_left", "note_text_current", "note_text_right"]].replace(np.nan, "")
# Rimuovo il lavoratore bandito
annotations = annotations.loc[annotations["worker_id"] != bad_worker]

# Creo una lista in modo che ad ogni worker venga associata la sua spiegazione
explanations = []
for index, row in annotations.iterrows():
    explanation = bind_strings(row["note_text_left"], row["note_text_current"], row["note_text_right"])
    explanations.append(explanation)
annotations["explanation"] = explanations

# Calcolo quante volte è stata aggiunta/modificata un'annotazione
annotations = annotations.groupby(by = ["worker_id", "explanation"]).size().to_frame("n_updates")

# Considero solo le annotazioni modificate
updated_annotations = annotations[annotations["n_updates"] > 1]
# Decrementando di uno il conteggio di aggiunta/modifica si ottiene il conteggio delle modifiche
updated_annotations["n_updates"] = updated_annotations["n_updates"] - 1
# Sommo per ogni spiegazione, quante volte è stata modificata l'annotazione corrispondente
updated_annotations = updated_annotations.groupby(["explanation"]).sum()

# Ordino in ordine decrescente rispetto al numero di aggiornamenti
updated_annotations.sort_values(by=["n_updates"], inplace=True, ascending=False)
updated_annotations.to_csv(out_folder+"/number_annotation_updates.csv")
display(updated_annotations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_annotations["n_updates"] = updated_annotations["n_updates"] - 1


Unnamed: 0_level_0,n_updates
explanation,Unnamed: 1_level_1
"The claim is that Winter's Tale was released in 1987. The evidence states that Winter's Tale is a 1983 novel by Mark Helprin. This is a novel, so it wasn't released in 1987. Therefore, the claim is false.",2
"The evidence says that the film was produced by Trevor Macy and Jason Blum. The claim says that the film was produced by Jason Blum. So, the answer must be false because the film was produced by Trevor Macy, not just Jason Blum.",2
"The claim is that Anne Rice was born in the United States of America. The evidence states that she was born in New Orleans and that New Orleans is a major United States port. Therefore, the claim is true.",1


### Tempo medio impiegato dai worker per valutare ciascun elemento

In [40]:
# Carico il csv con le risposte dei worker
answers = pd.read_csv("./result/secondo_progetto_social_computing/Dataframe/workers_answers.csv",
                        usecols=["worker_id", "doc_id", "doc_time_elapsed"])
# Rimuovo le ripetizioni considerando solo le entries più recenti
answers = answers.loc[answers.reset_index().groupby(["worker_id", "doc_id"])["doc_time_elapsed"].idxmax()]
# Rimuovo il worker bandito
answers = answers.loc[answers["worker_id"] != bad_worker]

# Faccio la media della durata per ogni documento
times = answers.groupby("doc_id").mean()
# Ordino in ordine decrescente sulla base del tempo trascorso a valutare un documento
times.sort_values(by=["doc_time_elapsed"], inplace=True, ascending=False)
times.to_csv(out_folder+"/avarage_time.csv")
display(times)

  times = answers.groupby("doc_id").mean()


Unnamed: 0_level_0,doc_time_elapsed
doc_id,Unnamed: 1_level_1
N_166632,291.448333
N_51526,162.015
N_77465,156.171667
G_77465,146.495
G_166632,131.1675
G_51526,124.36
