# Progetto di Social Computing

a.a. 2022-2023

## Attività preliminari

### Librerie e costanti

In [1]:
# Caricamento delle librerie
import os, json, random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Folders used when saving files
data_folder = "./data"
out_folder = "./out"

### Funzioni

In [3]:
# Save data to a local JSON file
def serialize_json(folder, filename, data):
    """Write ``data`` as indented JSON to ``{folder}/{filename}``.

    Args:
        folder: Destination directory; it is created (with parents) when missing.
        filename: Name of the file to write inside ``folder``.
        data: Any object accepted by ``json.dump``.

    Returns:
        None. A confirmation line with the written path is printed.
    """
    # exist_ok=True already tolerates an existing folder, so a separate
    # existence check beforehand is unnecessary
    os.makedirs(folder, exist_ok=True)

    path = f"{folder}/{filename}"
    # The with-statement closes the file on exit; no explicit close() is needed.
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data serialized to path: {path}")

In [4]:
# Read data from a local JSON file
def read_json(path):
    """Load and return the JSON content stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON data, or an empty dict when ``path`` does not exist.
        In both cases a status line is printed.
    """
    # Guard clause: a missing path yields an empty dict instead of an error
    if not os.path.exists(path):
        print(f"No data found at path: {path}")
        return {}

    with open(path, "r", encoding="utf-8") as handle:
        content = json.load(handle)
    print(f"Data read from path: {path}")
    return content

## Caricamento del dataset

Si carica il dataset fornito, frammento della combinazione dei dataset [FEVER](https://fever.ai/dataset/fever.html) e [e-FEVER](https://doi.org/10.3929/ethz-b-000453826).

In [32]:
# Load the provided dataset and display it
df = pd.read_csv("./group_9.csv")
display(df)

Unnamed: 0,id,statement,explanation_human,explanation_model,label
0,51526,Hush (2016 film) was produced by Jason Blum.,Hush (2016 film) was produced by Trevor Macy a...,The evidence says that the film was produced b...,REFUTES
1,77465,Winter's Tale was released in 1987.,Winter's Tale was released in 2014.,The claim is that Winter's Tale was released i...,REFUTES
2,166632,Anne Rice was born in the United States of Ame...,"Anne Rice was born in New Orleans, Louisiana, ...",The claim is that Anne Rice was born in the Un...,SUPPORTS


## Creazione degli HITs

Con il dataset fornito, vogliamo creare dodici HITs, ciascuno con le seguenti caratteristiche:

1. contiene 3 elementi;
2. ogni elemento è dotato di 4 attributi:
   1. `id`: identificatore dello statement;
   2. `statement`: testo dello statement;
   3. `explanation`: testo della spiegazione;
   4. `label`: etichetta della spiegazione.
3. per due elementi su tre vale che `explanation` = `explanation_model`;
4. per un elemento su tre vale che `explanation` = `explanation_human`;
5. la posizione dei tre elementi deve essere casuale.

In [14]:
# Convert one dataset row into a HIT element
def parse_row(row, isGold = False):
    """Build the dictionary describing a single HIT element.

    Args:
        row: Object exposing ``id``, ``statement``, ``explanation_model``,
            ``explanation_human`` and ``label`` attributes.
        isGold: When truthy, the element is a "gold question" and carries the
            human-written explanation instead of the model-generated one.

    Returns:
        A dict with the keys ``id``, ``statement``, ``explanation``, ``label``
        and ``isGold``.
    """
    model_text = row.explanation_model
    # Gold questions use the explanation written by a human
    chosen_text = row.explanation_human if isGold else model_text

    return {
        "id" : row.id,
        "statement" : row.statement,
        "explanation" : chosen_text,
        "label" : row.label,
        "isGold" : isGold
    }

In [31]:
# Build every 3-element combination required by the assignment
all_HITs = []

# Outer loop: the row whose model explanation appears in each HIT of this round
for base_row in df.itertuples():
    # Inner loop: every row it is paired with
    for paired_row in df.itertuples():
        # A row is never paired with itself
        if paired_row.id == base_row.id:
            continue

        # Variant whose gold question (human explanation) comes from the base row
        gold_on_base = [parse_row(base_row), parse_row(base_row, True), parse_row(paired_row)]
        # Variant whose gold question comes from the paired row
        gold_on_paired = [parse_row(base_row), parse_row(paired_row, True), parse_row(paired_row)]

        for candidate_HIT in (gold_on_base, gold_on_paired):
            # Pseudo-randomly reorder the three elements, then store the HIT
            random.shuffle(candidate_HIT)
            all_HITs.append(candidate_HIT)

# Pseudo-randomly reorder the HITs themselves
random.shuffle(all_HITs)

# Persist the generated list of HITs
serialize_json(data_folder, "all_HITs.json", all_HITs)

Data serialized to path: ./data/all_HITs.json


### Adattamento degli HITs per Crowd_Frame

Ristrutturiamo gli HITs generati precedentemente in modo da renderli leggibili al software [Crowd_Frame](https://github.com/Miccighel/Crowd_Frame).

In [5]:
def generate_random_string(n_chars):
    """Return a string of ``n_chars`` pseudo-random ASCII letters.

    Each character is drawn as a lowercase letter and then, with a second
    draw, either kept lowercase or switched to uppercase.

    Args:
        n_chars: Number of characters to generate.

    Returns:
        The generated string (empty when ``n_chars`` produces an empty range).
    """
    case_offset = ord("a") - ord("A")  # 32: distance between lower and upper case codes
    letters = []
    for _ in range(n_chars):
        # First draw: code point of a lowercase letter
        code_point = random.randint(ord("a"), ord("z"))
        # Second draw: decide whether to switch it to uppercase
        make_upper = random.randint(0, 1) == 1
        if make_upper:
            code_point -= case_offset
        letters.append(chr(code_point))
    return "".join(letters)

In [9]:
# Load the previously generated HITs
raw_HITs = read_json(data_folder+"/all_HITs.json")

HITs = []
documents_number = 3

# enumerate() supplies the progressive unit index, so no manually incremented
# counter named `id` is needed (that name would shadow the built-in id())
for unit_index, raw_HIT in enumerate(raw_HITs):
    # Reformat the documents; gold questions get the "G_" prefix, the others "N_"
    documents = [
        {
            "id" : ("G_" if element["isGold"] else "N_") + str(element["id"]),
            "statement" : element["statement"],
            "label" : element["label"],
            "explanation" : element["explanation"]
        }
        for element in raw_HIT
    ]

    # Build the final HIT with its tokens and attach the reformatted documents
    HITs.append({
        "unit_id" : "unit_"+str(unit_index),
        "token_input" : generate_random_string(10),
        "token_output" : generate_random_string(10),
        "documents_number" : documents_number,
        "documents" : documents
    })

# Export everything as a JSON file
serialize_json(out_folder, "hits.json", HITs)

Data read from path: ./data/all_HITs.json
Data serialized to path: ./out/hits.json


## Analisi dei risultati

### Percent agreement per le variabili categoriali

In [26]:
def get_pair_pa(df, dimension, w1, w2):
    """Compute the percent agreement between two workers on one dimension.

    Args:
        df: DataFrame with ``worker_id`` and ``doc_id`` columns plus the
            column ``doc_<dimension>_value``.
        dimension: Name of the dimension, used to build the value column name.
        w1: Identifier of the first worker.
        w2: Identifier of the second worker.

    Returns:
        The percentage (0-100) of documents answered by both workers on which
        their values are equal, or ``-1`` when they share no document.
    """
    value_column = "doc_"+dimension+"_value"
    first_rows = df.loc[df["worker_id"] == w1]
    second_rows = df.loc[df["worker_id"] == w2]

    # Documents of the first worker that the second worker also answered
    second_docs = list(second_rows["doc_id"])
    shared_docs = [doc for doc in first_rows["doc_id"] if doc in second_docs]

    # -1 marks a pair of workers with no document in common
    if not shared_docs:
        return -1

    def first_answer(rows, doc):
        # Value found in the first row the worker has for this document
        return rows.loc[rows["doc_id"] == doc, value_column].iloc[0]

    matches = sum(
        1
        for doc in shared_docs
        if first_answer(first_rows, doc) == first_answer(second_rows, doc)
    )
    return matches / len(shared_docs) * 100

In [10]:
def get_dimension_pa(df, dimension):
    """Build the worker-by-worker percent agreement matrix for one dimension.

    Args:
        df: DataFrame of answers containing a ``worker_id`` column, passed
            through to ``get_pair_pa``.
        dimension: Name of the dimension to evaluate.

    Returns:
        A DataFrame indexed by ``worker_x`` with one column per worker, where
        each cell is the value returned by ``get_pair_pa`` for that pair.
    """
    worker_ids = df["worker_id"].drop_duplicates().tolist()

    # One record per row worker: its identifier plus the agreement with every worker
    records = [
        {
            "worker_x" : row_worker,
            **{
                col_worker: get_pair_pa(df, dimension, row_worker, col_worker)
                for col_worker in worker_ids
            },
        }
        for row_worker in worker_ids
    ]

    matrix = pd.DataFrame.from_dict(records, orient="columns")
    return matrix.set_index("worker_x")


In [27]:
# Read only the needed columns of the workers' answers
answers_path = "./result/secondo_progetto_social_computing/Dataframe/workers_answers.csv"
answers_columns = [
    "worker_id",
    "unit_id",
    "doc_id",
    "doc_truthfulness-1_value",
    "doc_explanation-quality_value",
    "doc_truthfulness-2_value",
    "doc_time_elapsed",
]
answers = pd.read_csv(answers_path, usecols=answers_columns)

# For every (worker, document) pair keep only the row with the largest doc_time_elapsed
longest_rows = answers.reset_index().groupby(["worker_id", "doc_id"])["doc_time_elapsed"].idxmax()
answers = answers.loc[longest_rows]

# Compute, save and show the percent agreement matrix of each dimension
for dimension in ("truthfulness-1", "truthfulness-2"):
    pa = get_dimension_pa(answers, dimension)
    pa.to_csv(f"{out_folder}/{dimension}_pa.csv")
    print(f"{dimension} percent agreement")
    display(pa)

truthfulness-1 percent agreement


Unnamed: 0_level_0,A1TEEFJDPVEK0L,A2N1GA8PJDDA6P,A2Q51AC4E6I5ZB,A2Z4OTGC834F3Y,A348JKD82WQ6Z,A3OAQZM6Q3YJQ1,A3T2NTPGB3KNDS,A3W16X5D0VGU0E,AYKZJHEV29ZHL,AYUF9OHXQK2YT
worker_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A1TEEFJDPVEK0L,100.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,50.0,0.0
A2N1GA8PJDDA6P,0.0,100.0,0.0,0.0,0.0,100.0,-1.0,0.0,33.333333,0.0
A2Q51AC4E6I5ZB,0.0,0.0,100.0,100.0,0.0,0.0,100.0,0.0,0.0,0.0
A2Z4OTGC834F3Y,0.0,0.0,100.0,100.0,100.0,100.0,-1.0,100.0,0.0,50.0
A348JKD82WQ6Z,0.0,0.0,0.0,100.0,100.0,66.666667,-1.0,66.666667,0.0,100.0
A3OAQZM6Q3YJQ1,0.0,100.0,0.0,100.0,66.666667,100.0,-1.0,33.333333,100.0,100.0
A3T2NTPGB3KNDS,-1.0,-1.0,100.0,-1.0,-1.0,-1.0,100.0,-1.0,-1.0,-1.0
A3W16X5D0VGU0E,0.0,0.0,0.0,100.0,66.666667,33.333333,-1.0,100.0,0.0,50.0
AYKZJHEV29ZHL,50.0,33.333333,0.0,0.0,0.0,100.0,-1.0,0.0,100.0,0.0
AYUF9OHXQK2YT,0.0,0.0,0.0,50.0,100.0,100.0,-1.0,50.0,0.0,100.0


truthfulness-2 percent agreement


Unnamed: 0_level_0,A1TEEFJDPVEK0L,A2N1GA8PJDDA6P,A2Q51AC4E6I5ZB,A2Z4OTGC834F3Y,A348JKD82WQ6Z,A3OAQZM6Q3YJQ1,A3T2NTPGB3KNDS,A3W16X5D0VGU0E,AYKZJHEV29ZHL,AYUF9OHXQK2YT
worker_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A1TEEFJDPVEK0L,100.0,0.0,100.0,66.666667,100.0,100.0,-1.0,100.0,0.0,100.0
A2N1GA8PJDDA6P,0.0,100.0,0.0,50.0,0.0,0.0,-1.0,0.0,66.666667,0.0
A2Q51AC4E6I5ZB,100.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
A2Z4OTGC834F3Y,66.666667,50.0,0.0,100.0,100.0,100.0,-1.0,100.0,50.0,50.0
A348JKD82WQ6Z,100.0,0.0,0.0,100.0,100.0,66.666667,-1.0,66.666667,0.0,100.0
A3OAQZM6Q3YJQ1,100.0,0.0,0.0,100.0,66.666667,100.0,-1.0,33.333333,0.0,100.0
A3T2NTPGB3KNDS,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,100.0,-1.0,-1.0,-1.0
A3W16X5D0VGU0E,100.0,0.0,0.0,100.0,66.666667,33.333333,-1.0,100.0,0.0,50.0
AYKZJHEV29ZHL,0.0,66.666667,0.0,50.0,0.0,0.0,-1.0,0.0,100.0,0.0
AYUF9OHXQK2YT,100.0,0.0,100.0,50.0,100.0,100.0,-1.0,50.0,0.0,100.0


### Percentuale di testo annotato