In [1]:
import pickle
import pandas as pd
import re

# function to read data from file
def read_tagging(file_name,language):
    path="../data/"+language+"/tagging/"+file_name+".conllu"
    data = pd.read_csv (path, sep = '\t',quoting=3, names=["POSITION","WORD","TAG"])
    return data


def extract_sentences_from_dataframe(df):
    sentences = ''
    for index, row in df.iterrows():
        word = row['WORD']
        if pd.notnull(word):  # Se la parola non è nulla
            if sentences:  # Se c'è già una frase, aggiungi uno spazio prima della nuova parola
                sentences += ' '
            sentences += word
    return sentences

#deserialize data from a file
def load_data(file_name,language):
    path="../data/"+language+"/"+file_name
    try: 
        file = open(path, 'rb') 
        data = pickle.load(file) 
        return data
    except: 
        print("Error in reading data")



In [2]:
import pandas as pd

def extract_entities_from_dataframe(df):
    entities = []
    current_entity = None
    current_sentence = None
    start_index = None

    for index, row in df.iterrows():
        word = row['WORD']
        tag = row['TAG']
        position = index  # Assumendo che l'indice dell'iterazione rappresenti la posizione nella frase

        if current_sentence != position:
            if current_entity is not None:
                entities.append((current_entity, f"sent-{current_sentence}", start_index, index - 1))
                current_entity = None
            current_sentence = position
            start_index = index

        if tag != "O":
            entity_type = tag.split("-")[1]  # Estrai il tipo di entità dal tag
            if current_entity is None:
                current_entity = entity_type
            elif current_entity != entity_type:
                entities.append((current_entity, f"sent-{current_sentence}", start_index, index - 1))
                current_entity = entity_type
                start_index = index

    if current_entity is not None:
        entities.append((current_entity, f"sent-{current_sentence}", start_index, index))

    return pd.DataFrame(entities, columns=['ENTITA', 'FRASE', 'INIZIO', 'FINE'])




In [3]:
import pandas as pd

def calculate_accuracy(system_df, golden_df):
    # Uniamo i due dataframe per confrontare i tag
    merged_df = pd.merge(system_df, golden_df, left_index=True, right_index=True, suffixes=('_system', '_golden'))
    
    # Contiamo quante volte i tag corrispondenti sono uguali
    correct_tags = (merged_df['TAG_system'] == merged_df['TAG_golden']).sum()
    
    # Calcoliamo l'accuratezza
    accuracy = correct_tags / len(system_df)
    
    # Converti in percentuale e arrotonda alla prima cifra decimale
    accuracy_percent = round(accuracy * 100, 1)
    
    return accuracy_percent



In [4]:
import pandas as pd

def calculate_precision_recall(predicted_df, golden_df):
    # Unione delle entità predette e delle entità del sistema dorato
    merged_df = pd.merge(predicted_df, golden_df, how='outer', indicator=True)

    # Calcolo dei true positives (TP), false positives (FP) e false negatives (FN)
    TP = merged_df[(merged_df['_merge'] == 'both')].shape[0]
    FP = merged_df[(merged_df['_merge'] == 'right_only')].shape[0]
    FN = merged_df[(merged_df['_merge'] == 'left_only')].shape[0]

    # Calcolo della precisione e del recall
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    # Converti in percentuali e arrotonda alla prima cifra decimale
    precision_percent = round(precision * 100, 1)
    recall_percent = round(recall * 100, 1)

    return precision_percent, recall_percent



In [5]:
import pandas as pd
from datetime import datetime

def evaluate_and_save_results(languages, output_file, dataset_size, smoothing_type):

    # Scrivi i risultati su file
   
    with open(output_file, "a") as file:
        file.write("##################################\n")
        file.write(f"Data: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        file.write(f"Dimensione del dataset: {dataset_size}\n")
        file.write(f"Tipo di Smoothing: {'Smoothing Type ' + str(smoothing_type)}\n\n")
        file.write("Statistiche aggiuntive:\n")
        file.write("Language | Acc Viterbi | Precision Viterbi | Recall Viterbi | Acc Nayve | Precision Nayve | Recall Nayve | Acc Viterbi Sent | Precision Viterbi Sent | Recall Viterbi Sent\n")
        
        for language in languages:
            tot_vit_df = read_tagging("tot_viterbi_tag", language)
            sent_vit_df= read_tagging("sent_viterbi_tag", language)
            nayve_df = read_tagging("nayve_tag", language)
            golden_df = read_tagging("golden_tag", language)

            nay_quadruples_df = extract_entities_from_dataframe(nayve_df)
            tot_vit_quadruples_df = extract_entities_from_dataframe(tot_vit_df)
            sent_vit_quadruples_df = extract_entities_from_dataframe(sent_vit_df)
            gol_quadruples_df = extract_entities_from_dataframe(golden_df)

            acc_vit = calculate_accuracy(tot_vit_quadruples_df, golden_df)
            acc_vit_sent = calculate_accuracy(sent_vit_quadruples_df, golden_df)
            acc_nayve = calculate_accuracy(nayve_df, golden_df)
            precision_v, recall_v = calculate_precision_recall(tot_vit_quadruples_df, gol_quadruples_df)
            precision_v_sent, recall_v_sent = calculate_precision_recall(sent_vit_quadruples_df, gol_quadruples_df)
            precision_n, recall_n = calculate_precision_recall(nay_quadruples_df, gol_quadruples_df)

            file.write(f"{language} | {format(acc_vit, '.1f')} % | {format(precision_v, '.1f')} % | {format(recall_v, '.1f')} % | {format(acc_nayve, '.1f')} % | {format(precision_n, '.1f')} % | {format(recall_n, '.1f')} %| {format(acc_vit_sent, '.1f')} % || {format(precision_v_sent, '.1f')} % || {format(recall_v_sent, '.1f')} % |\n")
        
        file.write("##################################\n \n")

# Esempio di utilizzo
languages = ["en", "it", "es"]
output_file = "../evaluation_results.txt"
dataset_size = 100
smoothing_type = 1
evaluate_and_save_results(languages, output_file, dataset_size, smoothing_type)


KeyError: 'TAG_system'