In [1]:
import pickle
import pandas as pd
import re

# function to read data from file
def read_tagging(file_name,language):
    path="../data/"+language+"/tagging/"+file_name+".conllu"
    data = pd.read_csv (path, sep = '\t',quoting=3, names=["POSITION","WORD","TAG"])
    return data


def extract_sentences_from_dataframe(df):
    sentences = ''
    for index, row in df.iterrows():
        word = row['WORD']
        if pd.notnull(word):  # Se la parola non è nulla
            if sentences:  # Se c'è già una frase, aggiungi uno spazio prima della nuova parola
                sentences += ' '
            sentences += word
    return sentences

#deserialize data from a file
def load_data(file_name,language):
    path="../data/"+language+"/"+file_name
    try: 
        file = open(path, 'rb') 
        data = pickle.load(file) 
        return data
    except: 
        print("Error in reading data")



In [2]:
import pandas as pd

def extract_entities_from_dataframe(dataframe):
    entity_spans = []

    current_entity_span = None
    current_sentence_index = 0

    for index, row in dataframe.iterrows():
        word = row['WORD']
        tag = row['TAG']
        position = row['POSITION']

        if position == 0:  # Inizio di una nuova frase
            current_sentence_index += 1

        if tag != 'O':
            if tag.startswith('B-'):
                # Se inizia una nuova entità, chiudi quella corrente e inizia una nuova
                if current_entity_span is not None:
                    entity_spans.append(current_entity_span)
                current_entity_span = {'Tag': tag[2:], 'Sentence Number': current_sentence_index}
                current_entity_span['Start Index'] = index
                current_entity_span['End Index'] = index
            elif tag.startswith('I-'):
                # Aggiungi la parola all'entità corrente
                if current_entity_span is not None:
                    current_entity_span['End Index'] = index
            else:
                print("Errore: Tag non riconosciuto.")

        else:
            # Se il tag è "O" ma siamo all'interno di una serie di tag non "O", chiudi l'entità corrente
            if current_entity_span is not None:
                entity_spans.append(current_entity_span)
                current_entity_span = None

    # Aggiungi l'ultima entità se presente
    if current_entity_span is not None:
        entity_spans.append(current_entity_span)

    # Creazione del dataframe di output
    output_data = {'Tag': [], 'Sentence Number': [], 'Start Index': [], 'End Index': []}
    for entity_span in entity_spans:
        output_data['Tag'].append(entity_span['Tag'])
        output_data['Sentence Number'].append(entity_span['Sentence Number'])
        output_data['Start Index'].append(entity_span['Start Index'])
        output_data['End Index'].append(entity_span['End Index'])

    output_df = pd.DataFrame(output_data)

    return output_df

In [3]:
import pandas as pd

def calculate_accuracy(system_df, golden_df):
    # Uniamo i due dataframe per confrontare i tag
    merged_df = pd.merge(system_df, golden_df, left_index=True, right_index=True, suffixes=('_system', '_golden'))
    
    # Contiamo quante volte i tag corrispondenti sono uguali
    correct_tags = (merged_df['TAG_system'] == merged_df['TAG_golden']).sum()
    
    # Calcoliamo l'accuratezza
    accuracy = correct_tags / len(system_df)
    
    # Converti in percentuale e arrotonda alla prima cifra decimale
    accuracy_percent = round(accuracy * 100, 1)
    
    return accuracy_percent



In [4]:
import pandas as pd

def calculate_precision_recall(predicted_df, golden_df):
    # Unione delle entità predette e delle entità del sistema dorato
    merged_df = pd.merge(predicted_df, golden_df, how='outer', indicator=True)

    # Calcolo dei true positives (TP), false positives (FP) e false negatives (FN)
    TP = merged_df[(merged_df['_merge'] == 'both')].shape[0]
    FP = merged_df[(merged_df['_merge'] == 'right_only')].shape[0]
    FN = merged_df[(merged_df['_merge'] == 'left_only')].shape[0]

    # Calcolo della precisione e del recall
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    print("TP:",TP,"FP:",FP,"FN:",FN)
    # Converti in percentuali e arrotonda alla prima cifra decimale
    precision_percent = round(precision * 100, 1)
    recall_percent = round(recall * 100, 1)

    return precision_percent, recall_percent



In [5]:
import pandas as pd
from datetime import datetime

def evaluate_and_save_results(languages, output_file, dataset_size, smoothing_type):

    # Scrivi i risultati su file
   
    with open(output_file, "a") as file:
        file.write("##################################\n")
        file.write(f"Data: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        file.write(f"Dimensione del dataset: {dataset_size}\n")
        file.write(f"Tipo di Smoothing: {'Smoothing Type ' + str(smoothing_type)}\n\n")
        file.write("Statistiche aggiuntive:\n")
        file.write("Language ||| Acc Naive | Precision Naive | Recall Naive ||| Acc Viterbi| Precision Viterbi| Recall Viterbi\n")
        
        for language in languages:
            vit_df= read_tagging("sent_viterbi_tag", language)
            naive_df = read_tagging("naive_tag", language)
            golden_df = read_tagging("golden_tag", language)
            
            gold_quadruples_df = extract_entities_from_dataframe(golden_df)
            naive_quadruples_df = extract_entities_from_dataframe(naive_df)
            vit_quadruples_df = extract_entities_from_dataframe(vit_df)
            

         
            acc_vit= calculate_accuracy(vit_df, golden_df)
            acc_naive = calculate_accuracy(naive_df, golden_df)
            precision_vit, recall_vit = calculate_precision_recall(vit_quadruples_df, gold_quadruples_df)
            precision_n, recall_n = calculate_precision_recall(naive_quadruples_df, gold_quadruples_df)

            file.write(f"{language} ||| {format(acc_naive, '.1f')} % | {format(precision_n, '.1f')} % | {format(recall_n, '.1f')} % ||| {format(acc_vit, '.1f')} % | {format(precision_vit, '.1f')} % | {format(recall_vit, '.1f')} % |\n")
        
        file.write("##################################\n \n")

# Esempio di utilizzo
languages = ["en", "it", "es"]
output_file = "../evaluation_results.txt"
dataset_size = 100
smoothing_type = 4
evaluate_and_save_results(languages, output_file, dataset_size, smoothing_type)


TP: 3 FP: 736 FN: 846
TP: 389 FP: 350 FN: 562
TP: 7 FP: 685 FN: 528
TP: 417 FP: 275 FN: 514
TP: 4 FP: 668 FN: 358
TP: 404 FP: 268 FN: 486
