In [1]:
import pickle
import pandas as pd
import re

def read_tagging(file_name,language):
    """Read a tagger output file into a DataFrame.

    The file is looked up at ``../data/<language>/tagging/<file_name>.conllu``
    and parsed as tab-separated text without a header row.

    Args:
        file_name: Base name of the file, without the ``.conllu`` extension.
        language: Name of the language sub-directory under ``../data``.

    Returns:
        A DataFrame with the columns ``POSITION``, ``WORD`` and ``TAG``.
    """
    column_names = ["POSITION", "WORD", "TAG"]
    source_path = "/".join(("../data", language, "tagging", file_name)) + ".conllu"
    # quoting=3 is csv.QUOTE_NONE: quote characters inside tokens are kept literally.
    return pd.read_csv(source_path, sep='\t', quoting=3, names=column_names)


def read_dataset_memm(file_name,language):
    """Build the path of a dataset file; the file itself is not opened.

    Args:
        file_name: Base name of the file, without the ``.conllu`` extension.
        language: Name of the language sub-directory under ``../data``.

    Returns:
        The string ``../data/<language>/dataset/<file_name>.conllu``.
    """
    return "/".join(("../data", language, "dataset", file_name)) + ".conllu"



In [2]:
def extract_entities(sentence):
    """Extract named entities from a BIO-tagged sentence.

    An entity starts at a ``B-<type>`` tag and is extended by the ``I-<type>``
    tags that immediately follow it *with the same type*. Any other tag closes
    the entity that is currently open. An ``I-`` tag that does not continue an
    open entity of the same type never starts an entity on its own.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs.

    Returns:
        A list of dicts with the keys ``type`` (tag without the ``B-`` prefix),
        ``start`` and ``end`` (inclusive token indices within ``sentence``) and
        ``words`` (the tokens of the entity), in order of appearance.
    """
    entities = []
    current_entity = None
    for idx, (word, tag) in enumerate(sentence):
        if tag.startswith('B-'):
            # A new entity begins: close the one that was open, if any.
            if current_entity is not None:
                entities.append(current_entity)
            current_entity = {'type': tag[2:], 'start': idx, 'end': idx, 'words': [word]}
        elif (tag.startswith('I-')
              and current_entity is not None
              and tag[2:] == current_entity['type']):
            current_entity['end'] = idx
            current_entity['words'].append(word)
        else:
            # Outside tag, orphan I- tag, or I- tag of a different type.
            # The type check matters: without it "B-PER I-LOC" would be
            # reported as a two-token PER entity.
            if current_entity is not None:
                entities.append(current_entity)
                current_entity = None
    if current_entity is not None:
        entities.append(current_entity)
    return entities

def split_into_sentences(text):
    """Split text into sentences.

    A boundary is a run of one or more spaces that directly follows ``.``,
    ``!`` or ``?``; the punctuation stays attached to the preceding sentence.

    Args:
        text: The string to split.

    Returns:
        The list of sentence strings.
    """
    boundary = re.compile(r'(?<=[.!?]) +')
    return boundary.split(text)

def create_quadruples(sentence, entities, sentence_idx):
    """Turn the entities of one sentence into flat tuples.

    Args:
        sentence: The sentence the entities come from (not used here).
        entities: Dicts with the keys ``type``, ``start``, ``end`` and ``words``.
        sentence_idx: Value used to build the ``sent-<sentence_idx>`` label.

    Returns:
        One ``(type, "sent-<sentence_idx>", start, end, text)`` tuple per
        entity, where ``text`` is the entity words joined by single spaces.
    """
    return [
        (
            item['type'],
            f"sent-{sentence_idx}",
            item['start'],
            item['end'],
            ' '.join(item['words']),
        )
        for item in entities
    ]

def process_dataframe(df):
    """Convert a (POSITION - WORD - TAG) DataFrame into a DataFrame of entities.

    Tokens are grouped into sentences: a sentence ends at a token equal to
    ``.``, ``!`` or ``?`` (the terminator belongs to the sentence it closes).
    Entities are extracted from each sentence with ``extract_entities`` and
    flattened with ``create_quadruples``.

    Sentences are numbered 1, 2, 3, ... in order of appearance. The number
    depends only on the token stream, so two taggings of the same text give
    every sentence the same ``sent-N`` label regardless of how many entities
    each tagging found.

    Args:
        df: DataFrame with at least the columns ``WORD`` and ``TAG``.

    Returns:
        A DataFrame with the columns ``TAG``, ``Contesto``, ``Indice_inizio``,
        ``Indice_fine`` and ``Entità``, one row per entity.
    """
    quadruples = []
    current_sentence = []
    sentence_idx = 1

    for raw_word, tag in zip(df['WORD'], df['TAG']):
        # Non-string words (e.g. values the CSV parser turned into float NaN)
        # are stringified so that the entity text can be joined later.
        word = raw_word if isinstance(raw_word, str) else str(raw_word)
        current_sentence.append((word, tag))

        if word in ('.', '!', '?'):
            entities = extract_entities(current_sentence)
            quadruples.extend(create_quadruples(current_sentence, entities, sentence_idx))
            sentence_idx += 1
            current_sentence = []

    # Tokens after the last terminator form a final sentence of their own;
    # without this flush their entities would be lost.
    if current_sentence:
        entities = extract_entities(current_sentence)
        quadruples.extend(create_quadruples(current_sentence, entities, sentence_idx))

    quadruples_df = pd.DataFrame(quadruples, columns=['TAG', 'Contesto', 'Indice_inizio', 'Indice_fine', 'Entità'])
    return quadruples_df




In [3]:
def calculate_accuracy(system_df, golden_df):
    """Compute the per-token tag accuracy of a tagger against the gold standard.

    Both arguments are (POSITION - WORD - TAG) DataFrames. Rows are paired by
    index label; only pairs present in both frames can count as correct.

    Args:
        system_df: Tagger output; must have a ``TAG`` column.
        golden_df: Gold-standard tagging; must have a ``TAG`` column.

    Returns:
        The number of index-aligned rows whose tags are equal divided by the
        number of rows of ``system_df``, as a float. Returns ``0.0`` when
        ``system_df`` is empty.
    """
    total = len(system_df)
    if total == 0:
        # Nothing was tagged: avoid dividing by zero.
        return 0.0

    # Only the TAG columns are needed for the comparison; the inner join on
    # the index keeps the rows that exist in both frames.
    merged_df = pd.merge(
        system_df[['TAG']],
        golden_df[['TAG']],
        left_index=True,
        right_index=True,
        suffixes=('_system', '_golden'),
    )

    correct_tags = (merged_df['TAG_system'] == merged_df['TAG_golden']).sum()

    return float(correct_tags / total)


In [4]:
def calculate_precision_recall(predicted_df, golden_df):
    """Compute entity-level precision and recall with exact matching.

    An entity is identified by all five of its fields (``TAG``, ``Contesto``,
    ``Indice_inizio``, ``Indice_fine``, ``Entità``). A predicted entity is a
    true positive only if an identical entity exists in the gold standard.
    Every other predicted entity is a false positive and every gold entity
    that was not predicted is a false negative, so a prediction with the right
    text but the wrong type, sentence or span counts against both metrics.

    Args:
        predicted_df: Entities found by the tagger (five columns listed above).
        golden_df: Gold-standard entities with the same columns.

    Returns:
        A ``(precision, recall)`` tuple. Precision is ``0`` when nothing was
        predicted and recall is ``0`` when the gold standard has no entities.
    """
    key_columns = ['TAG', 'Contesto', 'Indice_inizio', 'Indice_fine', 'Entità']

    # Plain tuples make the comparison an exact, one-to-one match; a merge on
    # the entity text alone would pair every occurrence of a string with every
    # other one and hide mismatches.
    predicted = set(predicted_df[key_columns].itertuples(index=False, name=None))
    golden = set(golden_df[key_columns].itertuples(index=False, name=None))

    true_positives = len(predicted & golden)

    # TP + FP == number of predictions, TP + FN == number of gold entities.
    precision = true_positives / len(predicted) if predicted else 0
    recall = true_positives / len(golden) if golden else 0

    return precision, recall



In [5]:
import memm_tagger

# Load the two tagger outputs and the gold standard (POSITION - WORD - TAG).
Vit_df=read_tagging("viterbi_tag","it")
nayve_df=read_tagging("nayve_tag","it")
golden_df=read_tagging("golden_tag","it")


# Train and test the MEMM tagger. Its results are not used by the metrics
# computed below.
memm_data = memm_tagger.load_data(read_dataset_memm("train","it"))
memm_tag = memm_tagger.train(read_dataset_memm("train","it"),memm_data)
memm_tagger.test(read_dataset_memm("test","it"),memm_tag,memm_data)


# Entity-level view of each tagging.
nay_quadruples_df = process_dataframe(nayve_df)
vit_quadruples_df = process_dataframe(Vit_df)
gol_quadruples_df = process_dataframe(golden_df)

# Token-level accuracy and entity-level precision/recall against the gold standard.
acc_vit=calculate_accuracy(Vit_df,golden_df)
acc_nayve=calculate_accuracy(nayve_df,golden_df)
precision_v, recall_v =calculate_precision_recall(vit_quadruples_df,gol_quadruples_df)
precision_n, recall_n =calculate_precision_recall(nay_quadruples_df,gol_quadruples_df)

print("Accuracy Viterbi:"+ format(acc_vit))
# Report the naive tagger's own accuracy (this line used to print acc_vit).
print("Accuracy nayve:"+ format(acc_nayve))
print("\n")
print("precision viterbi:"+ format(precision_v))
print("recall viterbi:"+ format(recall_v))
print("\n")
print("precision nayve:"+ format(precision_n))
print("recall nayve:"+ format(recall_n))

**********Number of features: 1157494**********
fit model...
Development Accuracy: 0.964 (592.0/614.0).

KeyboardInterrupt: 