In [1]:
import pandas as pd
import numpy as np
import pickle 

# function to read data from file
def read_dataset(file_name, language):
    """Load one dataset split of a language into a DataFrame.

    The file is read from ``../data/<language>/dataset/<file_name>.conllu``
    as tab-separated text with ``quoting=3`` and no header row; the columns
    are named POSITION, WORD and TAG.

    Args:
        file_name: Split name without extension (e.g. "train").
        language: Language sub-directory under ``../data``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    column_names = ["POSITION", "WORD", "TAG"]
    dataset_file = "/".join(["../data", language, "dataset", file_name + ".conllu"])
    return pd.read_csv(dataset_file, sep='\t', quoting=3, names=column_names)



In [2]:
#serialize data into a file 
def save_data(data, file_name, language):
    """Pickle ``data`` to ``../data/<language>/<file_name>``.

    Saving is best-effort: any failure while opening the file or pickling is
    reported on stdout instead of being raised, so the function always
    returns None.

    Args:
        data: Object to serialize with ``pickle.dump``.
        file_name: Name of the output file.
        language: Language sub-directory under ``../data``.
    """
    path = "../data/" + language + "/" + file_name
    try:
        # The context manager closes the handle even when pickle.dump raises.
        with open(path, 'wb') as file:
            pickle.dump(data, file)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate; the cause is included so the failure can be diagnosed.
        print(f"Error in writing data to '{path}': {exc}")




### Conteggio
- word_tag_counts = numero di occorrenze in cui una parola è associata ad un certo tag
- tag_tag_counts = numero di occorrenze in cui un tag è seguito da un certo tag

In [3]:
def counts_occurence(train_data):
    """Count tag, word/tag and tag/tag occurrences in a tagged corpus.

    Args:
        train_data: DataFrame with POSITION, WORD and TAG columns, one token
            per row. A POSITION equal to 0 is treated as the first token of a
            sequence, so no transition is counted into it.

    Returns:
        A tuple ``(tag_counts, word_tag_counts, tag_tag_counts)``:
          * ``tag_counts[tag]``: number of rows carrying ``tag``;
          * ``word_tag_counts[word][tag]``: number of rows where ``word``
            carries ``tag``;
          * ``tag_tag_counts[prev][tag]``: number of times ``tag`` directly
            follows ``prev`` (rows with POSITION 0 are not counted as
            followers).
        Every inner dictionary holds an entry (possibly 0) for every tag.
    """
    tag_list = train_data['TAG'].unique()
    word_list = train_data['WORD'].unique()

    def zeroed_tag_counts():
        # Fresh {tag: 0, ...} dictionary so that no two tables share state.
        return {tag: 0 for tag in tag_list}

    tag_counts = zeroed_tag_counts()
    word_tag_counts = {word: zeroed_tag_counts() for word in word_list}
    tag_tag_counts = {tag: zeroed_tag_counts() for tag in tag_list}

    # `has_prev` guards the very first row: if the data does not start at
    # POSITION 0 there is no previous tag yet, so no transition is counted
    # (previously this raised UnboundLocalError).
    has_prev = False
    prev_tag = None

    # Iterating the three columns in lockstep avoids building a Series per
    # row, which is what DataFrame.iterrows() does.
    rows = zip(train_data['WORD'], train_data['TAG'], train_data['POSITION'])
    for word, tag, pos in rows:
        if pos != 0 and has_prev:
            tag_tag_counts[prev_tag][tag] += 1

        tag_counts[tag] += 1
        word_tag_counts[word][tag] += 1
        prev_tag = tag
        has_prev = True

    return tag_counts, word_tag_counts, tag_tag_counts



### Probabilità
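- emission_prob = $p(w_i|t_i)$: probabilità che, dato un tag, questo sia associato ad una certa parola (nota: nel codice i conteggi sono normalizzati per parola, quindi per ogni parola i valori sui tag sommano a 1)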
- transition_prob = $p(t_i|t_{i-1})$: probabilità di occorrenza di un tag dato il tag precedente

In [4]:
def calculate_probs(tag_counts, word_tag_counts, tag_tag_counts):
    """Turn raw occurrence counts into relative frequencies.

    Args:
        tag_counts: ``{tag: count}``; only its keys are used, to create one
            (initially empty) transition row per tag.
        word_tag_counts: ``{word: {tag: count}}``.
        tag_tag_counts: ``{previous_tag: {next_tag: count}}``.

    Returns:
        A tuple ``(emission_prob, transition_prob)``:
          * ``emission_prob[word][tag]`` is the word/tag count divided by the
            sum of that word's counts over all tags (the normalisation is per
            word);
          * ``transition_prob[prev][next]`` is the prev/next count divided by
            the sum of the counts of all tags following ``prev``.
        A row whose counts sum to 0 is filled with 0.
    """
    def _normalise_rows(count_table, prob_table):
        # Fill prob_table[outer][inner] with the count's share of its row.
        for outer_key, inner_counts in count_table.items():
            row_total = sum(inner_counts.values())
            for inner_key, occurrences in inner_counts.items():
                if row_total > 0:
                    prob_table[outer_key][inner_key] = occurrences / row_total
                else:
                    prob_table[outer_key][inner_key] = 0

    emission_prob = {word: {} for word in word_tag_counts}
    transition_prob = {tag: {} for tag in tag_counts}

    # Emission probabilities
    _normalise_rows(word_tag_counts, emission_prob)
    # Transition probabilities
    _normalise_rows(tag_tag_counts, transition_prob)

    return emission_prob, transition_prob



In [5]:
def check_probabilities(emission_prob, transition_prob):
    """Print a warning for every row whose probabilities do not sum to 1.

    A row is reported when the absolute difference between its sum and 1 is
    greater than 1e-6. Nothing is returned.

    Args:
        emission_prob: ``{word: {tag: probability}}``.
        transition_prob: ``{tag: {next_tag: probability}}``.
    """
    tolerance = 1e-6

    # Check the emission probabilities, one row per word
    for word in emission_prob:
        total_emission_prob = sum(emission_prob[word].values())
        deviation = abs(total_emission_prob - 1)
        if deviation > tolerance:
            print(f"WARNING: La somma delle probabilità di emissione per la parola '{word}' non è 1, ma {total_emission_prob}")

    # Check the transition probabilities, one row per tag
    for tag in transition_prob:
        total_transition_prob = sum(transition_prob[tag].values())
        deviation = abs(total_transition_prob - 1)
        if deviation > tolerance:
            print(f"WARNING: La somma delle probabilità di transizione per il tag '{tag}' non è 1, ma {total_transition_prob}")




In [6]:
def calculate_unknown_probs(val_data):
    """Estimate a tag distribution for "unknown" words.

    The estimate uses only the rows of ``val_data`` whose WORD occurs exactly
    once in ``val_data``: the value for a tag is the share of those rows that
    carry it.

    Args:
        val_data: DataFrame with at least WORD and TAG columns.

    Returns:
        ``{tag: probability}`` with one entry for every tag found in
        ``val_data``. A tag that never labels a once-occurring word gets 0.0
        (previously this raised KeyError), and every value is 0.0 when no
        word occurs exactly once.
    """
    word_counts = val_data['WORD'].value_counts()
    singleton_rows = val_data[val_data['WORD'].map(word_counts) == 1]
    singleton_tag_counts = singleton_rows['TAG'].value_counts()
    total = int(singleton_tag_counts.sum())

    unknown_prob = {}
    for tag in val_data['TAG'].unique():
        # .get() covers tags that appear only on repeated words and are
        # therefore absent from singleton_tag_counts.
        count = int(singleton_tag_counts.get(tag, 0))
        unknown_prob[tag] = count / total if total > 0 else 0.0
    return unknown_prob
    


In [7]:
# Learn the transition and emission probabilities for the languages en, it, es
# and pickle them, together with the unknown-word tag probabilities, under
# ../data/<language>/.
for language in ["en","it","es"]:
    # Training split -> occurrence counts -> probability tables
    train_data=read_dataset("train",language)
    tag_counts, word_tag_counts, tag_tag_counts= counts_occurence(train_data)
    emission_prob, transition_prob= calculate_probs(tag_counts, word_tag_counts, tag_tag_counts)
    # Prints a warning for every row that does not sum to 1
    check_probabilities(emission_prob, transition_prob)
    save_data(transition_prob,"transition_prob",language)
    save_data(emission_prob,"emission_prob",language)
    # Validation split -> tag probabilities used for unknown words
    val_data=read_dataset("val",language)
    unknown_prob=calculate_unknown_probs(val_data)
    save_data(unknown_prob,"unknown_prob",language)