In [1]:
import pandas as pd
import numpy as np
import pickle 

def read_dataset(file_name,language):
    """Load one dataset split into a DataFrame.

    The file read is ``../data/<language>/<file_name>.conllu`` (relative to
    the current working directory). It is parsed as tab-separated text with
    no header row; the three columns are named POSITION, WORD and TAG.

    Args:
        file_name: split name without extension, e.g. "train".
        language: language sub-directory, e.g. "en".

    Returns:
        pandas.DataFrame with columns ["POSITION", "WORD", "TAG"].
    """
    conllu_path = "/".join(("../data", language, file_name + ".conllu"))
    column_names = ["POSITION", "WORD", "TAG"]
    # quoting=3 is csv.QUOTE_NONE: quote characters inside tokens stay literal.
    # NOTE(review): pandas' default NA parsing is still active, so tokens such
    # as "null" or "NA" would be read as NaN - confirm whether that is wanted.
    return pd.read_csv(conllu_path, sep='\t', quoting=3, names=column_names)

# Load the English training split and print pandas' truncated view of it.
train_data = read_dataset(file_name="train", language="en")
print(train_data)

         POSITION      WORD TAG
0               0      This   O
1               1  division   O
2               2      also   O
3               3  contains   O
4               4       the   O
...           ...       ...  ..
2193674        27      born   O
2193675        28         1   O
2193676        29       May   O
2193677        30      1964   O
2193678        31         .   O

[2193679 rows x 3 columns]


# LEARNING:
### Counting
- word_tag_counts = number of times a word is associated with each tag
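- tag_tag_counts = number of times a tag is immediately followed by another tag (rows with POSITION 0 start a new sequence and add no transition; the extra "START" tag is currently commented out in the code below)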

In [14]:
def counts_occurence(train_data):
  """Count tag, word/tag and tag/tag occurrences in a tagged corpus.

  Args:
      train_data: DataFrame with columns POSITION, WORD and TAG, one row per
          token. A row whose POSITION is 0 is treated as the start of a new
          sequence, so no transition is counted into it.

  Returns:
      A tuple ``(tag_counts, word_tag_counts, tag_tag_counts)``:
        - tag_counts: {tag: occurrences of the tag}
        - word_tag_counts: {word: {tag: times the word carries the tag}}
        - tag_tag_counts: {prev_tag: {tag: times tag directly follows prev_tag}}
      Every inner dict holds an entry (possibly 0) for every tag seen in
      ``train_data``. No "START" pseudo-tag is added.
  """
  tag_list = train_data['TAG'].unique()
  word_list = train_data['WORD'].unique()
  empty_tag_count_dict = {tag: 0 for tag in tag_list}

  tag_counts = empty_tag_count_dict.copy()  # {tag1:0, tag2:0, ...}
  # Each word / previous tag gets its own copy so counters are not shared.
  word_tag_counts = {word: empty_tag_count_dict.copy() for word in word_list}
  tag_tag_counts = {tag: empty_tag_count_dict.copy() for tag in tag_list}

  # None until the first row is seen: guards against data whose first row
  # does not have POSITION 0 (there is no previous tag to transition from).
  prev_tag = None

  # Zipping the columns avoids building one Series per row (iterrows), which
  # is very slow on corpora with millions of tokens.
  for pos, word, tag in zip(train_data['POSITION'], train_data['WORD'], train_data['TAG']):
    if pos != 0 and prev_tag is not None:
      tag_tag_counts[prev_tag][tag] += 1

    tag_counts[tag] += 1
    word_tag_counts[word][tag] += 1
    prev_tag = tag

  return tag_counts, word_tag_counts, tag_tag_counts

# Count tag, word/tag and tag/tag occurrences on the loaded training data.
(tag_counts,
 word_tag_counts,
 tag_tag_counts) = counts_occurence(train_data)



### Probabilities
- emission_prob = probability, given a tag, that it will be associated with a given word
- transition_prob = probability of a tag occurring given the previous tag

In [25]:
def calculate_probs(tag_counts, word_tag_counts, tag_tag_counts):
    """Turn raw counts into emission and transition probabilities.

    Args:
        tag_counts: {tag: occurrences of the tag}.
        word_tag_counts: {word: {tag: times the word carries the tag}}.
        tag_tag_counts: {prev_tag: {next_tag: times next_tag follows prev_tag}}.

    Returns:
        A tuple ``(emission_prob, transition_prob)``:
          - emission_prob[word][tag] = count(word, tag) / count(tag),
            i.e. P(word | tag): given a tag, the probability that it is
            associated with the word.
          - transition_prob[prev][next] = count(prev, next) / count(prev),
            i.e. P(next | prev).
        A zero (or missing) denominator yields probability 0.
    """
    def _ratio(count, total):
        # Relative frequency that tolerates tags which were never observed.
        return count / total if total > 0 else 0

    # Emission probabilities: normalise by the TAG total, not by the word
    # total (dividing by the word total would give P(tag | word) instead).
    emission_prob = {
        word: {
            tag: _ratio(count, tag_counts.get(tag, 0))
            for tag, count in per_tag.items()
        }
        for word, per_tag in word_tag_counts.items()
    }

    # Transition probabilities: every known tag gets an entry, even if it
    # never appears as a previous tag in tag_tag_counts.
    transition_prob = {tag: {} for tag in tag_counts}
    for prev, following in tag_tag_counts.items():
        prev_total = tag_counts.get(prev, 0)
        transition_prob[prev] = {
            next_tag: _ratio(count, prev_total)
            for next_tag, count in following.items()
        }

    return emission_prob, transition_prob

# Convert the counts computed above into probability tables.
(emission_prob,
 transition_prob) = calculate_probs(tag_counts, word_tag_counts, tag_tag_counts)




In [27]:
def save_data(data,file_name,language):
    """Serialize ``data`` with pickle to ``../data/<language>/<file_name>``.

    Saving is best-effort: if the file cannot be opened or the object cannot
    be pickled, the error is printed and the function returns normally.
    A file that was opened before the failure may be left empty or partial.

    Args:
        data: any picklable object.
        file_name: name of the output file (no extension is added).
        language: language sub-directory under ``../data``; it must already
            exist, otherwise the error is reported.

    Returns:
        None.
    """
    path="../data/"+language+"/"+file_name
    try:
        # The context manager closes the handle even when pickle.dump raises.
        with open(path, 'wb') as file:
            pickle.dump(data, file)
    except Exception as error:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate; the cause is reported instead of being hidden.
        print("Error in writing data to " + path + ": " + repr(error))

# save_data(transition_prob,"transition_prob","it")

In [28]:
# Learn emission and transition probabilities for each language
for language in ["en","it","es"]:
    # Load this language's own training split; the counts must come from it
    # and not from the global English `train_data`.
    language_train_data = read_dataset("train",language)
    tag_counts, word_tag_counts, tag_tag_counts= counts_occurence(language_train_data)
    emission_prob, transition_prob= calculate_probs(tag_counts, word_tag_counts, tag_tag_counts)
    # Persist both tables under ../data/<language>/ for later use.
    save_data(transition_prob,"transition_prob",language)
    save_data(emission_prob,"emission_prob",language)