# Handling Universal Dependency files

The file `data/french_USD.conllu` contains the French Treebank Seqoia.

1. Load the file.
2. Give the number of available parsed sentences in the file.
3. Transform the file into a list of tuple of the from `(word, tag)`.
3. Build a dictionary `counts` that counts the number of `tag` for each word in the corpus.

In [3]:
# 1. Load the file
file_path = "data/french_USD.conllu"

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    print("File loaded successfully")
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")

File loaded successfully


In [None]:
# 2. Store every sentence into a dictionary
sentences = []
sentence_ids = set()

for line in content.split('\n'):
    line = line.strip()
    
    # Check for sent_id
    if line.startswith('# sent_id'):
        # Store sentence ID
        sent_id = line.split('=')[1].strip()
        sentence_ids.add(sent_id)

print(f"There is a total of {len(sentence_ids)} sentences")

There is a total of 2231 sentences


In [10]:
# 3. Store every (word, tag) in a list
word_tags= []

for line in content.split('\n'):
    line = line.strip()

    # Skip lines that start with #
    # Skip lines that are empty
    if line.startswith('#') or not line:
        continue
        
    # Parse the CONLL-U line
    parts = line.split('\t')
    if len(parts) >= 4:  # Ensure we have at least word and POS tag
        word_id = parts[0]
        word = parts[1]
        tag = parts[3]

        # Skip if it's a multi-word token with '-' in it
        if '-' in word_id:
            continue
        else:
            word_tags.append((word, tag))


In [14]:
# 4. Iterate to create the counting
count_dictionary = {}
for word_tag in word_tags:
    word = word_tag[0].lower() # lower as we only care about the POS
    tag = word_tag[1]
    # Check if the entry exists already in the count dictionary
    if not word in count_dictionary:
        count_dictionary[word] = {}
    # Check if the tag already exists for the word
    if not tag in count_dictionary[word]:
        count_dictionary[word] = {tag: 1}
    # Add a new value if already exists
    else:
        count_dictionary[word][tag] += 1

# Development of a naive approach
The naive approach for POS tagging consists in assigning the most frequent tag associated with a word.

1. Build a function called `train_tagger` that takes as input a set of lines in the USD format and returns a dictionary `most_common_tag[word]`. 
2. Create a function `tag_word(word, tag_dict, default_tag)` that tags the word `word`.
3. Create a function `tag_sentence(sentence, tag_dict, default_tag)` that returns a list of (word, tag) pairs for any input sentence.
4. Tests your function on the sentence "Le renard n'aime pas quand il court."

In [34]:
# 1. 
# We build two intermediary functions, one to parse the content, and one to count the values.
def parse_content(content: str) -> list[(str, str)]:
    """Parse the content of the conllu file and store it
    as a list of (word, tag).

    Args:
        content (str): The conllu content.

    Returns:
        list[(str, str)]: The list of word and tags.
    """
    word_tags= []

    for line in content.split('\n'):
        line = line.strip()

        # Skip lines that start with #
        # Skip lines that are empty
        if line.startswith('#') or not line:
            continue
            
        # Parse the CONLL-U line
        parts = line.split('\t')
        if len(parts) >= 4:  # Ensure we have at least word and POS tag
            word_id = parts[0]
            word = parts[1]
            tag = parts[3]

            # Skip if it's a multi-word token with '-' in it
            if '-' in word_id:
                continue
            else:
                word_tags.append((word, tag))
    return word_tags


def build_dictionary(word_tags: list[(str, str)]) -> dict[str, dict[str, int]]:
    """Build the dictionary that counts the occurence of the tags.

    Args:
        word_tags (list[(str, str)]): The list of word and their corresponding
            tags.

    Returns:
        dict[str, dict[str, int]]: The count of occurence of each tag per
            word.
    """
    count_dictionary = {}
    for word_tag in word_tags:
        word = word_tag[0].lower() # lower as we only care about the POS
        tag = word_tag[1]
        # Check if the entry exists already in the count dictionary
        if not word in count_dictionary:
            count_dictionary[word] = {}
        # Check if the tag already exists for the word
        if not tag in count_dictionary[word]:
            count_dictionary[word] = {tag: 1}
        # Add a new value if already exists
        else:
            count_dictionary[word][tag] += 1
    return count_dictionary


def train_tagger(content: str):
    """Function to build the word count dictionary given a USD file.

    Args:
        content (str): The content to parse.
    """
    word_tags = parse_content(content)
    return build_dictionary(word_tags)

trained_dict = train_tagger(content)

In [48]:
# 2. Create a function `tag_word(word, tag_dict, default_tag)` that tags the word `word`
def tag_word(word: str, tag_dict: dict[str, dict[str, int]], default_tag: str = "NOUN"):
    """Tag the word word with the most frequent tag.

    Args:
        word (str): The word to tag.
        tag_dict (dict[str, dict[str, int]]): The dictionary containing the POS.
        default_tag (str, optional): The default attribution.
            Defaults to "NOUN".
    """
    if word not in tag_dict:
        return default_tag
    tag = tag_dict[word]
    return max(tag, key=tag.get)

# 3. Create a function `tag_sentence(sentence, tag_dict, default_tag)` that returns a list of (word, tag) pairs for any input sentence.
def tag_sentence(sentence: str,
                 tag_dict: dict[str, dict[str, int]],
                 default_tag: str = "NOUN"):
    """Tag the given sentence.

    Args:
        sentence (str): The sentence to tag.
        tag_dict (dict[str, dict[str, int]]): The dictionary containing the POS.
        default_tag (str, optional): The default tag to return.
            Defaults to "NOUN".
    """
    tags = " "
    for word in sentence.split():
        tags += tag_word(word=word.lower(),
                         tag_dict=tag_dict,
                         default_tag=default_tag) + " "
    return tags.split()

# 4. Tests your function on the sentence "Le renard n'aime pas quand il court."
tag_sentence("Le renard n'aime pas quand il court",
             tag_dict=trained_dict)

['DET', 'PROPN', 'NOUN', 'ADV', 'SCONJ', 'PRON', 'ADJ']

# Evaluation of the approach

1. Create a training and test dataset, with the train containing 70\% of randomly selected words from the USD and the rest of the 30\% in the test dataset.
2. Train the parser on your train dataset.
3. Write the function `compute_accuracy(predictions, ground_truths)` that returns the accuracy given two numpy arrays containing POS tag.
4. Evaluate the quality of the naÃ¯ve parser on the test dataset.
5. Print the wrongly predicted tags and comment on the limits of the naive parser.

In [None]:
#1. Create the training and the test dataset

# Create the word_tag dataset
# and add a unique id per line
word_tags = []
for ix, line in enumerate(content.split('\n')):
    line = line.strip()

    # Skip lines that start with #
    # Skip lines that are empty
    if line.startswith('#') or not line:
        continue
        
    # Parse the CONLL-U line
    parts = line.split('\t')
    if len(parts) >= 4:  # Ensure we have at least word and POS tag
        word_id = parts[0]
        word = parts[1]
        tag = parts[3]

        # Skip if it's a multi-word token with '-' in it
        if '-' in word_id:
            continue
        else:
            word_tags.append((ix, word, tag))

import random

# Put 30% of these words in a test set and the rest in the test set
random.seed(10)
test_set = random.sample(word_tags, k=round(30/100*len(word_tags)))

# The rest in the train set
train_set = [word_tag for word_tag in word_tags if word not in test_set]
train_set_word_tag = [(word_tag[1], word_tag[2]) for word_tag in train_set]



In [58]:
# 2. Train the parser
trained_on_trainset = build_dictionary(word_tags=train_set_word_tag)

In [63]:
# 3. Compute the accuracy
import numpy as np

def compute_accuracy(predictions: np.ndarray,
                     ground_truths: np.ndarray) -> float:
    """Compute the accuracy comparing the predictions and the
    ground truth.

    Args:
        predictions (np.ndarray): The predicted values.
        ground_truths (np.ndarray): The ground truths.

    Returns:
        float: The accuracy.
    """
    return sum((1 - (predictions != ground_truths))/len(predictions))*100

In [71]:
# 4. Evaluate the performance of the naive parser
predictions = []
ground_truth = []
for word in test_set:
    ground_truth.append(word[2])
    predictions.append(tag_word(word[1], tag_dict=trained_on_trainset))

print("=== Predictions ===")
print(predictions)

print("=== Ground truth ===")
print(ground_truth)

print("=== Accuracy of naive tagger ===")
print(compute_accuracy(
    np.array(predictions),
    np.array(ground_truth)
))

=== Predictions ===
['PUNCT', 'PUNCT', 'PUNCT', 'PROPN', 'CCONJ', 'NOUN', 'NOUN', 'PRON', 'DET', 'NOUN', 'VERB', 'AUX', 'ADP', 'DET', 'NOUN', 'DET', 'ADJ', 'ADP', 'NOUN', 'PUNCT', 'ADP', 'ADP', 'VERB', 'NOUN', 'PRON', 'NOUN', 'ADP', 'NOUN', 'VERB', 'ADV', 'DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'ADV', 'AUX', 'ADV', 'ADP', 'VERB', 'DET', 'ADV', 'ADJ', 'NOUN', 'PRON', 'NOUN', 'AUX', 'ADP', 'NOUN', 'NUM', 'DET', 'DET', 'NOUN', 'DET', 'NOUN', 'NOUN', 'AUX', 'ADP', 'NOUN', 'DET', 'ADV', 'VERB', 'PUNCT', 'ADV', 'VERB', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'DET', 'NOUN', 'ADP', 'PUNCT', 'ADP', 'NOUN', 'NOUN', 'DET', 'NOUN', 'ADV', 'VERB', 'VERB', 'AUX', 'ADP', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN', 'VERB', 'ADP', 'ADP', 'ADV', 'VERB', 'NOUN', 'DET', 'NOUN', 'ADP', 'ADP', 'DET', 'PUNCT', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'ADP', 'ADP', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADP', 'NOUN', 'NOUN', 'DET', 'ADV', 'PUNCT', 'DET', 'ADP', 'ADJ', 'VERB', 'CCONJ', 'PUNCT', 'NOUN', 'ADV', 'ADP', 'NUM',

# Bonus : object oriented programming

Wrap all the methods to build a NaivePOSTagger class, that should have a method `fit` given a corpus and `tag` given a sentence.