In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Training on: {device}")

  from .autonotebook import tqdm as notebook_tqdm
2025-12-04 18:07:12.003800: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-04 18:07:12.601873: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-04 18:07:14.024662: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Training on: cuda


In [2]:
# --- Funções de Leitura de Dados ---

def read_conllu_file(filepath):
    """
    Read a CoNLL-U format file and extract words and POS tags sentence by sentence.
    
    Args:
        filepath: Path to the CoNLL-U file
        
    Returns:
        A list of dictionaries, each containing 'words' and 'pos_tags' lists for a sentence
    """
    sentences = []
    current_sentence = {'words': [], 'pos_tags': []}
    
    with open(filepath, "r", encoding="utf-8") as data_file:
        for line in data_file:
            if line.startswith("#"):
                # Skip comment lines
                pass
            elif line.strip() == "":
                # Empty line marks end of sentence
                if current_sentence['words']:  # Only add non-empty sentences
                    sentences.append(current_sentence)
                    current_sentence = {'words': [], 'pos_tags': []}
            else:
                # Parse the token line
                fields = line.split("\t")
                word, pos = fields[1], fields[3]
                current_sentence['words'].append(word)
                current_sentence['pos_tags'].append(pos)
    
    return sentences

#load data
TRAIN = "./data/en_ewt-ud-train.conllu"
DEV = "./data/en_ewt-ud-dev.conllu"
TEST = "./data/en_ewt-ud-test.conllu"

# --- 1) Carregar Dados ---
try:
    train_sents = read_conllu_file(TRAIN)
    val_sents = read_conllu_file(DEV)
    test_sents = read_conllu_file(TEST)
    print("Loaded sentences:", len(train_sents), len(val_sents), len(test_sents))
except FileNotFoundError as e:
    print(f"Erro: Ficheiro de dados não encontrado: {e.filename}. Certifique-se de que os ficheiros CoNLL-U estão em './data/'")
    exit()

# Display preview
print(f"Total sentences: {len(val_sents)}")
print(f"First 3 sentences:")
for i, sent in enumerate(val_sents[:3]):
    print(f"Sentence {i+1}:")
    print(f"  Words: {sent['words']}")
    print(f"  POS tags: {sent['pos_tags']}")

Loaded sentences: 12544 2001 2077
Total sentences: 2001
First 3 sentences:
Sentence 1:
  Words: ['From', 'the', 'AP', 'comes', 'this', 'story', ':']
  POS tags: ['ADP', 'DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'PUNCT']
Sentence 2:
  Words: ['President', 'Bush', 'on', 'Tuesday', 'nominated', 'two', 'individuals', 'to', 'replace', 'retiring', 'jurists', 'on', 'federal', 'courts', 'in', 'the', 'Washington', 'area', '.']
  POS tags: ['PROPN', 'PROPN', 'ADP', 'PROPN', 'VERB', 'NUM', 'NOUN', 'PART', 'VERB', 'VERB', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'PROPN', 'NOUN', 'PUNCT']
Sentence 3:
  Words: ['Bush', 'nominated', 'Jennifer', 'M.', 'Anderson', 'for', 'a', '15', '-', 'year', 'term', 'as', 'associate', 'judge', 'of', 'the', 'Superior', 'Court', 'of', 'the', 'District', 'of', 'Columbia', ',', 'replacing', 'Steffen', 'W.', 'Graae', '.']
  POS tags: ['PROPN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'NOUN', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'PROP

In [None]:
# --- Configuration ---
MAX_LEN = 128
BATCH_SIZE = 16
MODEL_NAME = 'distilbert-base-cased' # Case sensitive is usually better for POS

# --- 1. Label Mapping (Reusing from Task 1.1) ---
# We need to ensure we have the same tag map. 
# Re-running build_vocab just to be safe and self-contained
def get_tag_map(sentences):
    tags = set()
    for sent in sentences:
        for tag in sent['pos_tags']:
            tags.add(tag)
    tag2id = {tag: i for i, tag in enumerate(sorted(list(tags)))}
    id2tag = {i: tag for tag, i in tag2id.items()}
    return tag2id, id2tag

# Load data (assuming sentences are already loaded from previous task)
# train_sentences, dev_sentences, test_sentences are available
tag2id, id2tag = get_tag_map(train_sents)
num_labels = len(tag2id)

print(f" distinctive tags: {num_labels}")

# --- 2. Tokenization & Alignment Function ---
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

class TransformerPOSDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, tag2id, tokenizer, max_len):
        self.sentences = sentences
        self.tag2id = tag2id
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        item = self.sentences[idx]
        word_list = item['words']
        label_list = item['pos_tags']
        
        # Tokenize the sentence
        # is_split_into_words=True tells the tokenizer we are providing a list of words
        encoding = self.tokenizer(
            word_list,
            is_split_into_words=True,
            return_offsets_mapping=True, 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        # Create labels aligned with tokens
        labels = []
        encoded_labels = []
        
        # The offset mapping helps us determine which original word a token belongs to
        # It returns tuples (start, end) char indices. (0,0) usually means special token.
        doc_encodings = encoding.encodings[0]
        
        # word_ids returns a list where each element indicates the index of the word 
        # in the original sentence that the token corresponds to. 
        # None indicates special tokens like [CLS] or [SEP].
        word_ids = encoding.word_ids()
        
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Special token ([CLS], [SEP], [PAD]) -> Ignore (-100)
                encoded_labels.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a new word -> Use the real label
                tag = label_list[word_idx]
                encoded_labels.append(self.tag2id[tag])
            else:
                # Subsequent subword of the same word -> Ignore (-100)
                # Alternatively, you could propagate the label, but -100 is standard
                encoded_labels.append(-100)
            previous_word_idx = word_idx
            
        # Squeeze to remove batch dimension added by tokenizer
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(encoded_labels)
        
        # Remove offset_mapping as it's not needed for the model
        if 'offset_mapping' in item:
             del item['offset_mapping']
             
        return item

# Create Datasets
train_dataset = TransformerPOSDataset(train_sents, tag2id, tokenizer, MAX_LEN)
test_dataset = TransformerPOSDataset(test_sents, tag2id, tokenizer, MAX_LEN)
val_dataset = TransformerPOSDataset(val_sents, tag2id, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

 distinctive tags: 18


In [4]:
def train_model(model, train_loader, val_loader, NUM_EPOCHS=10, learning_rate=0.001):
    
    # Loss function and optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(NUM_EPOCHS):
        model.train()
        running_train_loss = 0.0

        for batch in train_loader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            running_train_loss += loss.item()

        train_loss = running_train_loss / len(train_loader)
        
        # --- Validation Loop ---
        model.eval() # Set the model to evaluation mode (e.g., disables dropout)
        running_val_loss = 0.0
        with torch.no_grad(): # Disable gradient calculation for validation
            for batch in val_loader:
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                running_val_loss += loss.item()

        val_loss = running_val_loss / len(val_loader)

        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


In [None]:
model_bert = DistilBertForTokenClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=num_labels
).to(device)

print(f"Bert Model parameters: {sum(p.numel() for p in model_bert.parameters())}")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
EPOCHS = 3
LEARNING_RATE = 5e-5

start_time = time.time()
print("Starting Fine-tuning...")

bert = train_model(model_bert,train_loader,val_loader,EPOCHS,LEARNING_RATE)

training_time = time.time() - start_time

Starting Fine-tuning...
Epoch [1/3], Train Loss: 0.1962, Val Loss: 0.1178
Epoch [2/3], Train Loss: 0.0533, Val Loss: 0.1167
Epoch [3/3], Train Loss: 0.0292, Val Loss: 0.1300


In [None]:
def evaluate_model(model, loader, id2tag):
    model.eval()
    
    all_labels = []
    all_pred = []
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits # Shape: (batch, seq_len, num_labels)
            predictions = torch.argmax(logits, dim=2)
            
            # Flatten to align
            predictions = predictions.view(-1).cpu().numpy()
            labels = labels.view(-1).cpu().numpy()
            
            # Filter out ignored indices (-100)
            # This ensures we only evaluate on the first subword of real words
            valid_indices = labels != -100
            
            valid_preds = predictions[valid_indices]
            valid_targets = labels[valid_indices]
            
            all_labels.extend(valid_targets)
            all_pred.extend(valid_preds)

    # Convert ids back to tags
    true_tags = [id2tag[i] for i in all_labels]
    pred_tags = [id2tag[i] for i in all_pred]
    
    acc = accuracy_score(true_tags, pred_tags)
    report = classification_report(true_tags, pred_tags, zero_division=0)
    
    return acc, report

In [13]:
acc, report = evaluate_model(model_bert, test_loader, id2tag)

print(f"Accuracy: {100*acc:.4f}%")
print(f"Training time: {training_time:.2f} seconds")
print(f"Report: \n{report}")

Accuracy: 96.7505%
Training time: 357.92 seconds
Report: 
              precision    recall  f1-score   support

         ADJ       0.90      0.95      0.93      1788
         ADP       0.97      0.98      0.98      2025
         ADV       0.96      0.95      0.95      1191
         AUX       1.00      1.00      1.00      1543
       CCONJ       1.00      1.00      1.00       736
         DET       0.99      1.00      0.99      1897
        INTJ       0.95      0.87      0.91       121
        NOUN       0.96      0.93      0.95      4123
         NUM       0.95      1.00      0.97       542
        PART       1.00      0.99      1.00       649
        PRON       0.99      0.99      0.99      2165
       PROPN       0.90      0.91      0.91      2075
       PUNCT       1.00      0.99      0.99      3096
       SCONJ       0.96      0.96      0.96       384
         SYM       0.76      0.97      0.86       113
        VERB       0.98      0.98      0.98      2606
           X       0.89