# Text Classification with TECLA Dataset

This notebook demonstrates how to load the TECLA dataset with the Hugging Face `datasets` library and use it for text classification in Catalan.

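TECLA (short for "TExt CLAssification") is a Catalan news corpus for thematic text classification. Each example has a `sentence` field holding the article text and two label columns, `label1` and `label2`.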

In [2]:
# Load the projecte-aina/tecla dataset from the Hugging Face Hub
from collections import Counter

from datasets import load_dataset

tecla_dataset = load_dataset("projecte-aina/tecla")
train_split = tecla_dataset['train']

# Report the available splits and their sizes
print(f"Claves del dataset: {tecla_dataset.keys()}")
print(f"Tamaño del conjunto de entrenamiento: {len(train_split)}")
print(f"Tamaño del conjunto de validación: {len(tecla_dataset['validation'])}")
print(f"Tamaño del conjunto de prueba: {len(tecla_dataset['test'])}")

# Report the columns available in the dataset
print(f"\nColumnas disponibles: {train_split.column_names}")

# Show one raw example
print("\nEjemplo de los datos:")
print(train_split[0])

# Materialise each label column exactly once; every statistic below reuses
# these lists instead of re-reading the column from the dataset.
label1_column = list(train_split['label1'])
label2_column = list(train_split['label2'])

# Explore the distinct classes available for label1 and label2
label1_values = set(label1_column)
label2_values = set(label2_column)

print(f"\nNúmero de clases en label1: {len(label1_values)}")
print(f"Clases disponibles en label1: {label1_values}")

print(f"\nNúmero de clases en label2: {len(label2_values)}")
print(f"Clases disponibles en label2: {label2_values}")


def print_label_distribution(column_values):
    """Print each label with its count and percentage, most frequent first.

    Args:
        column_values: list with one label per training example.

    Returns:
        A ``Counter`` mapping each label to its number of occurrences.
    """
    counts = Counter(column_values)
    total = len(column_values)
    # most_common() sorts by count descending and keeps first-seen order on ties
    for label, count in counts.most_common():
        print(f"{label}: {count} ({count/total*100:.2f}%)")
    return counts


# Show how the training examples are distributed over the labels
print("\nDistribución de label1:")
label1_counts = print_label_distribution(label1_column)

print("\nDistribución de label2:")
label2_counts = print_label_distribution(label2_column)

Claves del dataset: dict_keys(['train', 'validation', 'test'])
Tamaño del conjunto de entrenamiento: 90700
Tamaño del conjunto de validación: 5669
Tamaño del conjunto de prueba: 17007

Columnas disponibles: ['sentence', 'label1', 'label2']

Ejemplo de los datos:
{'sentence': "L'ACA reactiva el retorn del cànon de l'aigua que s'envia a Tarragona i millorarà l'eficiència del canal de l'esquerra. L'obra corregirà pèrdues d'aigua a la sèquia del Cementiri de Deltebre amb una inversió de 900.000 euros. Després d'alguns exercici de paràlisi, la Comunitat de Regants de l'Esquerra de l'Ebre i l'ACA han signat, aquest dilluns, un nou conveni per reactivar obres de millora de l'eficiència d'infraestructures de reg i evitar la pèrdua d'aigua. Les actuacions començaran al febrer de l'any que ve en un tram de 4,9 quilòmetres de la sèquia del Cementiri, al terme municipal de Deltebre (Baix Ebre).Es col·locaran plaques i llits de graves, làmines de geotèxtil, i es revestiran més de 3 quilòmetres de l

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset

# Seed the random number generators so that weight initialisation of the
# classification head and DataLoader shuffling are repeatable across runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Pre-trained checkpoint to fine-tune (a RoBERTa model for Catalan)
model_name = "projecte-aina/roberta-base-ca"
print(f"Using model: {model_name}")

# Name of the dataset column used as the classification target.
# Change to "label2" to train on the other label column instead.
target_label = "label1"
print(f"Classification target: {target_label}")

## Data Preprocessing

We'll tokenize the text data using the tokenizer from the pre-trained model and prepare it for training.

In [None]:
# Load the tokenizer that matches the pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_dataset(examples):
    """Tokenize a batch of TECLA examples.

    The article text lives in the ``sentence`` column (the dataset has no
    ``text`` column), so that is the field handed to the tokenizer.

    Args:
        examples: mapping with a ``"sentence"`` entry holding one string or a
            list of strings.

    Returns:
        The tokenizer output, padded and truncated to 512 tokens.
    """
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=512)

# Column of the TECLA splits that holds the raw article text
# (the dataset columns are 'sentence', 'label1' and 'label2').
text_column = "sentence"

# Keep the raw Hugging Face splits under their own names so that the
# *_dataset names below only ever refer to the final TensorDatasets.
train_split = tecla_dataset["train"]
val_split = tecla_dataset["validation"]
test_split = tecla_dataset["test"]

# Build the label <-> id mappings from the labels seen in the training split
labels = sorted(set(train_split[target_label]))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(labels)

print(f"Number of labels: {num_labels}")
print(f"Label mapping: {label2id}")


def encode_split(split):
    """Tokenize one split and convert its target labels to a tensor of ids.

    Args:
        split: a dataset split exposing ``text_column`` and ``target_label``.

    Returns:
        A ``(encodings, label_ids)`` pair: the tokenizer output as PyTorch
        tensors and a 1-D tensor with the numeric id of each example's label.
    """
    # list(...) hands the tokenizer a plain list of strings whatever
    # container type the datasets library returns for a column.
    encodings = tokenizer(
        list(split[text_column]),
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    label_ids = torch.tensor([label2id[label] for label in split[target_label]])
    return encodings, label_ids


# NOTE: every example is padded to 512 tokens and the whole split is held in
# memory as dense tensors, so this step is memory-hungry on the full dataset.
train_encodings, train_labels = encode_split(train_split)
val_encodings, val_labels = encode_split(val_split)
test_encodings, test_labels = encode_split(test_split)

# Wrap (input_ids, attention_mask, label) triples as TensorDatasets
train_dataset = TensorDataset(train_encodings.input_ids, train_encodings.attention_mask, train_labels)
val_dataset = TensorDataset(val_encodings.input_ids, val_encodings.attention_mask, val_labels)
test_dataset = TensorDataset(test_encodings.input_ids, test_encodings.attention_mask, test_labels)

# Only the training data is shuffled; evaluation order stays fixed so that
# predictions line up with the original example indices.
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

## Model Definition

We'll fine-tune a RoBERTa model pre-trained on Catalan text (`projecte-aina/roberta-base-ca`) for our classification task.

In [None]:
# Load the pre-trained encoder with a freshly initialised classification head.
# Passing the label mappings stores the class names in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Move model to the appropriate device
model = model.to(device)

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the values the transformers implementation used by default,
# so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The linear schedule decays the learning rate to zero over the whole run,
# so it needs the total number of optimizer steps. `epochs` must match the
# number of epochs executed by the training loop.
epochs = 5
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

## Training Function

We'll define functions for training and evaluation.

In [None]:
def compute_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids.

    Returns:
        The accuracy as computed by ``sklearn.metrics.accuracy_score``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return accuracy_score(true_classes, predicted_classes)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is expected to be an ``(input_ids, attention_mask, labels)``
    triple. For every batch the gradients are reset, the loss is
    back-propagated, gradients are clipped to a norm of 1.0 and both the
    optimizer and the scheduler are stepped.

    Returns:
        ``(avg_loss, accuracy)``: the mean batch loss and the accuracy over
        all examples seen during the epoch.
    """
    model.train()
    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader:
        on_device = [tensor.to(device) for tensor in batch]
        targets = on_device[2]

        # Start every step from zeroed gradients
        optimizer.zero_grad()

        # Supplying labels makes the model compute the loss itself
        outputs = model(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=targets,
        )
        step_loss = outputs.loss

        step_loss.backward()

        # Guard against exploding gradients before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        running_loss += step_loss.item()

        # sklearn works on CPU numpy arrays
        logit_chunks.append(outputs.logits.detach().cpu().numpy())
        label_chunks.append(targets.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    epoch_logits = np.concatenate(logit_chunks)
    epoch_labels = np.concatenate(label_chunks)

    return avg_loss, compute_accuracy(epoch_logits, epoch_labels)

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without updating any parameters.

    Each batch is expected to be an ``(input_ids, attention_mask, labels)``
    triple.

    Returns:
        ``(avg_loss, accuracy, all_preds, all_labels)``: the mean batch loss,
        the overall accuracy, the stacked logits and the stacked true labels
        (both as numpy arrays).
    """
    model.eval()
    running_loss = 0
    logit_chunks = []
    label_chunks = []

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for batch in dataloader:
            on_device = [tensor.to(device) for tensor in batch]
            targets = on_device[2]

            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=targets,
            )

            running_loss += outputs.loss.item()

            # sklearn works on CPU numpy arrays
            logit_chunks.append(outputs.logits.detach().cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    all_preds = np.concatenate(logit_chunks)
    all_labels = np.concatenate(label_chunks)
    accuracy = compute_accuracy(all_preds, all_labels)

    return avg_loss, accuracy, all_preds, all_labels

## Model Training

Now we'll train the model for a few epochs and evaluate its performance.

In [None]:
# Fine-tune for a fixed number of epochs, tracking metrics per epoch and
# checkpointing whenever the validation accuracy improves.
epochs = 5
best_val_accuracy = 0
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    print('-' * 40)

    # One optimisation pass over the training data
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, scheduler, device)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Measure generalisation on the held-out validation split
    val_loss, val_accuracy, val_preds, val_labels = evaluate(model, val_loader, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Keep only the weights of the best epoch seen so far
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_tecla_model.pt')
        print("Saved best model!")

# Learning curves: loss on the left panel, accuracy on the right panel
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Loss over Epochs')

accuracy_ax.plot(train_accuracies, label='Train Accuracy')
accuracy_ax.plot(val_accuracies, label='Validation Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()
accuracy_ax.set_title('Accuracy over Epochs')

fig.tight_layout()
plt.show()

## Model Evaluation

Let's evaluate our best model on the test set and analyze its performance.

In [None]:
# Restore the checkpoint with the best validation accuracy. map_location lets
# a checkpoint that was saved on a GPU be loaded on a CPU-only machine too.
model.load_state_dict(torch.load('best_tecla_model.pt', map_location=device))

# Evaluate on the test set
test_loss, test_accuracy, test_preds, test_labels = evaluate(model, test_loader, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Turn the logits into predicted class ids
test_predictions = np.argmax(test_preds, axis=1)

# Pin the report and the confusion matrix to the full, ordered set of class
# ids. Without this, a class that never occurs in the test labels or the
# predictions would shift the rows/columns away from their names.
class_ids = list(range(num_labels))
# sklearn and seaborn expect display names to be strings
class_names = [str(label) for label in labels]

print("\nClassification Report:")
print(classification_report(test_labels, test_predictions, labels=class_ids, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes
plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_predictions, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

## Error Analysis

Let's analyze some misclassified examples to understand where our model struggles.

In [None]:
# Predicted class id for every test example
test_pred_classes = np.argmax(test_preds, axis=1)

# Positions where the prediction disagrees with the true label.
# .tolist() yields plain Python ints, which the dataset accepts as row indices.
misclassified_indices = np.flatnonzero(test_pred_classes != np.asarray(test_labels)).tolist()

print(f"Number of misclassified examples: {len(misclassified_indices)}")

if len(misclassified_indices) > 0:
    # Take up to 10 misclassified examples
    samples = min(10, len(misclassified_indices))
    print(f"\nShowing {samples} misclassified examples:")

    for i in range(samples):
        idx = misclassified_indices[i]
        # Fetch the row once; the article text is stored in the 'sentence' column
        example = tecla_dataset["test"][idx]
        text = example["sentence"]
        true_label = example[target_label]
        # int(...) converts the numpy integer to the plain int used as dict key
        pred_label = id2label[int(test_pred_classes[idx])]

        print(f"\nExample {i+1}:")
        print(f"Text: {text[:100]}...")  # Show just the beginning of the text
        print(f"True label: {true_label}")
        print(f"Predicted label: {pred_label}")
        print("-" * 50)

## Model Interpretation

Let's look at the model's per-class confidence scores on a few random test examples to see how certain it is about each prediction.

In [None]:
from transformers import pipeline
import random

# Put the model in inference mode (disables dropout)
model.eval()

# Text-classification pipeline that returns the score of every class.
# top_k=None is the current way to request all scores (it replaces the
# deprecated return_all_scores=True).
nlp = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    top_k=None
)

# Pick a few test examples with a dedicated, seeded generator so the same
# examples are shown on every run.
test_split = tecla_dataset["test"]
rng = random.Random(42)
random_indices = rng.sample(range(len(test_split)), min(5, len(test_split)))

for idx in random_indices:
    example = test_split[idx]
    # The article text is stored in the 'sentence' column
    text = example["sentence"]
    true_label = example[target_label]

    # Truncate to the model's maximum input length; without this, articles
    # longer than 512 tokens make the forward pass fail.
    result = nlp(text, truncation=True, max_length=512)

    # Depending on the transformers version, a single input yields either
    # [[{...}, ...]] or [{...}, ...]; normalise to a flat list of dicts.
    candidates = result[0] if isinstance(result[0], list) else result

    # The model config carries id2label, so the pipeline already reports the
    # real class names; they are used directly as keys.
    scores = {item['label']: item['score'] for item in candidates}
    pred_label = max(scores, key=scores.get)

    print(f"\nExample:")
    print(f"Text: {text[:150]}...")  # Show beginning of text
    print(f"True label: {true_label}")
    print(f"Predicted label: {pred_label}")
    print("Confidence scores:")
    for label, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
        print(f"  {label}: {score:.4f}")
    print("-" * 50)

# Save the model and tokenizer for future use
model_save_path = "tecla_classifier_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

## Conclusion

We've successfully trained and evaluated a RoBERTa-based classification model for the TECLA dataset in Catalan. The model achieves good performance on the test set and can be used for Catalan text classification tasks.

Key observations:
1. The model's performance varies across different classes
2. Some classes are harder to distinguish than others
3. The model achieves good overall accuracy on the test set

Future improvements:
1. Try different pre-trained models specific to Catalan
2. Experiment with data augmentation for underrepresented classes
3. Try ensemble methods to improve performance
4. Use longer context windows for documents that might be truncated