In [None]:
!pip install transformers datasets evaluate seqeval
!git clone https://github.com/kmkurn/pytorch-crf.git
!pip install ./pytorch-crf

In [None]:
import os
import shutil

import torch
import torch.nn as nn
from datasets import load_dataset
from seqeval.metrics import classification_report
from torch.utils.data import DataLoader
from torchcrf import CRF
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorForTokenClassification, TrainingArguments, get_scheduler

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")

# Fraction of each shuffled split to keep. 1.0 keeps every example (the
# current behaviour); lower it (e.g. 0.3) for quicker experimentation.
SUBSET_FRACTION = 1.0

# Shuffle each split with a fixed seed and keep its leading SUBSET_FRACTION
for split_name in ("train", "validation", "test"):
    split = dataset[split_name]
    keep = int(len(split) * SUBSET_FRACTION)
    dataset[split_name] = split.shuffle(seed=42).select(range(keep))

# Get label names
label_list = dataset["train"].features["ner_tags"].feature.names  # e.g., O, B-PER, I-PER
num_labels = len(label_list)

# Load tokenizer
model_name = "bert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization and label alignment
def tokenize_and_align_labels_with_crf(examples):
    """Tokenize pre-split words and attach one label id per produced token.

    Args:
        examples: Batch mapping with a "tokens" entry (word lists) and a
            "ner_tags" entry (one tag id per word).

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        "labels" entry. The first sub-token of every word carries that word's
        tag id; every other position (tokens with no word id, and the
        remaining sub-tokens of a word) carries -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        current_ids = encoded.word_ids(batch_index=row)
        # Shifted copy: position k holds the word id seen at position k - 1
        preceding_ids = [None] + current_ids[:-1]
        aligned_labels.append([
            word_tags[cur] if cur is not None and cur != prev else -100
            for cur, prev in zip(current_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

# Tokenize every split and attach the aligned labels
tokenized_datasets = dataset.map(tokenize_and_align_labels_with_crf, batched=True)

# Per split: drop the columns the model does not consume, then expose the
# remaining model inputs as torch tensors
unused_columns = ["id", "pos_tags", "chunk_tags", "tokens"]
tensor_columns = ["input_ids", "attention_mask", "labels"]
for split_name in ("train", "validation", "test"):
    tokenized_datasets[split_name] = tokenized_datasets[split_name].remove_columns(unused_columns)
    tokenized_datasets[split_name].set_format(type="torch", columns=tensor_columns)

# DataLoader setup: all loaders share the batch size and collator; only the
# training loader shuffles
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="pt")
loader_kwargs = {"batch_size": 16, "collate_fn": data_collator}
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)
test_dataloader = DataLoader(tokenized_datasets["test"], **loader_kwargs)

# Define the model with CRF
class BertCRFNER(nn.Module):
    """Pretrained encoder, dropout, a linear scoring layer and a CRF on top.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        num_labels: Number of tag classes scored by the linear layer and CRF.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else decoded tags.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask passed to the encoder and, cast to bool,
                used as the CRF mask.
            labels: Optional target tag ids; -100 marks ignored positions.

        Returns:
            With ``labels``: the negated output of the CRF layer
            (NOTE(review): per the pytorch-crf docs this is a log-likelihood,
            summed over the batch under the default reduction; confirm).
            Without ``labels``: the result of ``CRF.decode`` for the batch.
        """
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        emissions = self.classifier(self.dropout(hidden_states))
        token_mask = attention_mask.bool()

        if labels is None:
            return self.crf.decode(emissions, mask=token_mask)

        # The CRF needs a valid class index everywhere, so ignored positions
        # (-100) are mapped to index 0. Those positions still count toward the
        # loss wherever the attention mask is on.
        targets = labels.masked_fill(labels == -100, 0)
        return -self.crf(emissions, targets, mask=token_mask)

# Initialize TrainingArguments
# Used here as a hyperparameter container: the values are read back below and
# by the manual training loop (no `Trainer` is built in the code shown here).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate once per epoch (`evaluation_strategy` is the deprecated name)
    save_strategy="epoch",  # Save only at the end of each epoch
    save_total_limit=1,  # Keep only the latest checkpoint
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir=None,  # Disable logging directory
    logging_steps=50,  # Log only every 50 steps
    report_to="none",  # Disable reporting
    save_on_each_node=False,  # Avoid saving duplicates in distributed training
)

# Extract Parameters
learning_rate = training_args.learning_rate
weight_decay = training_args.weight_decay
# `num_train_epochs` is declared as a float on TrainingArguments; coerce it so
# it can drive `range()` and the scheduler step count.
num_epochs = int(training_args.num_train_epochs)
batch_size = training_args.per_device_train_batch_size
output_dir = training_args.output_dir

# Initialize the model
improved_model = BertCRFNER(model_name=model_name, num_labels=num_labels)
improved_model.to(device)

# Freeze lower layers for initial training
for param in improved_model.bert.embeddings.parameters():
    param.requires_grad = False

for layer in improved_model.bert.encoder.layer[:6]:  # Freeze first 6 layers
    for param in layer.parameters():
        param.requires_grad = False

# Optimizer
# All parameters are registered, including the frozen ones; those have
# requires_grad=False and therefore receive no gradients.
optimizer = torch.optim.AdamW(improved_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Linear decay with no warmup, stepped once per training batch
total_steps = len(train_dataloader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training and Validation Loop
# Each epoch: one optimisation pass over the training set, one evaluation pass
# over the validation set (loss + seqeval report), then a checkpoint.

# Checkpoint directories written during this run, oldest first. Used to honor
# `training_args.save_total_limit`; directories from earlier runs are never touched.
saved_checkpoints = []

for epoch in range(num_epochs):
    # Training Phase
    improved_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = improved_model(input_ids, attention_mask, labels=labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(improved_model.parameters(), max_norm=1.0)  # Optional gradient clipping
        optimizer.step()
        scheduler.step()  # The schedule is defined per batch, not per epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}: Training Loss = {avg_train_loss}")

    # Validation Phase
    improved_model.eval()
    val_loss = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # The model returns either the loss or the decoded tags, so two
            # forward passes are needed to obtain both.
            loss = improved_model(input_ids, attention_mask, labels=labels)
            val_loss += loss.item()

            preds = improved_model(input_ids, attention_mask)
            predictions.extend(preds)
            true_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss}")

    # Keep only positions whose gold label is not -100 and map ids to tag
    # names. `zip` stops at the shorter sequence, so label positions beyond
    # the end of a decoded sequence are never paired with a prediction.
    decoded_predictions = []
    decoded_labels = []
    for preds, labels in zip(predictions, true_labels):
        decoded_predictions.append([label_list[p] for p, l in zip(preds, labels) if l != -100])
        decoded_labels.append([label_list[l] for l in labels if l != -100])

    print(f"Classification Report for Epoch {epoch + 1}:")
    print(classification_report(decoded_labels, decoded_predictions))

    # Save Model at the End of Each Epoch
    if training_args.save_strategy == "epoch":
        save_path = f"{output_dir}/epoch-{epoch + 1}"
        os.makedirs(save_path, exist_ok=True)

        # Save the model's state dictionary
        model_save_path = os.path.join(save_path, "pytorch_model.bin")
        torch.save(improved_model.state_dict(), model_save_path)

        # Save the tokenizer
        tokenizer.save_pretrained(save_path)

        print(f"Model and tokenizer saved at {save_path}")

        # Enforce `save_total_limit`: the new checkpoint is written first, then
        # the oldest ones from this run are removed until the limit is met.
        saved_checkpoints.append(save_path)
        checkpoint_limit = training_args.save_total_limit
        if checkpoint_limit is not None and checkpoint_limit > 0:
            while len(saved_checkpoints) > checkpoint_limit:
                shutil.rmtree(saved_checkpoints.pop(0), ignore_errors=True)

# Final evaluation on the held-out test split
print("Starting Testing Phase...")
improved_model.eval()
test_loss = 0
test_predictions, test_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )

        # First pass (with labels) yields the loss
        loss = improved_model(input_ids, attention_mask, labels=labels)
        test_loss += loss.item()

        # Second pass (without labels) yields the decoded tag sequences
        test_predictions.extend(improved_model(input_ids, attention_mask))
        test_labels.extend(labels.cpu().numpy().tolist())

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

# Turn ids into tag names, keeping only positions whose gold label is not -100
decoded_test_predictions = []
decoded_test_labels = []
for predicted_ids, gold_ids in zip(test_predictions, test_labels):
    decoded_test_predictions.append(
        [label_list[pred] for pred, gold in zip(predicted_ids, gold_ids) if gold != -100]
    )
    decoded_test_labels.append([label_list[gold] for gold in gold_ids if gold != -100])

# Entity-level report from seqeval
print("Test Set Classification Report:")
print(classification_report(decoded_test_labels, decoded_test_predictions))


Using device: cuda
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Downloading data: 100%
 983k/983k [00:00<00:00, 20.4MB/s]
Generating train split: 100%
 14041/14041 [00:05<00:00, 2489.13 examples/s]
Generating validation split: 100%
 3250/3250 [00:01<00:00, 3003.64 examples/s]
Generating test split: 100%
 3453/3453 [00:00<00:00, 4578.28 examples/s]
tokenizer_config.json: 100%
 49.0/49.0 [00:00<00:00, 2.46kB/s]
config.json: 100%
 762/762 [00:00<00:00, 58.2kB/s]
vocab.txt: 100%
 213k/213k [00:00<00:00, 865kB/s]
tokenizer.json: 100%
 436k/436k [00:00<00:00, 1.74MB/s]
Map: 100%
 14041/14041 [00:04<00:00, 2904.87 examples/s]
Map: 100%
 3250/3250 [00:00<00:00, 3545.38 examples/s]
Map: 100%
 3453/3453 [00:01<00:00, 2598.57 examples/s]
/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
model.safetensors: 100%
 1.34G/1.34G [00:11<00:00, 152MB/s]
Training Epoch 1: 100%|██████████| 878/878 [15:21<00:00,  1.05s/it]
Epoch 1: Training Loss = 23.584132590978182
Evaluating: 100%|██████████| 204/204 [02:45<00:00,  1.23it/s]
Epoch 1: Validation Loss = 9.506981639301076
Classification Report for Epoch 1:
              precision    recall  f1-score   support

         LOC       0.96      0.97      0.97      1837
        MISC       0.86      0.89      0.87       922
         ORG       0.93      0.94      0.93      1341
         PER       0.98      0.98      0.98      1836

   micro avg       0.94      0.95      0.95      5936
   macro avg       0.93      0.94      0.94      5936
weighted avg       0.94      0.95      0.95      5936

Model and tokenizer saved at ./results/epoch-1
Training Epoch 2: 100%|██████████| 878/878 [15:28<00:00,  1.06s/it]
Epoch 2: Training Loss = 8.143380171617235
Evaluating: 100%|██████████| 204/204 [02:45<00:00,  1.23it/s]
Epoch 2: Validation Loss = 11.9276491800944
Classification Report for Epoch 2:
              precision    recall  f1-score   support

         LOC       0.97      0.96      0.97      1837
        MISC       0.88      0.91      0.89       922
         ORG       0.91      0.94      0.93      1341
         PER       0.98      0.97      0.98      1836

   micro avg       0.94      0.95      0.95      5936
   macro avg       0.93      0.95      0.94      5936
weighted avg       0.95      0.95      0.95      5936

Model and tokenizer saved at ./results/epoch-2
Training Epoch 3: 100%|██████████| 878/878 [15:25<00:00,  1.05s/it]
Epoch 3: Training Loss = 5.313750869863941
Evaluating: 100%|██████████| 204/204 [02:45<00:00,  1.23it/s]
Epoch 3: Validation Loss = 14.283800209269804
Classification Report for Epoch 3:
              precision    recall  f1-score   support

         LOC       0.97      0.97      0.97      1837
        MISC       0.92      0.92      0.92       922
         ORG       0.94      0.94      0.94      1341
         PER       0.98      0.98      0.98      1836

   micro avg       0.96      0.96      0.96      5936
   macro avg       0.95      0.96      0.95      5936
weighted avg       0.96      0.96      0.96      5936

Model and tokenizer saved at ./results/epoch-3
Training Epoch 4: 100%|██████████| 878/878 [15:25<00:00,  1.05s/it]
Epoch 4: Training Loss = 3.1521896631679667
Evaluating: 100%|██████████| 204/204 [02:45<00:00,  1.23it/s]
Epoch 4: Validation Loss = 14.71805798773672
Classification Report for Epoch 4:
              precision    recall  f1-score   support

         LOC       0.98      0.97      0.98      1837
        MISC       0.91      0.93      0.92       922
         ORG       0.94      0.95      0.94      1341
         PER       0.97      0.99      0.98      1836

   micro avg       0.96      0.96      0.96      5936
   macro avg       0.95      0.96      0.95      5936
weighted avg       0.96      0.96      0.96      5936

Model and tokenizer saved at ./results/epoch-4
Training Epoch 5: 100%|██████████| 878/878 [15:23<00:00,  1.05s/it]
Epoch 5: Training Loss = 1.9186030715907625
Evaluating: 100%|██████████| 204/204 [02:45<00:00,  1.23it/s]
Epoch 5: Validation Loss = 15.730226301679425
Classification Report for Epoch 5:
              precision    recall  f1-score   support

         LOC       0.97      0.97      0.97      1837
        MISC       0.91      0.93      0.92       922
         ORG       0.93      0.95      0.94      1341
         PER       0.98      0.98      0.98      1836

   micro avg       0.96      0.96      0.96      5936
   macro avg       0.95      0.96      0.95      5936
weighted avg       0.96      0.96      0.96      5936

Model and tokenizer saved at ./results/epoch-5
Starting Testing Phase...
Testing: 100%|██████████| 216/216 [02:55<00:00,  1.23it/s]
Test Loss: 36.4662637533965
Test Set Classification Report:
              precision    recall  f1-score   support

         LOC       0.93      0.93      0.93      1666
        MISC       0.79      0.83      0.81       702
         ORG       0.88      0.91      0.90      1661
         PER       0.97      0.97      0.97      1615

   micro avg       0.91      0.92      0.92      5644
   macro avg       0.89      0.91      0.90      5644
weighted avg       0.91      0.92      0.92      5644
