In [None]:
!pip install transformers datasets evaluate seqeval
!git clone https://github.com/kmkurn/pytorch-crf.git
!pip install ./pytorch-crf
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

In [None]:
from datasets import load_dataset

# Fetch the Hindi ("hi") configuration of WikiANN and print the resulting splits
dataset = load_dataset(path="wikiann", name="hi")
print(dataset)

In [None]:
from google.colab import drive

# Make Google Drive visible inside the Colab VM
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Folder on Drive that the training cell writes model checkpoints into
drive_save_path = '/content/drive/My Drive/bert_crf_model'

In [None]:
import os
import shutil

import torch
import torch.nn as nn
from datasets import load_dataset
from seqeval.metrics import classification_report
from torch.utils.data import DataLoader
from torchcrf import CRF
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorForTokenClassification, TrainingArguments, get_scheduler

# Seed torch's global RNG so weight initialisation, dropout and DataLoader shuffling
# start from the same state on every run (GPU kernels may still be non-deterministic).
SEED = 42
torch.manual_seed(SEED)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Hindi ("hi") configuration of the WikiANN NER dataset
dataset = load_dataset("wikiann", "hi")

# Fraction of each split to keep after shuffling. 1.0 keeps every example;
# lower it (e.g. 0.3) for quicker experimentation.
SUBSET_FRACTION = 1.0
for split_name in ("train", "validation", "test"):
    shuffled_split = dataset[split_name].shuffle(seed=SEED)
    keep_count = int(len(shuffled_split) * SUBSET_FRACTION)
    dataset[split_name] = shuffled_split.select(range(keep_count))

# Get label names
label_list = dataset["train"].features["ner_tags"].feature.names  # e.g., O, B-PER, I-PER
num_labels = len(label_list)

# Load tokenizer
# NOTE(review): "bert-large-cased" is an English checkpoint while the data is Hindi; its
# vocabulary presumably covers Devanagari poorly. Consider a multilingual checkpoint - TODO confirm.
model_name = "bert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenization and label alignment
def tokenize_and_align_labels_with_crf(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to subword tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch (padded/truncated to 128 positions) with an
        added "labels" entry. Each label row carries the word's tag on the first subword
        of every word and -100 on special/padding positions and on the remaining subwords.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=row)
        # Pair every position with the word id of the position just before it
        # (None before the first position) to detect where a new word starts.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

# Tokenize every split in batches
tokenized_datasets = dataset.map(tokenize_and_align_labels_with_crf, batched=True)

# Per split: drop the raw text columns, then expose only the model inputs as torch tensors
raw_columns = ["langs", "spans", "tokens"]
model_columns = ["input_ids", "attention_mask", "labels"]
for split_name in ("train", "validation", "test"):
    tokenized_datasets[split_name] = tokenized_datasets[split_name].remove_columns(raw_columns)
    tokenized_datasets[split_name].set_format(type="torch", columns=model_columns)

# Batching: all loaders share the collator and batch size; only the training loader shuffles
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="pt")
loader_options = {"batch_size": 16, "collate_fn": data_collator}
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)
val_dataloader = DataLoader(tokenized_datasets["validation"], **loader_options)
test_dataloader = DataLoader(tokenized_datasets["test"], **loader_options)
# Define the model with CRF
class BertCRFNER(nn.Module):
    """Tagger: pretrained encoder -> dropout -> linear emission layer -> CRF.

    Args:
        model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of tags; sizes the linear layer and the CRF.
        dropout_prob: Dropout probability applied to the encoder output.
        ignore_index: Label value that marks positions without a gold tag.
        loss_reduction: ``reduction`` argument forwarded to the CRF when the
            log-likelihood is computed ("sum" adds up the sequences of a batch).
    """

    def __init__(self, model_name, num_labels, dropout_prob=0.5, ignore_index=-100, loss_reduction="sum"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.ignore_index = ignore_index
        self.loss_reduction = loss_reduction

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, otherwise the decoded tag ids.

        Args:
            input_ids: Token ids for the encoder.
            attention_mask: Marks the positions the encoder and the CRF take into account.
            labels: Optional gold tag ids; positions equal to ``ignore_index`` have no gold tag.

        Returns:
            With ``labels``: the negated CRF log-likelihood (a tensor).
            Without ``labels``: the result of ``CRF.decode`` - one list of tag ids per
            sequence, covering the positions selected by ``attention_mask``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(self.dropout(outputs.last_hidden_state))
        crf_mask = attention_mask.bool()

        if labels is None:
            return self.crf.decode(logits, mask=crf_mask)

        # The CRF needs a valid tag index at every unmasked position, so ignored positions
        # are rewritten to tag 0 on a copy (the caller's tensor is left untouched).
        # NOTE(review): any ignored position that is still inside attention_mask is therefore
        # scored as tag 0 (presumably "O") and contributes to the loss and to the learned
        # transitions. Restricting the CRF to labelled positions would need the same
        # position information at decode time - TODO revisit together with the callers.
        valid_labels = labels.clone()
        valid_labels[labels == self.ignore_index] = 0

        return -self.crf(logits, valid_labels, mask=crf_mask, reduction=self.loss_reduction)

# Initialize TrainingArguments
# In the code shown here no Trainer consumes this object; it acts as a container for the
# hyperparameters that the manual training loop reads.
# `evaluation_strategy` is deprecated in favour of `eval_strategy` (the old name emits a
# FutureWarning announcing its removal), so use whichever name this transformers install defines.
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in TrainingArguments.__dataclass_fields__ else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # Save only at the end of each epoch
    save_total_limit=1,  # Keep only the latest checkpoint
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir=None,  # No explicit logging directory
    logging_steps=50,  # Log only every 50 steps
    report_to="none",  # Disable reporting
    save_on_each_node=False,  # Avoid saving duplicates in distributed training
    **{eval_strategy_key: "epoch"},  # Evaluate once per epoch
)

# Extract Parameters
learning_rate = training_args.learning_rate
weight_decay = training_args.weight_decay
# range() and the step count below need an integer; the stored value may be a float
num_epochs = int(training_args.num_train_epochs)
batch_size = training_args.per_device_train_batch_size
output_dir = training_args.output_dir

# Initialize the model
improved_model = BertCRFNER(model_name=model_name, num_labels=num_labels)
improved_model.to(device)

# Freeze lower layers for initial training
for param in improved_model.bert.embeddings.parameters():
    param.requires_grad = False

for layer in improved_model.bert.encoder.layer[:6]:  # Freeze first 6 layers
    for param in layer.parameters():
        param.requires_grad = False

# Optimizer
# All parameters are registered (frozen ones simply receive no gradient), so layers
# unfrozen later are picked up without rebuilding the optimizer.
optimizer = torch.optim.AdamW(improved_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Linear decay of the learning rate over every optimizer step of the run, without warmup
total_steps = len(train_dataloader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)
def run_evaluation(model, dataloader, desc):
    """Score ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Module whose call returns the loss when ``labels`` is passed and a list of
            per-sequence tag-id lists when it is not.
        dataloader: Yields batches holding "input_ids", "attention_mask" and "labels".
        desc: Caption of the progress bar.

    Returns:
        Tuple ``(average loss per batch, gold tags, predicted tags)``. Both tag
        collections are lists with one list of tag strings per sequence, restricted to
        positions whose gold label is not -100.
    """
    model.eval()
    running_loss = 0.0
    gold_tags, predicted_tags = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # The model returns either the loss or the decoded tags, hence two calls.
            running_loss += model(input_ids, attention_mask, labels=labels).item()
            batch_predictions = model(input_ids, attention_mask)

            for pred_ids, gold_ids in zip(batch_predictions, labels.cpu().tolist()):
                # Keep prediction and gold tag together so both sequences always end up
                # with the same length, which the classification report requires.
                scored = [(p, g) for p, g in zip(pred_ids, gold_ids) if g != -100]
                predicted_tags.append([label_list[p] for p, _ in scored])
                gold_tags.append([label_list[g] for _, g in scored])

    return running_loss / len(dataloader), gold_tags, predicted_tags


# Training and Validation Loop
saved_checkpoints = []  # checkpoint directories written by this run, oldest first
for epoch in range(num_epochs):
    # Training Phase
    improved_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = improved_model(input_ids, attention_mask, labels=labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(improved_model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}: Training Loss = {avg_train_loss}")

    # Validation Phase
    avg_val_loss, decoded_labels, decoded_predictions = run_evaluation(
        improved_model, val_dataloader, "Evaluating"
    )
    print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss}")
    print(f"Classification Report for Epoch {epoch + 1}:")
    print(classification_report(decoded_labels, decoded_predictions))

    # Save Model and Tokenizer to Google Drive
    if training_args.save_strategy == "epoch":
        save_path = os.path.join(drive_save_path, f"epoch-{epoch + 1}")
        os.makedirs(save_path, exist_ok=True)

        # Save the model's state dictionary
        model_save_path = os.path.join(save_path, "pytorch_model.bin")
        torch.save(improved_model.state_dict(), model_save_path)

        # Save the tokenizer
        tokenizer.save_pretrained(save_path)

        print(f"Model and tokenizer saved at {save_path}")

        # Honour save_total_limit: drop the oldest checkpoints written by this run so a
        # full state dict per epoch does not pile up on Drive. Set the limit to None in
        # TrainingArguments to keep every epoch.
        saved_checkpoints.append(save_path)
        checkpoint_limit = training_args.save_total_limit
        if checkpoint_limit is not None and checkpoint_limit > 0:
            while len(saved_checkpoints) > checkpoint_limit:
                stale_path = saved_checkpoints.pop(0)
                try:
                    shutil.rmtree(stale_path)
                except OSError as error:
                    print(f"Could not remove old checkpoint {stale_path}: {error}")

# Testing the Model
print("Starting Testing Phase...")
avg_test_loss, decoded_test_labels, decoded_test_predictions = run_evaluation(
    improved_model, test_dataloader, "Testing"
)
print(f"Test Loss: {avg_test_loss}")

# Classification Report
print("Test Set Classification Report:")
print(classification_report(decoded_test_labels, decoded_test_predictions))


Using device: cuda
Map: 100%
 1000/1000 [00:00<00:00, 4164.85 examples/s]
/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(
Training Epoch 1: 100%|██████████| 313/313 [05:23<00:00,  1.03s/it]
Epoch 1: Training Loss = 98.74091459767887
Evaluating: 100%|██████████| 63/63 [00:50<00:00,  1.25it/s]
Epoch 1: Validation Loss = 61.85583732241676
Classification Report for Epoch 1:
              precision    recall  f1-score   support

         LOC       0.50      0.38      0.43       423
         ORG       0.33      0.62      0.43       369
         PER       0.60      0.61      0.60       434

   micro avg       0.45      0.53      0.48      1226
   macro avg       0.47      0.53      0.49      1226
weighted avg       0.48      0.53      0.49      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-1
Training Epoch 2: 100%|██████████| 313/313 [05:29<00:00,  1.05s/it]
Epoch 2: Training Loss = 58.32604721483712
Evaluating: 100%|██████████| 63/63 [00:49<00:00,  1.26it/s]
Epoch 2: Validation Loss = 46.512420412093874
Classification Report for Epoch 2:
              precision    recall  f1-score   support

         LOC       0.69      0.60      0.64       423
         ORG       0.45      0.63      0.53       369
         PER       0.68      0.77      0.72       434

   micro avg       0.60      0.67      0.63      1226
   macro avg       0.61      0.67      0.63      1226
weighted avg       0.62      0.67      0.64      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-2
Training Epoch 3: 100%|██████████| 313/313 [05:31<00:00,  1.06s/it]
Epoch 3: Training Loss = 41.98486045374276
Evaluating: 100%|██████████| 63/63 [00:51<00:00,  1.23it/s]
Epoch 3: Validation Loss = 44.532665827917675
Classification Report for Epoch 3:
              precision    recall  f1-score   support

         LOC       0.69      0.61      0.65       423
         ORG       0.52      0.66      0.58       369
         PER       0.72      0.80      0.75       434

   micro avg       0.64      0.69      0.66      1226
   macro avg       0.64      0.69      0.66      1226
weighted avg       0.65      0.69      0.66      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-3
Training Epoch 4: 100%|██████████| 313/313 [05:30<00:00,  1.06s/it]
Epoch 4: Training Loss = 32.92744721848363
Evaluating: 100%|██████████| 63/63 [00:49<00:00,  1.26it/s]
Epoch 4: Validation Loss = 43.91161467537047
Classification Report for Epoch 4:
              precision    recall  f1-score   support

         LOC       0.61      0.71      0.66       423
         ORG       0.58      0.57      0.57       369
         PER       0.71      0.78      0.74       434

   micro avg       0.64      0.69      0.66      1226
   macro avg       0.63      0.69      0.66      1226
weighted avg       0.64      0.69      0.66      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-4
Training Epoch 5: 100%|██████████| 313/313 [05:40<00:00,  1.09s/it]
Epoch 5: Training Loss = 24.613775880953757
Evaluating: 100%|██████████| 63/63 [00:50<00:00,  1.25it/s]
Epoch 5: Validation Loss = 46.61832667153979
Classification Report for Epoch 5:
              precision    recall  f1-score   support

         LOC       0.73      0.70      0.71       423
         ORG       0.65      0.70      0.68       369
         PER       0.78      0.81      0.79       434

   micro avg       0.72      0.74      0.73      1226
   macro avg       0.72      0.74      0.73      1226
weighted avg       0.72      0.74      0.73      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-5
Training Epoch 6: 100%|██████████| 313/313 [05:30<00:00,  1.06s/it]
Epoch 6: Training Loss = 19.596256414541422
Evaluating: 100%|██████████| 63/63 [00:51<00:00,  1.23it/s]
Epoch 6: Validation Loss = 47.64252913944305
Classification Report for Epoch 6:
              precision    recall  f1-score   support

         LOC       0.67      0.75      0.71       423
         ORG       0.63      0.72      0.67       369
         PER       0.82      0.81      0.81       434

   micro avg       0.70      0.76      0.73      1226
   macro avg       0.71      0.76      0.73      1226
weighted avg       0.71      0.76      0.73      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-6
Training Epoch 7: 100%|██████████| 313/313 [05:33<00:00,  1.07s/it]
Epoch 7: Training Loss = 15.189744675121368
Evaluating: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Epoch 7: Validation Loss = 51.164234645782955
Classification Report for Epoch 7:
              precision    recall  f1-score   support

         LOC       0.66      0.75      0.70       423
         ORG       0.70      0.70      0.70       369
         PER       0.78      0.83      0.81       434

   micro avg       0.71      0.77      0.74      1226
   macro avg       0.71      0.76      0.74      1226
weighted avg       0.72      0.77      0.74      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-7
Training Epoch 8: 100%|██████████| 313/313 [05:28<00:00,  1.05s/it]
Epoch 8: Training Loss = 12.168577663433819
Evaluating: 100%|██████████| 63/63 [00:51<00:00,  1.21it/s]
Epoch 8: Validation Loss = 54.782096590314595
Classification Report for Epoch 8:
              precision    recall  f1-score   support

         LOC       0.75      0.73      0.74       423
         ORG       0.64      0.69      0.66       369
         PER       0.77      0.84      0.80       434

   micro avg       0.72      0.76      0.74      1226
   macro avg       0.72      0.75      0.73      1226
weighted avg       0.72      0.76      0.74      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-8
Training Epoch 9: 100%|██████████| 313/313 [05:28<00:00,  1.05s/it]
Epoch 9: Training Loss = 10.0409609761101
Evaluating: 100%|██████████| 63/63 [00:50<00:00,  1.26it/s]
Epoch 9: Validation Loss = 54.340050803290474
Classification Report for Epoch 9:
              precision    recall  f1-score   support

         LOC       0.74      0.77      0.75       423
         ORG       0.71      0.69      0.70       369
         PER       0.77      0.86      0.82       434

   micro avg       0.74      0.78      0.76      1226
   macro avg       0.74      0.77      0.76      1226
weighted avg       0.74      0.78      0.76      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-9
Training Epoch 10: 100%|██████████| 313/313 [05:46<00:00,  1.11s/it]
Epoch 10: Training Loss = 8.320954709769056
Evaluating: 100%|██████████| 63/63 [00:52<00:00,  1.20it/s]
Epoch 10: Validation Loss = 56.66284682258727
Classification Report for Epoch 10:
              precision    recall  f1-score   support

         LOC       0.74      0.77      0.75       423
         ORG       0.70      0.74      0.72       369
         PER       0.81      0.84      0.82       434

   micro avg       0.75      0.78      0.77      1226
   macro avg       0.75      0.78      0.77      1226
weighted avg       0.75      0.78      0.77      1226

Model and tokenizer saved at /content/drive/My Drive/bert_crf_model/epoch-10
Starting Testing Phase...
Testing: 100%|██████████| 63/63 [00:50<00:00,  1.25it/s]Test Loss: 60.79998403882224
Test Set Classification Report:
              precision    recall  f1-score   support

         LOC       0.74      0.79      0.76       414
         ORG       0.68      0.78      0.73       363
         PER       0.80      0.77      0.79       450

   micro avg       0.74      0.78      0.76      1227
   macro avg       0.74      0.78      0.76      1227
weighted avg       0.75      0.78      0.76      1227