<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTaggerMbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers==4.47.0 datasets seqeval scikit-learn




In [3]:
import ast  # safely parse string to Python object
import numpy as np
import transformers
from seqeval.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback  # <-- Add this import
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import Dataset, DatasetDict
import time
start = time.time()

2025-05-16 06:12:38.252639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747375958.276098     119 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747375958.283275     119 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def extract_tokens_and_tags(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]
    print(classification_report(true_labels,true_predictions,digits = 6))
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [5]:
# Load POS task subset

from datasets import load_dataset
dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")

# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


print(len(processed_data['train']))

# Assuming processed_data is your Dataset/DatasetDict
data = processed_data['train']

15684


In [6]:
# type(dataset['train'][15683]) # dataset['train'][i] the annotaion by all the annotators for the the sentence

In [8]:
# First split: 80% train, 20% temp
train_test_split = data.train_test_split(
    test_size=0.2,
    seed=42
)

# Second split: 10% val, 10% test
val_test_split = train_test_split['test'].train_test_split(
    test_size=0.5,
    seed=42
)

# Combine
processed_data = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})

In [9]:
print(len(processed_data['train']))
print(len(processed_data['validation']))
print(len(processed_data['test']))

12547
1568
1569


In [10]:
processed_data['train'][0]

{'tokens': ['नई',
  'दिल्ली',
  ':',
  'देश',
  'की',
  'प्रमुख',
  'दो',
  '-',
  'पहिया',
  'वाहन',
  'निर्माता',
  'कंपनी',
  'टीवीएस',
  'मोटर्स',
  '(',
  'TVS',
  'Motors',
  ')',
  'आज',
  'यानी',
  'सोमवार',
  'को',
  'Apache',
  'RR',
  '310',
  '2021',
  'को',
  'लॉन्च',
  'करने',
  'जा',
  'रही',
  'है',
  '।'],
 'tags': ['PROPN',
  'PROPN',
  'X',
  'NOUN',
  'ADP',
  'ADJ',
  'ADJ',
  'X',
  'ADJ',
  'NOUN',
  'NOUN',
  'NOUN',
  'PROPN',
  'PROPN',
  'X',
  'PROPN',
  'PROPN',
  'X',
  'ADV',
  'CONJ',
  'PROPN',
  'ADP',
  'NOUN',
  'NOUN',
  'NUM',
  'NUM',
  'ADP',
  'VERB',
  'VERB',
  'VERB',
  'VERB',
  'VERB',
  'X']}

In [13]:
# Get all unique tags
unique_tags = list(set(tag for row in data for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [16]:
print(len(id2tag))

14


In [17]:
# Tokenize

model_checkpoint = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)

tokenized_train = processed_data['train'].map(tokenize_and_align_labels, batched=True)
tokenized_val = processed_data['validation'].map(tokenize_and_align_labels, batched=True)
tokenized_test = processed_data['test'].map(tokenize_and_align_labels, batched=True)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1569 [00:00<?, ? examples/s]

In [21]:
tokenized_train

Dataset({
    features: ['tokens', 'tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 12547
})

In [22]:
# Create training arguements
training_args = TrainingArguments(
    output_dir="./mbert-pos-results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,  # <-- Important for early stopping
    metric_for_best_model="eval_loss",  # <-- Metric to monitor
    greater_is_better=False  # <-- For loss (lower is better)
)

# Create Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,  # Stop after 3 epochs without improvement
            early_stopping_threshold=0.01  # Minimum improvement to qualify as better
        )
    ],
    compute_metrics=compute_metrics

)


In [23]:
t = trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2986,0.270777,0.913363,0.862139
2,0.2227,0.241891,0.92207,0.874754
3,0.1715,0.235925,0.927496,0.883873
4,0.1531,0.227315,0.929152,0.885971
5,0.131,0.236944,0.931788,0.890079




              precision    recall  f1-score   support

         ART   0.906736  0.700000  0.790068       250
     ART_NEG   0.946078  0.969849  0.957816       199
          DJ   0.706806  0.749075  0.727327      1622
          DP   0.953528  0.957447  0.955483      4136
          DV   0.783446  0.748162  0.765397      1088
         ERB   0.860830  0.819596  0.839707      3215
          ET   0.849845  0.919598  0.883347       597
         ONJ   0.864796  0.946927  0.904000       716
         OUN   0.804731  0.815003  0.809834      5719
         RON   0.914567  0.905640  0.910082       922
      RON_WH   0.923423  0.923423  0.923423       222
        ROPN   0.822043  0.874533  0.847476      2678
          UM   0.916667  0.940663  0.928510       573
           _   0.950970  0.918882  0.934651      2934

   micro avg   0.859949  0.864340  0.862139     24871
   macro avg   0.871748  0.870628  0.869794     24871
weighted avg   0.861204  0.864340  0.862295     24871





              precision    recall  f1-score   support

         ART   0.900000  0.792000  0.842553       250
     ART_NEG   0.960591  0.979899  0.970149       199
          DJ   0.734487  0.758940  0.746513      1622
          DP   0.950012  0.964942  0.957419      4136
          DV   0.845098  0.792279  0.817837      1088
         ERB   0.829710  0.854743  0.842041      3215
          ET   0.876623  0.904523  0.890354       597
         ONJ   0.930801  0.958101  0.944253       716
         OUN   0.821049  0.810282  0.815630      5719
         RON   0.926327  0.927332  0.926829       922
      RON_WH   0.976303  0.927928  0.951501       222
        ROPN   0.859565  0.870799  0.865146      2678
          UM   0.918506  0.944154  0.931153       573
           _   0.957615  0.947171  0.952365      2934

   micro avg   0.873072  0.876442  0.874754     24871
   macro avg   0.891906  0.888078  0.889553     24871
weighted avg   0.873217  0.876442  0.874674     24871





              precision    recall  f1-score   support

         ART   0.890830  0.816000  0.851775       250
     ART_NEG   0.960396  0.974874  0.967581       199
          DJ   0.756571  0.745376  0.750932      1622
          DP   0.956501  0.967602  0.962019      4136
          DV   0.813590  0.814338  0.813964      1088
         ERB   0.866875  0.862830  0.864848      3215
          ET   0.858034  0.921273  0.888530       597
         ONJ   0.945455  0.944134  0.944794       716
         OUN   0.838595  0.826718  0.832614      5719
         RON   0.925244  0.926247  0.925745       922
      RON_WH   0.933333  0.945946  0.939597       222
        ROPN   0.861483  0.889470  0.875253      2678
          UM   0.920339  0.947644  0.933792       573
           _   0.954592  0.952965  0.953778      2934

   micro avg   0.882863  0.884886  0.883873     24871
   macro avg   0.891560  0.895387  0.893230     24871
weighted avg   0.882547  0.884886  0.883625     24871





              precision    recall  f1-score   support

         ART   0.892704  0.832000  0.861284       250
     ART_NEG   0.965517  0.984925  0.975124       199
          DJ   0.742925  0.776819  0.759494      1622
          DP   0.956730  0.967602  0.962135      4136
          DV   0.876161  0.780331  0.825474      1088
         ERB   0.872254  0.864386  0.868302      3215
          ET   0.852535  0.929648  0.889423       597
         ONJ   0.934316  0.973464  0.953488       716
         OUN   0.832278  0.827767  0.830017      5719
         RON   0.939866  0.915401  0.927473       922
      RON_WH   0.938053  0.954955  0.946429       222
        ROPN   0.858785  0.892457  0.875298      2678
          UM   0.949212  0.945899  0.947552       573
           _   0.963781  0.952284  0.957998      2934

   micro avg   0.884887  0.887057  0.885971     24871
   macro avg   0.898223  0.899853  0.898535     24871
weighted avg   0.885325  0.887057  0.885928     24871





              precision    recall  f1-score   support

         ART   0.894068  0.844000  0.868313       250
     ART_NEG   0.960784  0.984925  0.972705       199
          DJ   0.750149  0.775586  0.762655      1622
          DP   0.956740  0.967843  0.962260      4136
          DV   0.844744  0.805147  0.824471      1088
         ERB   0.876524  0.872162  0.874337      3215
          ET   0.839525  0.946399  0.889764       597
         ONJ   0.936997  0.976257  0.956224       716
         OUN   0.842059  0.835286  0.838659      5719
         RON   0.944383  0.920824  0.932455       922
      RON_WH   0.942478  0.959459  0.950893       222
        ROPN   0.876372  0.894698  0.885440      2678
          UM   0.942408  0.942408  0.942408       573
           _   0.961433  0.951602  0.956492      2934

   micro avg   0.888404  0.891761  0.890079     24871
   macro avg   0.897762  0.905471  0.901220     24871
weighted avg   0.888632  0.891761  0.890037     24871



In [27]:
metrics = trainer.evaluate(eval_dataset=tokenized_test)
print(metrics)


              precision    recall  f1-score   support

         ART   0.823256  0.793722  0.808219       223
     ART_NEG   0.980000  0.956098  0.967901       205
          DJ   0.759165  0.788056  0.773341      1708
          DP   0.964504  0.969250  0.966871      4065
          DV   0.860656  0.822723  0.841262      1021
         ERB   0.886510  0.886510  0.886510      3269
          ET   0.876423  0.912014  0.893864       591
         ONJ   0.942159  0.968296  0.955049       757
         OUN   0.843154  0.834545  0.838828      5778
         RON   0.928042  0.923158  0.925594       950
      RON_WH   0.937198  0.946341  0.941748       205
        ROPN   0.864071  0.891923  0.877776      2637
          UM   0.929688  0.964344  0.946698       617
           _   0.969541  0.951310  0.960339      2978

   micro avg   0.891327  0.893537  0.892431     25004
   macro avg   0.897455  0.900592  0.898857     25004
weighted avg   0.891647  0.893537  0.892491     25004

{'eval_loss': 0.216725647

In [28]:
metrics

{'eval_loss': 0.2167256474494934,
 'eval_accuracy': 0.9331997327989312,
 'eval_f1': 0.8924305971639704,
 'eval_runtime': 10.1546,
 'eval_samples_per_second': 154.512,
 'eval_steps_per_second': 4.924,
 'epoch': 5.0}

In [29]:
end = time.time()
print(f"Time taken : {(end-start)/60} mins")

Time taken : 31.362045045693716 mins
