<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTaggerMbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.47.0 datasets seqeval scikit-learn


Collecting transformers==4.47.0
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.5 kB[0m [31m79.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.5 kB[0m [31m79.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m372.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloadi

In [None]:
import ast  # safely parse string to Python object
import numpy as np
import transformers
from seqeval.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback  # <-- Add this import



In [None]:
def extract_tokens_and_tags(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]
    print(classification_report(true_labels,true_predictions,digits = 6))
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [None]:
# Load POS task subset

from datasets import load_dataset
dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")

# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


print(len(processed_data['train']))

# Assuming processed_data is your Dataset/DatasetDict
data = processed_data['train']

In [None]:
# type(dataset['train'][15683]) # dataset['train'][i] the annotaion by all the annotators for the the sentence

{'Sentences': 'जैसलमेर - प्रधानमंत्री नरेंद्र मोदी (Narendra Modi) ने राजस्थान के जैसलमेर बॉर्डर पर लोंगेवाला पोस्ट पहुंच कर सेना के जवानों के साथ दिवाली (Diwali) का त्योहार मनाया।',
 'Predicted Tags': "[{'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': '-', 'entity': 'X'}, {'word': 'प्रधानमंत्री', 'entity': 'PROPN'}, {'word': 'नरेंद्र', 'entity': 'PROPN'}, {'word': 'मोदी', 'entity': 'PROPN'}, {'word': '(', 'entity': 'X'}, {'word': 'Narendra', 'entity': 'PROPN'}, {'word': 'Modi', 'entity': 'PROPN'}, {'word': ')', 'entity': 'X'}, {'word': 'ने', 'entity': 'VERB'}, {'word': 'राजस्थान', 'entity': 'PROPN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': 'बॉर्डर', 'entity': 'NOUN'}, {'word': 'पर', 'entity': 'ADP'}, {'word': 'लोंगेवाला', 'entity': 'ADJ'}, {'word': 'पोस्ट', 'entity': 'NOUN'}, {'word': 'पहुंच', 'entity': 'VERB'}, {'word': 'कर', 'entity': 'VERB'}, {'word': 'सेना', 'entity': 'NOUN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जवानों', 'entity': '

In [None]:
# First split: 80% train, 20% temp (will become val+test)
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,  # 20% for val+test
    random_state=42
)

# Second split: 50% of temp becomes val (10% of total), other 50% test (10% of total)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,  # Split the 20% equally
    random_state=42
)

# Create new DatasetDict with splits
processed_data = {
    'train': train_data,
    'validation': val_data,
    'test': test_data
}

In [None]:
print(len(processed_data['train']))
print(len(processed_data['validation']))
print(len(processed_data['test']))

In [None]:
processed_data['train'][0]

{'tokens': ['Loan',
  'Apps',
  'की',
  'अब',
  'खैर',
  'नहीं',
  ',',
  'RBI',
  'ने',
  'बता',
  'दिया',
  'किस',
  '-',
  'किस',
  'पर',
  'होगी',
  'कार्रवाई',
  ',',
  'लिस्ट',
  'तैयार'],
 'tags': ['NOUN',
  'NOUN',
  'ADP',
  'ADV',
  'NOUN',
  'PART_NEG',
  'X',
  'PROPN',
  'ADP',
  'VERB',
  'VERB',
  'PRON_WH',
  'X',
  'PRON_WH',
  'ADP',
  'VERB',
  'NOUN',
  'X',
  'NOUN',
  'VERB']}

In [None]:
# Tokenize
tokenized_train = processed_data['train'].map(tokenize_and_align_labels, batched=True)
tokenized_val = processed_data['validation'].map(tokenize_and_align_labels, batched=True)
tokenized_test = processed_data['test'].map(tokenize_and_align_labels, batched=True)

In [None]:
# Get all unique tags
unique_tags = list(set(tag for row in train_dataset for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [None]:
# Create training arguements
training_args = TrainingArguments(
    output_dir="./mbert-pos-results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,  # <-- Important for early stopping
    metric_for_best_model="eval_loss",  # <-- Metric to monitor
    greater_is_better=False  # <-- For loss (lower is better)
)

# Create Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_val["validation"],
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,  # Stop after 3 epochs without improvement
            early_stopping_threshold=0.01  # Minimum improvement to qualify as better
        )
    ],
    compute_metrics=compute_metrics

)


In [None]:
t = trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2429,0.218512,0.928498,0.885708
2,0.1847,0.204626,0.933934,0.89439
3,0.1371,0.204165,0.936494,0.897823




              precision    recall  f1-score   support

         ART     0.8250    0.8462    0.8354       702
     ART_NEG     0.9792    0.9714    0.9753       630
          DJ     0.7724    0.7642    0.7683      5708
          DP     0.9618    0.9652    0.9635     14361
          DV     0.8163    0.8136    0.8150      3616
         ERB     0.8860    0.8556    0.8705     10920
          ET     0.8739    0.8907    0.8822      1930
         ONJ     0.9629    0.9306    0.9465      2623
         OUN     0.8269    0.8514    0.8390     19648
         RON     0.9223    0.9203    0.9213      3326
      RON_WH     0.9398    0.9448    0.9423       562
        ROPN     0.8726    0.8771    0.8749      8717
          UM     0.8991    0.9345    0.9165      2108
           _     0.9546    0.9430    0.9487      9667

   micro avg     0.8849    0.8865    0.8857     84518
   macro avg     0.8924    0.8935    0.8928     84518
weighted avg     0.8854    0.8865    0.8858     84518





              precision    recall  f1-score   support

         ART     0.8708    0.8447    0.8576       702
     ART_NEG     0.9811    0.9873    0.9842       630
          DJ     0.7648    0.7829    0.7738      5708
          DP     0.9639    0.9712    0.9675     14361
          DV     0.8725    0.7890    0.8286      3616
         ERB     0.8881    0.8784    0.8832     10920
          ET     0.8677    0.9073    0.8870      1930
         ONJ     0.9592    0.9405    0.9498      2623
         OUN     0.8499    0.8471    0.8485     19648
         RON     0.9283    0.9221    0.9252      3326
      RON_WH     0.9423    0.9591    0.9506       562
        ROPN     0.8589    0.9032    0.8805      8717
          UM     0.9528    0.9573    0.9550      2108
           _     0.9576    0.9539    0.9557      9667

   micro avg     0.8936    0.8951    0.8944     84518
   macro avg     0.9041    0.9031    0.9034     84518
weighted avg     0.8939    0.8951    0.8943     84518





              precision    recall  f1-score   support

         ART     0.8393    0.8704    0.8545       702
     ART_NEG     0.9779    0.9841    0.9810       630
          DJ     0.7881    0.7756    0.7818      5708
          DP     0.9647    0.9722    0.9684     14361
          DV     0.8527    0.8230    0.8376      3616
         ERB     0.8864    0.8807    0.8836     10920
          ET     0.8707    0.9104    0.8901      1930
         ONJ     0.9598    0.9462    0.9530      2623
         OUN     0.8515    0.8545    0.8530     19648
         RON     0.9240    0.9278    0.9259      3326
      RON_WH     0.9397    0.9698    0.9545       562
        ROPN     0.8808    0.9001    0.8903      8717
          UM     0.9578    0.9464    0.9520      2108
           _     0.9597    0.9537    0.9567      9667

   micro avg     0.8972    0.8984    0.8978     84518
   macro avg     0.9038    0.9082    0.9059     84518
weighted avg     0.8971    0.8984    0.8977     84518



In [None]:
metrics = trainer.evaluate(eval_dataset=tokenized_val)
print(metrics)


              precision    recall  f1-score   support

         ART     0.8393    0.8704    0.8545       702
     ART_NEG     0.9779    0.9841    0.9810       630
          DJ     0.7881    0.7756    0.7818      5708
          DP     0.9647    0.9722    0.9684     14361
          DV     0.8527    0.8230    0.8376      3616
         ERB     0.8864    0.8807    0.8836     10920
          ET     0.8707    0.9104    0.8901      1930
         ONJ     0.9598    0.9462    0.9530      2623
         OUN     0.8515    0.8545    0.8530     19648
         RON     0.9240    0.9278    0.9259      3326
      RON_WH     0.9397    0.9698    0.9545       562
        ROPN     0.8808    0.9001    0.8903      8717
          UM     0.9578    0.9464    0.9520      2108
           _     0.9597    0.9537    0.9567      9667

   micro avg     0.8972    0.8984    0.8978     84518
   macro avg     0.9038    0.9082    0.9059     84518
weighted avg     0.8971    0.8984    0.8977     84518

{'eval_loss': 0.204165145

In [None]:
metrics

{'eval_loss': 0.20416514575481415,
 'eval_accuracy': 0.9364937710710873,
 'eval_f1': 0.8978226040071654,
 'eval_runtime': 38.2009,
 'eval_samples_per_second': 130.861,
 'eval_steps_per_second': 16.361,
 'epoch': 3.0}