<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTaggerMuril.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==4.47.0 datasets seqeval scikit-learn


Collecting transformers==4.47.0
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets

In [None]:
from datasets import load_dataset
dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")  # Load POS task subset
train_data = dataset["train"]
test_data = dataset["test"]

In [3]:
dataset['train'][15683] # dataset['train'][i] the annotaion by all the annotators for the the sentence

{'Sentences': 'जैसलमेर - प्रधानमंत्री नरेंद्र मोदी (Narendra Modi) ने राजस्थान के जैसलमेर बॉर्डर पर लोंगेवाला पोस्ट पहुंच कर सेना के जवानों के साथ दिवाली (Diwali) का त्योहार मनाया।',
 'Predicted Tags': "[{'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': '-', 'entity': 'X'}, {'word': 'प्रधानमंत्री', 'entity': 'PROPN'}, {'word': 'नरेंद्र', 'entity': 'PROPN'}, {'word': 'मोदी', 'entity': 'PROPN'}, {'word': '(', 'entity': 'X'}, {'word': 'Narendra', 'entity': 'PROPN'}, {'word': 'Modi', 'entity': 'PROPN'}, {'word': ')', 'entity': 'X'}, {'word': 'ने', 'entity': 'VERB'}, {'word': 'राजस्थान', 'entity': 'PROPN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': 'बॉर्डर', 'entity': 'NOUN'}, {'word': 'पर', 'entity': 'ADP'}, {'word': 'लोंगेवाला', 'entity': 'ADJ'}, {'word': 'पोस्ट', 'entity': 'NOUN'}, {'word': 'पहुंच', 'entity': 'VERB'}, {'word': 'कर', 'entity': 'VERB'}, {'word': 'सेना', 'entity': 'NOUN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जवानों', 'entity': '

In [4]:
import ast  # safely parse string to Python object

def extract_tokens_and_tags(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}


In [None]:
# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


In [6]:
train_dataset = processed_data['train']
test_dataset = processed_data['test']

In [7]:
processed_data['train'][0]

{'tokens': ['Loan',
  'Apps',
  'की',
  'अब',
  'खैर',
  'नहीं',
  ',',
  'RBI',
  'ने',
  'बता',
  'दिया',
  'किस',
  '-',
  'किस',
  'पर',
  'होगी',
  'कार्रवाई',
  ',',
  'लिस्ट',
  'तैयार'],
 'tags': ['NOUN',
  'NOUN',
  'ADP',
  'ADV',
  'NOUN',
  'PART_NEG',
  'X',
  'PROPN',
  'ADP',
  'VERB',
  'VERB',
  'PRON_WH',
  'X',
  'PRON_WH',
  'ADP',
  'VERB',
  'NOUN',
  'X',
  'NOUN',
  'VERB']}

In [8]:
# Get all unique tags
unique_tags = list(set(tag for row in train_dataset for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "google/muril-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = test_dataset.map(tokenize_and_align_labels, batched=True)


In [11]:
# pip install --upgrade transformers


In [12]:
import transformers
print(transformers.__version__)


4.47.0


In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./muril-pos-results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50
)




In [14]:
print(transformers.__file__)

/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [15]:
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]
    print(classification_report(true_labels, true_predictions,digits = 4))
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }
print(1)

1


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    # evaluation_strategy="epoch",
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [17]:
print(f"Dataset size: {len(tokenized_train)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"Effective steps per epoch: {len(tokenized_train) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")


Dataset size: 15684
Batch size: 8
Gradient accumulation: 1
Effective steps per epoch: 1960


In [18]:
len(tokenized_train)

15684

In [19]:
trainer.train()
# in progress bar you will see th total number of batches i.e 5883



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrudratara8[0m ([33mrudratara8-indain-institute-of-science-education-and-res[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0492,0.994073,0.92956,0.887043
2,0.5667,0.53883,0.936049,0.896828
3,0.4383,0.444273,0.937681,0.900024




              precision    recall  f1-score   support

         ART       0.86      0.71      0.78       702
     ART_NEG       0.96      0.98      0.97       630
          DJ       0.78      0.74      0.76      5708
          DP       0.95      0.97      0.96     14361
          DV       0.82      0.80      0.81      3616
         ERB       0.88      0.87      0.87     10920
          ET       0.86      0.86      0.86      1930
         ONJ       0.94      0.92      0.93      2623
         OUN       0.84      0.85      0.85     19648
         RON       0.90      0.93      0.92      3326
      RON_WH       0.94      0.95      0.94       562
        ROPN       0.89      0.90      0.90      8717
          UM       0.92      0.94      0.93      2108
           _       0.95      0.94      0.95      9667

   micro avg       0.89      0.89      0.89     84518
   macro avg       0.89      0.88      0.89     84518
weighted avg       0.89      0.89      0.89     84518





              precision    recall  f1-score   support

         ART       0.85      0.78      0.81       702
     ART_NEG       0.98      0.98      0.98       630
          DJ       0.79      0.77      0.78      5708
          DP       0.96      0.97      0.96     14361
          DV       0.83      0.82      0.83      3616
         ERB       0.89      0.89      0.89     10920
          ET       0.87      0.89      0.88      1930
         ONJ       0.95      0.93      0.94      2623
         OUN       0.86      0.86      0.86     19648
         RON       0.93      0.92      0.93      3326
      RON_WH       0.96      0.95      0.95       562
        ROPN       0.89      0.91      0.90      8717
          UM       0.95      0.95      0.95      2108
           _       0.96      0.94      0.95      9667

   micro avg       0.90      0.90      0.90     84518
   macro avg       0.90      0.90      0.90     84518
weighted avg       0.90      0.90      0.90     84518





              precision    recall  f1-score   support

         ART       0.85      0.82      0.84       702
     ART_NEG       0.98      0.99      0.98       630
          DJ       0.79      0.77      0.78      5708
          DP       0.96      0.97      0.97     14361
          DV       0.83      0.84      0.83      3616
         ERB       0.89      0.89      0.89     10920
          ET       0.89      0.89      0.89      1930
         ONJ       0.94      0.94      0.94      2623
         OUN       0.86      0.86      0.86     19648
         RON       0.92      0.94      0.93      3326
      RON_WH       0.96      0.96      0.96       562
        ROPN       0.90      0.91      0.91      8717
          UM       0.95      0.94      0.95      2108
           _       0.96      0.94      0.95      9667

   micro avg       0.90      0.90      0.90     84518
   macro avg       0.91      0.90      0.90     84518
weighted avg       0.90      0.90      0.90     84518



TrainOutput(global_step=5883, training_loss=0.964534799886078, metrics={'train_runtime': 2032.4526, 'train_samples_per_second': 23.15, 'train_steps_per_second': 2.895, 'total_flos': 3073967211902976.0, 'train_loss': 0.964534799886078, 'epoch': 3.0})

In [20]:
metrics = trainer.evaluate(eval_dataset=tokenized_val)
print(metrics)


              precision    recall  f1-score   support

         ART       0.85      0.82      0.84       702
     ART_NEG       0.98      0.99      0.98       630
          DJ       0.79      0.77      0.78      5708
          DP       0.96      0.97      0.97     14361
          DV       0.83      0.84      0.83      3616
         ERB       0.89      0.89      0.89     10920
          ET       0.89      0.89      0.89      1930
         ONJ       0.94      0.94      0.94      2623
         OUN       0.86      0.86      0.86     19648
         RON       0.92      0.94      0.93      3326
      RON_WH       0.96      0.96      0.96       562
        ROPN       0.90      0.91      0.91      8717
          UM       0.95      0.94      0.95      2108
           _       0.96      0.94      0.95      9667

   micro avg       0.90      0.90      0.90     84518
   macro avg       0.91      0.90      0.90     84518
weighted avg       0.90      0.90      0.90     84518

{'eval_loss': 0.444273233

In [21]:
metrics

{'eval_loss': 0.4442732334136963,
 'eval_accuracy': 0.9376808328451034,
 'eval_f1': 0.9000236658383624,
 'eval_runtime': 37.9782,
 'eval_samples_per_second': 131.628,
 'eval_steps_per_second': 16.457,
 'epoch': 3.0}