<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTaggerMuril.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets seqeval scikit-learn


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (4

In [None]:
from datasets import load_dataset
dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")  # Load POS task subset
train_data = dataset["train"]
test_data = dataset["test"]

In [28]:
dataset['train'][15683] # dataset['train'][i] the annotaion by all the annotators for the the sentence

{'Sentences': 'जैसलमेर - प्रधानमंत्री नरेंद्र मोदी (Narendra Modi) ने राजस्थान के जैसलमेर बॉर्डर पर लोंगेवाला पोस्ट पहुंच कर सेना के जवानों के साथ दिवाली (Diwali) का त्योहार मनाया।',
 'Predicted Tags': "[{'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': '-', 'entity': 'X'}, {'word': 'प्रधानमंत्री', 'entity': 'PROPN'}, {'word': 'नरेंद्र', 'entity': 'PROPN'}, {'word': 'मोदी', 'entity': 'PROPN'}, {'word': '(', 'entity': 'X'}, {'word': 'Narendra', 'entity': 'PROPN'}, {'word': 'Modi', 'entity': 'PROPN'}, {'word': ')', 'entity': 'X'}, {'word': 'ने', 'entity': 'VERB'}, {'word': 'राजस्थान', 'entity': 'PROPN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': 'बॉर्डर', 'entity': 'NOUN'}, {'word': 'पर', 'entity': 'ADP'}, {'word': 'लोंगेवाला', 'entity': 'ADJ'}, {'word': 'पोस्ट', 'entity': 'NOUN'}, {'word': 'पहुंच', 'entity': 'VERB'}, {'word': 'कर', 'entity': 'VERB'}, {'word': 'सेना', 'entity': 'NOUN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जवानों', 'entity': '

In [4]:
import ast  # safely parse string to Python object

def extract_tokens_and_tags(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}


In [None]:
# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


In [6]:
train_dataset = processed_data['train']
test_dataset = processed_data['test']

In [7]:
processed_data['train'][0]

{'tokens': ['Loan',
  'Apps',
  'की',
  'अब',
  'खैर',
  'नहीं',
  ',',
  'RBI',
  'ने',
  'बता',
  'दिया',
  'किस',
  '-',
  'किस',
  'पर',
  'होगी',
  'कार्रवाई',
  ',',
  'लिस्ट',
  'तैयार'],
 'tags': ['NOUN',
  'NOUN',
  'ADP',
  'ADV',
  'NOUN',
  'PART_NEG',
  'X',
  'PROPN',
  'ADP',
  'VERB',
  'VERB',
  'PRON_WH',
  'X',
  'PRON_WH',
  'ADP',
  'VERB',
  'NOUN',
  'X',
  'NOUN',
  'VERB']}

In [8]:
# Get all unique tags
unique_tags = list(set(tag for row in train_dataset for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "google/muril-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = test_dataset.map(tokenize_and_align_labels, batched=True)


In [11]:
pip install --upgrade transformers


[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [12]:
import transformers
print(transformers.__version__)


4.51.3


In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./muril-pos-results",
    # evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50
)


In [14]:
print(transformers.__file__)

/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [15]:
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    evaluation_strategy="epoch",
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [17]:
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrudratara8[0m ([33mrudratara8-indain-institute-of-science-education-and-res[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,2.5936
100,2.4977
150,2.4292
200,2.3763
250,2.3286
300,2.2824
350,2.234
400,2.1891
450,2.1458
500,2.0929


TrainOutput(global_step=5883, training_loss=0.962623565229007, metrics={'train_runtime': 1783.0845, 'train_samples_per_second': 26.388, 'train_steps_per_second': 3.299, 'total_flos': 3073967211902976.0, 'train_loss': 0.962623565229007, 'epoch': 3.0})

In [None]:
metrics = trainer.evaluate(eval_dataset=tokenized_val)
print(metrics)


In [19]:
metrics

{'eval_loss': 0.44218793511390686,
 'eval_accuracy': 0.9372121321375456,
 'eval_f1': 0.8991497695443545,
 'eval_runtime': 34.6445,
 'eval_samples_per_second': 144.294,
 'eval_steps_per_second': 18.04,
 'epoch': 3.0}