<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTaggerMuril.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers==4.47.0 datasets seqeval scikit-learn




In [6]:
pip install -U datasets



In [7]:
from datasets import load_dataset
dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")  # Load POS task subset
train_data = dataset["train"]
test_data = dataset["test"]

In [8]:
dataset['train'][15683] # dataset['train'][i] the annotaion by all the annotators for the the sentence

{'Sentences': 'जैसलमेर - प्रधानमंत्री नरेंद्र मोदी (Narendra Modi) ने राजस्थान के जैसलमेर बॉर्डर पर लोंगेवाला पोस्ट पहुंच कर सेना के जवानों के साथ दिवाली (Diwali) का त्योहार मनाया।',
 'Predicted Tags': "[{'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': '-', 'entity': 'X'}, {'word': 'प्रधानमंत्री', 'entity': 'PROPN'}, {'word': 'नरेंद्र', 'entity': 'PROPN'}, {'word': 'मोदी', 'entity': 'PROPN'}, {'word': '(', 'entity': 'X'}, {'word': 'Narendra', 'entity': 'PROPN'}, {'word': 'Modi', 'entity': 'PROPN'}, {'word': ')', 'entity': 'X'}, {'word': 'ने', 'entity': 'VERB'}, {'word': 'राजस्थान', 'entity': 'PROPN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जैसलमेर', 'entity': 'PROPN'}, {'word': 'बॉर्डर', 'entity': 'NOUN'}, {'word': 'पर', 'entity': 'ADP'}, {'word': 'लोंगेवाला', 'entity': 'ADJ'}, {'word': 'पोस्ट', 'entity': 'NOUN'}, {'word': 'पहुंच', 'entity': 'VERB'}, {'word': 'कर', 'entity': 'VERB'}, {'word': 'सेना', 'entity': 'NOUN'}, {'word': 'के', 'entity': 'ADP'}, {'word': 'जवानों', 'entity': '

In [9]:
import ast  # safely parse string to Python object

def extract_tokens_and_tags(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}


In [None]:
# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


In [11]:
train_dataset = processed_data['train']
test_dataset = processed_data['test']

In [12]:
processed_data['train'][0]

{'tokens': ['Loan',
  'Apps',
  'की',
  'अब',
  'खैर',
  'नहीं',
  ',',
  'RBI',
  'ने',
  'बता',
  'दिया',
  'किस',
  '-',
  'किस',
  'पर',
  'होगी',
  'कार्रवाई',
  ',',
  'लिस्ट',
  'तैयार'],
 'tags': ['NOUN',
  'NOUN',
  'ADP',
  'ADV',
  'NOUN',
  'PART_NEG',
  'X',
  'PROPN',
  'ADP',
  'VERB',
  'VERB',
  'PRON_WH',
  'X',
  'PRON_WH',
  'ADP',
  'VERB',
  'NOUN',
  'X',
  'NOUN',
  'VERB']}

In [13]:
# Get all unique tags
unique_tags = list(set(tag for row in train_dataset for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "google/muril-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = test_dataset.map(tokenize_and_align_labels, batched=True)


In [16]:
# pip install --upgrade transformers


In [17]:
import transformers
print(transformers.__version__)


4.47.0


In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./muril-pos-results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50
)




In [19]:
print(transformers.__file__)

/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [20]:
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]
    print(classification_report(true_labels, true_predictions,digits = 6))
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }
print(1)

1


In [21]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    # evaluation_strategy="epoch",
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [22]:
print(f"Dataset size: {len(tokenized_train)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"Effective steps per epoch: {len(tokenized_train) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")


Dataset size: 15684
Batch size: 8
Gradient accumulation: 1
Effective steps per epoch: 1960


In [23]:
len(tokenized_train)

15684

In [24]:
trainer.train()
# in progress bar you will see th total number of batches i.e 5883



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrudratara8[0m ([33mrudratara8-indain-institute-of-science-education-and-res[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0429,0.989191,0.930263,0.887982
2,0.5697,0.540406,0.936067,0.896796
3,0.4397,0.444999,0.937474,0.899424




              precision    recall  f1-score   support

         ART   0.862245  0.722222  0.786047       702
     ART_NEG   0.968553  0.977778  0.973144       630
          DJ   0.785341  0.737737  0.760795      5708
          DP   0.949843  0.969222  0.959435     14361
          DV   0.831312  0.814989  0.823069      3616
         ERB   0.880405  0.866941  0.873622     10920
          ET   0.870582  0.867876  0.869227      1930
         ONJ   0.951741  0.917270  0.934188      2623
         OUN   0.839876  0.851588  0.845691     19648
         RON   0.900058  0.931449  0.915485      3326
      RON_WH   0.965201  0.937722  0.951264       562
        ROPN   0.888651  0.904554  0.896532      8717
          UM   0.925507  0.931214  0.928352      2108
           _   0.947693  0.938968  0.943310      9667

   micro avg   0.888223  0.887740  0.887982     84518
   macro avg   0.897643  0.883538  0.890011     84518
weighted avg   0.887766  0.887740  0.887583     84518





              precision    recall  f1-score   support

         ART   0.874799  0.776353  0.822642       702
     ART_NEG   0.985669  0.982540  0.984102       630
          DJ   0.782562  0.770498  0.776483      5708
          DP   0.961735  0.969570  0.965637     14361
          DV   0.848053  0.807246  0.827147      3616
         ERB   0.885183  0.884615  0.884899     10920
          ET   0.876138  0.897927  0.886899      1930
         ONJ   0.951144  0.935189  0.943099      2623
         OUN   0.855683  0.856728  0.856205     19648
         RON   0.927624  0.924835  0.926227      3326
      RON_WH   0.969314  0.955516  0.962366       562
        ROPN   0.892249  0.913847  0.902919      8717
          UM   0.938149  0.942600  0.940369      2108
           _   0.954058  0.938761  0.946348      9667

   micro avg   0.897630  0.895963  0.896796     84518
   macro avg   0.907311  0.896873  0.901810     84518
weighted avg   0.897474  0.895963  0.896641     84518





              precision    recall  f1-score   support

         ART   0.854626  0.829060  0.841649       702
     ART_NEG   0.984127  0.984127  0.984127       630
          DJ   0.786032  0.774877  0.780415      5708
          DP   0.963905  0.970684  0.967283     14361
          DV   0.834351  0.831582  0.832964      3616
         ERB   0.888635  0.889286  0.888960     10920
          ET   0.883661  0.885492  0.884576      1930
         ONJ   0.949479  0.938620  0.944018      2623
         OUN   0.860452  0.856423  0.858433     19648
         RON   0.916446  0.936560  0.926394      3326
      RON_WH   0.964286  0.960854  0.962567       562
        ROPN   0.894837  0.916600  0.905588      8717
          UM   0.955116  0.938805  0.946890      2108
           _   0.956965  0.940830  0.948829      9667

   micro avg   0.899632  0.899217  0.899424     84518
   macro avg   0.906637  0.903843  0.905192     84518
weighted avg   0.899536  0.899217  0.899340     84518



TrainOutput(global_step=5883, training_loss=0.9638092419372901, metrics={'train_runtime': 1948.9323, 'train_samples_per_second': 24.142, 'train_steps_per_second': 3.019, 'total_flos': 3073967211902976.0, 'train_loss': 0.9638092419372901, 'epoch': 3.0})

In [1]:
metrics = trainer.evaluate(eval_dataset=tokenized_val)
print(metrics)


NameError: name 'trainer' is not defined

In [None]:
metrics