<a href="https://colab.research.google.com/github/Rudra-prasad-tarai/nlpInternship/blob/main/POSTagger_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Downloading Libraries and datasets

In [40]:
!pip install transformers==4.47.0 datasets==2.17.0 seqeval scikit-learn




In [41]:
import ast  # safely parse string to Python object
import numpy as np
import transformers
from seqeval.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback  # <-- Add this import
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from google.colab import files
import json
import time
from collections import Counter

In [45]:
def extract_tokens_and_tags_(example):
    ann = ast.literal_eval(example['Annotated by: Annotator 1 '])  # safely convert string to list of dicts
    tokens = [entry["word"] for entry in ann]
    tags = [entry["entity"] for entry in ann]
    return {"tokens": tokens, "tags": tags}


def extract_tokens_and_tags(example):
    # Safely parse annotation strings to Python objects
    ann1 = ast.literal_eval(example['Annotated by: Annotator 1 '])
    ann2 = ast.literal_eval(example['Annotated by: Annotator 2'])
    ann3 = ast.literal_eval(example['Annotated by: Annotator 3'])
    pred = ast.literal_eval(example['Predicted Tags'])

    # Extract tokens from annotator 1 (assumed correct for tokens)
    tokens = [entry['word'] for entry in ann1]

    # Get tags from each source
    tags1 = [entry['entity'] for entry in ann1]
    tags2 = [entry['entity'] for entry in ann2]
    tags3 = [entry['entity'] for entry in ann3]
    pred_tags = [entry['entity'] for entry in pred]

    # Ensure each tag list is same length as tokens, else replace with [''] * len(tokens)
    def align_tags(tags):
        return tags if len(tags) == len(tokens) else [''] * len(tokens)

    tags1 = align_tags(tags1)
    tags2 = align_tags(tags2)
    tags3 = align_tags(tags3)
    pred_tags = align_tags(pred_tags)

    # Voting
    voted_tags = []
    for i in range(len(tokens)):
        votes = [tag for tag in [tags1[i], tags2[i], tags3[i], pred_tags[i]] if tag != '']
        most_common = Counter(votes).most_common(1)[0][0] if votes else 'O'  # fallback to 'O' (outside tag)
        voted_tags.append(most_common)

    return {"tokens": tokens, "tags": voted_tags}

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2tag[label] for label in sent if label != -100]
        for sent in labels
    ]
    true_predictions = [
        [id2tag[pred] for (pred, label) in zip(sent_preds, sent_labels) if label != -100]
        for sent_preds, sent_labels in zip(predictions, labels)
    ]
    print(classification_report(true_labels,true_predictions,digits = 6))
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions)
    }
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128

    )

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [46]:
dataset['train'][0]

{'Sentences': 'Loan Apps की अब खैर नहीं, RBI ने बता दिया किस-किस पर होगी कार्रवाई, लिस्ट तैयार',
 'Predicted Tags': "[{'word': 'Loan', 'entity': 'NOUN'}, {'word': 'Apps', 'entity': 'NOUN'}, {'word': 'की', 'entity': 'ADP'}, {'word': 'अब', 'entity': 'ADV'}, {'word': 'खैर', 'entity': 'ADJ'}, {'word': 'नहीं', 'entity': 'VERB'}, {'word': ',', 'entity': 'X'}, {'word': 'RBI', 'entity': 'PROPN'}, {'word': 'ने', 'entity': 'VERB'}, {'word': 'बता', 'entity': 'VERB'}, {'word': 'दिया', 'entity': 'VERB'}, {'word': 'किस', 'entity': 'DET'}, {'word': '-', 'entity': 'X'}, {'word': 'किस', 'entity': 'DET'}, {'word': 'पर', 'entity': 'ADP'}, {'word': 'होगी', 'entity': 'VERB'}, {'word': 'कार्रवाई', 'entity': 'NOUN'}, {'word': ',', 'entity': 'X'}, {'word': 'लिस्ट', 'entity': 'NOUN'}, {'word': 'तैयार', 'entity': 'ADJ'}]",
 'Annotated by: Annotator 1 ': "[{'word': 'Loan', 'entity': 'NOUN'}, {'word': 'Apps', 'entity': 'NOUN'}, {'word': 'की', 'entity': 'ADP'}, {'word': 'अब', 'entity': 'ADV'}, {'word': 'खैर', 'ent

In [47]:
# Load POS task subset

dataset = load_dataset("LingoIITGN/COMI-LINGUA", "POS")

# Convert to tokens and tags
processed_data = dataset.map(extract_tokens_and_tags)

# Keep only necessary columns
processed_data = processed_data.remove_columns([
    "Sentences", "Predicted Tags",
    "Annotated by: Annotator 1 ",
    "Annotated by: Annotator 2",
    "Annotated by: Annotator 3"
])


print(len(processed_data['train']))

# Assuming processed_data is your Dataset/DatasetDict
data = processed_data['train']

Map:   0%|          | 0/15684 [00:00<?, ? examples/s]

Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

15684


In [48]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Sentences', 'Predicted Tags', 'Annotated by: Annotator 1 ', 'Annotated by: Annotator 2', 'Annotated by: Annotator 3'],
        num_rows: 15684
    })
    test: Dataset({
        features: ['Sentences', 'Predicted Tags', 'Annotated by: Annotator 1 ', 'Annotated by: Annotator 2', 'Annotated by: Annotator 3'],
        num_rows: 4999
    })
})


In [49]:
# type(dataset['train'][15683]) # dataset['train'][i] the annotaion by all the annotators for the the sentence

In [50]:
# First split: 80% train, 20% temp
train_test_split = data.train_test_split(
    test_size=0.2,
    seed=42
)

# Second split: 10% val, 10% test
val_test_split = train_test_split['test'].train_test_split(
    test_size=0.5,
    seed=42
)

# Combine
processed_data = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})

In [52]:
# Save train/val/test datasets
for split in ["train", "validation", "test"]:
    data_list = processed_data[split].to_list()
    file_name = f"{split}.json"
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(data_list, f, indent=2, ensure_ascii=False)
    files.download(file_name)  # Download the file

# # Save predictions
# with open("test_predictions.json", "w", encoding="utf-8") as f:
#     json.dump(predictions, f, indent=2, ensure_ascii=False)
# files.download("test_predictions.json")  # Download predictions


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
print(len(processed_data['train']))
print(len(processed_data['validation']))
print(len(processed_data['test']))

12547
1568
1569


In [54]:
processed_data['train'][0]

{'tokens': ['नई',
  'दिल्ली',
  ':',
  'देश',
  'की',
  'प्रमुख',
  'दो',
  '-',
  'पहिया',
  'वाहन',
  'निर्माता',
  'कंपनी',
  'टीवीएस',
  'मोटर्स',
  '(',
  'TVS',
  'Motors',
  ')',
  'आज',
  'यानी',
  'सोमवार',
  'को',
  'Apache',
  'RR',
  '310',
  '2021',
  'को',
  'लॉन्च',
  'करने',
  'जा',
  'रही',
  'है',
  '।'],
 'tags': ['PROPN',
  'PROPN',
  'X',
  'NOUN',
  'ADP',
  'ADJ',
  'ADJ',
  'X',
  'ADJ',
  'NOUN',
  'NOUN',
  'NOUN',
  'PROPN',
  'PROPN',
  'X',
  'PROPN',
  'PROPN',
  'X',
  'ADV',
  'CONJ',
  'PROPN',
  'ADP',
  'PROPN',
  'PROPN',
  'NUM',
  'NUM',
  'ADP',
  'VERB',
  'VERB',
  'VERB',
  'VERB',
  'VERB',
  'X']}

In [55]:
# Get all unique tags
unique_tags = list(set(tag for row in data for tag in row['tags']))
tag2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(tag2id)


In [56]:
print(len(id2tag))

14


In [57]:
model_checkpoint_ = {'xlmr':"xlm-roberta-base",'indicBert':"ai4bharat/indic-bert",'mBert':"bert-base-multilingual-cased",'muril':"google/muril-base-cased"}

#Muril


In [58]:
start = time.time()

In [59]:
# Tokenize
key = 'muril'
model_checkpoint = model_checkpoint_[key]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)

tokenized_train = processed_data['train'].map(tokenize_and_align_labels, batched=True)
tokenized_val = processed_data['validation'].map(tokenize_and_align_labels, batched=True)
tokenized_test = processed_data['test'].map(tokenize_and_align_labels, batched=True)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1569 [00:00<?, ? examples/s]

In [60]:
tokenized_train

Dataset({
    features: ['tokens', 'tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 12547
})

In [61]:
# Create training arguements
training_args = TrainingArguments(
    output_dir=f"./{key}-pos-results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,  # <-- Important for early stopping
    metric_for_best_model="eval_loss",  # <-- Metric to monitor
    greater_is_better=False  # <-- For loss (lower is better)
)

# Create Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,  # Stop after 3 epochs without improvement
            early_stopping_threshold=0.01  # Minimum improvement to qualify as better
        )
    ],
    compute_metrics=compute_metrics

)


In [62]:
t = trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import numpy as np
import json
from seqeval.metrics import f1_score
from google.colab import files

# Step 1: Predict on test set
raw_preds, _, _ = trainer.predict(tokenized_test)

# Step 2: Convert logits to predicted label indices
pred_labels = np.argmax(raw_preds, axis=2)

# Step 3: Convert to tag strings
predicted_tag_lists = [
    [id2tag[pred] for pred, label in zip(preds, labels) if label != -100]
    for preds, labels in zip(pred_labels, tokenized_test["labels"])
]

# Step 4: Get true tags (filtered for -100s)
true_tag_lists = [
    [id2tag[label] for label in labels if label != -100]
    for labels in tokenized_test["labels"]
]

# Step 5: Get tokens
tokens = processed_data["test"]["tokens"]

# Step 6: Calculate per-sentence F1 and store
sentence_analysis = []
for tkns, true_tags, pred_tags in zip(tokens, true_tag_lists, predicted_tag_lists):
    f1 = f1_score([true_tags], [pred_tags])  # Note: seqeval expects list of lists
    sentence_analysis.append({
        "tokens": tkns,
        "true_tags": true_tags,
        "predicted_tags": pred_tags,
        "f1_score": f1
    })

# Step 7: Save to JSON
with open(f"{key}_sentence_f1_analysis.json", "w", encoding="utf-8") as f:
    json.dump(sentence_analysis, f, indent=2, ensure_ascii=False)

# Step 8: Download
files.download(f"{key}_sentence_f1_analysis.json")


In [None]:
metrics = trainer.evaluate(eval_dataset=tokenized_test)
print(metrics)


In [None]:
metrics

In [None]:
end = time.time()
print(f"Time taken : {(end-start)/60} mins")

#XLM_r


In [None]:
start = time.time()

In [None]:
# Tokenize
key = 'xlmr'
model_checkpoint = model_checkpoint_[key]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)

tokenized_train = processed_data['train'].map(tokenize_and_align_labels, batched=True)
tokenized_val = processed_data['validation'].map(tokenize_and_align_labels, batched=True)
tokenized_test = processed_data['test'].map(tokenize_and_align_labels, batched=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1569 [00:00<?, ? examples/s]

In [None]:
tokenized_train

Dataset({
    features: ['tokens', 'tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 12547
})

In [None]:
# Create training arguements
training_args = TrainingArguments(
    output_dir=f"./{key}-pos-results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    load_best_model_at_end=True,  # <-- Important for early stopping
    metric_for_best_model="eval_loss",  # <-- Metric to monitor
    greater_is_better=False  # <-- For loss (lower is better)
)

# Create Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,  # Stop after 3 epochs without improvement
            early_stopping_threshold=0.01  # Minimum improvement to qualify as better
        )
    ],
    compute_metrics=compute_metrics

)


In [None]:
t = trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2526,0.232913,0.924093,0.8771
2,0.1928,0.210054,0.931911,0.88896
3,0.1724,0.20572,0.934701,0.893486
4,0.1458,0.21932,0.932432,0.891068
5,0.131,0.21785,0.935099,0.895333




              precision    recall  f1-score   support

         ART   0.867470  0.864000  0.865731       250
     ART_NEG   0.950739  0.969849  0.960199       199
          DJ   0.750308  0.750308  0.750308      1622
          DP   0.956511  0.962524  0.959508      4136
          DV   0.851446  0.784926  0.816834      1088
         ERB   0.856035  0.858165  0.857098      3215
          ET   0.867314  0.897822  0.882305       597
         ONJ   0.892487  0.962291  0.926075       716
         OUN   0.815571  0.824270  0.819897      5719
         RON   0.924973  0.936009  0.930458       922
      RON_WH   0.922078  0.959459  0.940397       222
        ROPN   0.871882  0.861464  0.866642      2678
          UM   0.925550  0.954625  0.939863       573
           _   0.956294  0.932175  0.944080      2934

   micro avg   0.876712  0.877488  0.877100     24871
   macro avg   0.886333  0.894135  0.889957     24871
weighted avg   0.876817  0.877488  0.877010     24871





              precision    recall  f1-score   support

         ART   0.907173  0.860000  0.882957       250
     ART_NEG   0.965517  0.984925  0.975124       199
          DJ   0.775330  0.759556  0.767362      1622
          DP   0.952812  0.966634  0.959674      4136
          DV   0.851594  0.785846  0.817400      1088
         ERB   0.872957  0.880560  0.876742      3215
          ET   0.871452  0.874372  0.872910       597
         ONJ   0.945355  0.966480  0.955801       716
         OUN   0.835450  0.839832  0.837635      5719
         RON   0.919957  0.934924  0.927380       922
      RON_WH   0.963636  0.954955  0.959276       222
        ROPN   0.875829  0.887603  0.881677      2678
          UM   0.920826  0.933682  0.927210       573
           _   0.961843  0.953647  0.957727      2934

   micro avg   0.888211  0.889711  0.888960     24871
   macro avg   0.901409  0.898787  0.899920     24871
weighted avg   0.887893  0.889711  0.888707     24871





              precision    recall  f1-score   support

         ART   0.879518  0.876000  0.877756       250
     ART_NEG   0.956311  0.989950  0.972840       199
          DJ   0.782774  0.762022  0.772259      1622
          DP   0.957814  0.966151  0.961964      4136
          DV   0.828545  0.821691  0.825104      1088
         ERB   0.883504  0.884603  0.884053      3215
          ET   0.850998  0.927973  0.887821       597
         ONJ   0.950139  0.958101  0.954103       716
         OUN   0.843018  0.849799  0.846395      5719
         RON   0.943333  0.920824  0.931943       922
      RON_WH   0.946667  0.959459  0.953020       222
        ROPN   0.881613  0.889843  0.885709      2678
          UM   0.920962  0.935428  0.928139       573
           _   0.958904  0.954329  0.956611      2934

   micro avg   0.891679  0.895300  0.893486     24871
   macro avg   0.898864  0.906870  0.902694     24871
weighted avg   0.891595  0.895300  0.893379     24871





              precision    recall  f1-score   support

         ART   0.878543  0.868000  0.873239       250
     ART_NEG   0.965517  0.984925  0.975124       199
          DJ   0.754282  0.787300  0.770437      1622
          DP   0.958343  0.967843  0.963070      4136
          DV   0.863592  0.808824  0.835311      1088
         ERB   0.888819  0.877760  0.883255      3215
          ET   0.851623  0.922948  0.885852       597
         ONJ   0.923280  0.974860  0.948370       716
         OUN   0.851524  0.825319  0.838217      5719
         RON   0.947191  0.914317  0.930464       922
      RON_WH   0.955157  0.959459  0.957303       222
        ROPN   0.850893  0.907767  0.878410      2678
          UM   0.954145  0.944154  0.949123       573
           _   0.958018  0.948875  0.953425      2934

   micro avg   0.890495  0.891641  0.891068     24871
   macro avg   0.900066  0.906597  0.902971     24871
weighted avg   0.890878  0.891641  0.890973     24871





              precision    recall  f1-score   support

         ART   0.889362  0.836000  0.861856       250
     ART_NEG   0.960591  0.979899  0.970149       199
          DJ   0.769043  0.778052  0.773521      1622
          DP   0.959857  0.971228  0.965509      4136
          DV   0.865366  0.815257  0.839565      1088
         ERB   0.885374  0.879316  0.882335      3215
          ET   0.849923  0.929648  0.888000       597
         ONJ   0.913386  0.972067  0.941813       716
         OUN   0.854672  0.838084  0.846296      5719
         RON   0.934426  0.927332  0.930866       922
      RON_WH   0.959459  0.959459  0.959459       222
        ROPN   0.871960  0.910381  0.890756      2678
          UM   0.949301  0.947644  0.948472       573
           _   0.961235  0.955010  0.958112      2934

   micro avg   0.894363  0.896305  0.895333     24871
   macro avg   0.901711  0.907098  0.904051     24871
weighted avg   0.894293  0.896305  0.895115     24871



In [None]:
import numpy as np
import json
from seqeval.metrics import f1_score
from google.colab import files

# Step 1: Predict on test set
raw_preds, _, _ = trainer.predict(tokenized_test)

# Step 2: Convert logits to predicted label indices
pred_labels = np.argmax(raw_preds, axis=2)

# Step 3: Convert to tag strings
predicted_tag_lists = [
    [id2tag[pred] for pred, label in zip(preds, labels) if label != -100]
    for preds, labels in zip(pred_labels, tokenized_test["labels"])
]

# Step 4: Get true tags (filtered for -100s)
true_tag_lists = [
    [id2tag[label] for label in labels if label != -100]
    for labels in tokenized_test["labels"]
]

# Step 5: Get tokens
tokens = processed_data["test"]["tokens"]

# Step 6: Calculate per-sentence F1 and store
sentence_analysis = []
for tkns, true_tags, pred_tags in zip(tokens, true_tag_lists, predicted_tag_lists):
    f1 = f1_score([true_tags], [pred_tags])  # Note: seqeval expects list of lists
    sentence_analysis.append({
        "tokens": tkns,
        "true_tags": true_tags,
        "predicted_tags": pred_tags,
        "f1_score": f1
    })

# Step 7: Save to JSON
with open(f"{key}_sentence_f1_analysis.json", "w", encoding="utf-8") as f:
    json.dump(sentence_analysis, f, indent=2, ensure_ascii=False)

# Step 8: Download
files.download(f"{key}_sentence_f1_analysis.json")




              precision    recall  f1-score   support

         ART   0.805907  0.856502  0.830435       223
     ART_NEG   0.985000  0.960976  0.972840       205
          DJ   0.786707  0.761849  0.774078      1709
          DP   0.966650  0.969265  0.967956      4067
          DV   0.807547  0.838394  0.822681      1021
         ERB   0.887470  0.892628  0.890041      3269
          ET   0.859200  0.908629  0.883224       591
         ONJ   0.961487  0.956407  0.958940       757
         OUN   0.847778  0.854523  0.851137      5781
         RON   0.944924  0.920084  0.932339       951
      RON_WH   0.957143  0.980488  0.968675       205
        ROPN   0.896241  0.904058  0.900132      2637
          UM   0.921997  0.957861  0.939587       617
           _   0.964443  0.956026  0.960216      2979

   micro avg   0.896189  0.899808  0.897995     25012
   macro avg   0.899464  0.908406  0.903734     25012
weighted avg   0.896367  0.899808  0.898013     25012



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
metrics = trainer.evaluate(eval_dataset=tokenized_test)
print(metrics)


              precision    recall  f1-score   support

         ART   0.805907  0.856502  0.830435       223
     ART_NEG   0.985000  0.960976  0.972840       205
          DJ   0.786707  0.761849  0.774078      1709
          DP   0.966650  0.969265  0.967956      4067
          DV   0.807547  0.838394  0.822681      1021
         ERB   0.887470  0.892628  0.890041      3269
          ET   0.859200  0.908629  0.883224       591
         ONJ   0.961487  0.956407  0.958940       757
         OUN   0.847778  0.854523  0.851137      5781
         RON   0.944924  0.920084  0.932339       951
      RON_WH   0.957143  0.980488  0.968675       205
        ROPN   0.896241  0.904058  0.900132      2637
          UM   0.921997  0.957861  0.939587       617
           _   0.964443  0.956026  0.960216      2979

   micro avg   0.896189  0.899808  0.897995     25012
   macro avg   0.899464  0.908406  0.903734     25012
weighted avg   0.896367  0.899808  0.898013     25012

{'eval_loss': 0.196837216

In [None]:
metrics

{'eval_loss': 0.19683721661567688,
 'eval_accuracy': 0.9369801469248983,
 'eval_f1': 0.897995012468828,
 'eval_runtime': 12.2186,
 'eval_samples_per_second': 128.411,
 'eval_steps_per_second': 8.102,
 'epoch': 5.0}

In [None]:
end = time.time()
print(f"Time taken : {(end-start)/60} mins")

Time taken : 33.431765321890516 mins
