# **Setup**

In [None]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... 

# **Naamapadam Dataset**

In [None]:
# Let's download the Naamapadam (Indic NER) dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang='gu'

raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/472845 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1076 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2389 [00:00<?, ? examples/s]

In [None]:
# Let's now look at what the dataset contains
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 472845
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1076
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2389
    })
})

In [None]:
raw_datasets.column_names

{'train': ['tokens', 'ner_tags'],
 'test': ['tokens', 'ner_tags'],
 'validation': ['tokens', 'ner_tags']}

In [None]:
column_names = raw_datasets["train"].column_names
print(column_names)

features = raw_datasets["train"].features
print(features)

['tokens', 'ner_tags']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [None]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [None]:
# If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere.

# The tag names are read from the ClassLabel feature attached to the label column.
label_list = features[label_column_name].feature.names
num_labels = len(label_list)

# Map each tag name to its integer id by asking the same feature for the id of every name.
label_to_id = {
    name: features[label_column_name].feature.str2int(name)
    for name in label_list
}
print(label_to_id)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


# **Training an NER Model with the dataset**

In [None]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

# One checkpoint name shared by the config, tokenizer and model so the three cannot drift apart.
MODEL_CHECKPOINT = 'ai4bharat/IndicNER'

config = AutoConfig.from_pretrained(MODEL_CHECKPOINT, num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Hand the config to the model explicitly. Previously it was built but never used, so
# anything set on it other than num_labels (here: finetuning_task) never reached the model.
model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT, config=config)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach sub-token level labels.

    The word lists are read from ``examples[text_column_name]`` and the per-word
    label ids from ``examples[label_column_name]``. For every sentence, only the
    first sub-token of each word receives that word's label. Special tokens
    (word id ``None``) and the remaining sub-tokens of a word receive -100 so
    that they are ignored in the loss function.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding one list
        of label ids per sentence.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        # The texts are lists of words (one label per word), not raw strings.
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding=padding,
    )

    aligned_labels = []
    for row, word_labels in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=row)
        # Pair every sub-token's word id with the word id of the sub-token before it
        # (None for the very first position) to detect where a new word starts.
        aligned_labels.append([
            word_labels[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, [None, *word_ids])
        ])

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Only the first 40,000 sentences of the training split are used.
subtrain_dataset = raw_datasets["train"].select(range(40000))

# Tokenize and align the labels batch-wise, spread over 4 worker processes.
train_dataset = subtrain_dataset.map(
    tokenize_and_align_labels,
    desc="Running tokenizer on train dataset",
    load_from_cache_file=True,
    num_proc=4,
    batched=True,
)

      

Running tokenizer on train dataset #0:   0%|          | 0/10 [00:00<?, ?ba/s]

  

Running tokenizer on train dataset #1:   0%|          | 0/10 [00:00<?, ?ba/s]

Running tokenizer on train dataset #2:   0%|          | 0/10 [00:00<?, ?ba/s]

Running tokenizer on train dataset #3:   0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
# The whole validation split is tokenized; it serves as the evaluation set during training.
eval_dataset = raw_datasets["validation"].map(
    tokenize_and_align_labels,
    desc="Running tokenizer on Validation dataset",
    load_from_cache_file=True,
    num_proc=4,
    batched=True,
)

        

Running tokenizer on Validation dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
pip install seqeval

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    """Convert raw predictions and label ids into a flat dictionary of scores.

    ``p`` unpacks into ``(predictions, labels)``. The predicted class is the
    argmax over the last axis of the predictions. Positions whose label is -100
    are dropped, the remaining ids are translated to tag names through
    ``label_list`` and passed to ``metric.compute``.

    Returns:
        The metric results with nested dictionaries flattened into
        ``"<outer key>_<inner key>"`` entries.
    """
    raw_scores, label_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only the positions that carry a real label (ignored index is -100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting so every score has its own top-level key.
    final_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            final_results[key] = value
            continue
        final_results.update({f"{key}_{n}": v for n, v in value.items()})
    return final_results

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
!pip install -U accelerate
!pip install -U transformers



**Training**

In [None]:
# The same per-device batch size is used for training and evaluation.
batch_size = 16

# Train for 3 epochs and evaluate once per epoch.
args = TrainingArguments(
    output_dir='output_dir',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

In [None]:
# Bundle the model, both tokenized datasets, the collator and the metric function.
# No early-stopping callback is registered.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.args

TrainingArguments(
_n_gpu=2,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_

In [None]:
train_result = trainer.train()
metrics = train_result.metrics

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.1938,0.183361,0.803642,0.840232,0.82153,1208,0.727788,0.726415,0.727101,1060,0.808727,0.837771,0.822993,1615,0.785679,0.808138,0.79675,0.943175
2,0.1528,0.187258,0.792673,0.841887,0.81654,1208,0.707384,0.732075,0.719518,1060,0.817853,0.839628,0.828598,1615,0.779842,0.810971,0.795102,0.94273
3,0.1334,0.193912,0.792423,0.831126,0.811313,1208,0.695255,0.718868,0.706865,1060,0.806859,0.830341,0.818431,1615,0.771925,0.800155,0.785787,0.941429




In [None]:
metrics = trainer.evaluate()

trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.8113
  eval_LOC_number         =       1208
  eval_LOC_precision      =     0.7924
  eval_LOC_recall         =     0.8311
  eval_ORG_f1             =     0.7069
  eval_ORG_number         =       1060
  eval_ORG_precision      =     0.6953
  eval_ORG_recall         =     0.7189
  eval_PER_f1             =     0.8184
  eval_PER_number         =       1615
  eval_PER_precision      =     0.8069
  eval_PER_recall         =     0.8303
  eval_loss               =     0.1939
  eval_overall_accuracy   =     0.9414
  eval_overall_f1         =     0.7858
  eval_overall_precision  =     0.7719
  eval_overall_recall     =     0.8002
  eval_runtime            = 0:00:47.71
  eval_samples_per_second =     50.068
  eval_steps_per_second   =      1.572


# **Evaluate the Trained Model**

In [None]:

# Only a small prefix of the test split is scored: the first 2.5% of its rows.
original_test_size = len(raw_datasets["test"])
new_test_size = int(original_test_size * 0.025)

test_subset = raw_datasets['test'].select(range(new_test_size))
test_dataset = test_subset.map(
    tokenize_and_align_labels,
    desc="Running tokenizer on test dataset of language",
    load_from_cache_file=True,
    num_proc=4,
    batched=True,
)

In [None]:
predictions, labels, metrics = trainer.predict(test_dataset)

# Substring to look for in a metric key -> name under which its value is reported.
# The order matters: a key is assigned to the first substring it contains.
overall_names = {
    'overall_precision': 'Precision',
    'overall_recall': 'Recall',
    'overall_f1': 'F1',
}

final_metrics = {}
for key, value in metrics.items():
    for fragment, display_name in overall_names.items():
        if fragment in key:
            final_metrics[display_name] = value
            break

In [None]:
import pandas as pd

# One row per metric name; the single column holds the score.
combined_results = pd.DataFrame.from_dict(final_metrics, orient="index")

print(combined_results)

                  0
Precision  0.767442
Recall     0.825000
F1         0.795181


# **Saving Trained Model**

In [None]:
# model.save_pretrained("indic_model")

In [None]:
# model = AutoModelForTokenClassification.from_pretrained('/kaggle/working/indic_model')

In [None]:
# model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [None]:
# !zip -r IndicTuned.zip /kaggle/working/indic_model

  adding: kaggle/working/indic_model/ (stored 0%)
  adding: kaggle/working/indic_model/config.json (deflated 54%)
  adding: kaggle/working/indic_model/model.safetensors (deflated 7%)


In [None]:
# model.save_pretrained("tokenizer")

In [None]:
# Tag inventory in id order: the position of a tag in this list is its integer class id.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

# id2label keeps string keys because it is written into config.json, where object keys
# are always strings.
id2label = {str(i): label for i, label in enumerate(label_list)}

# label2id maps each tag to its integer class id. These values used to be strings
# (e.g. '1'), which left the saved config with non-numeric class ids.
label2id = {label: i for i, label in enumerate(label_list)}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

In [None]:
# Add the tag-name mappings to the saved model's config.json.
# Both files are opened through context managers so the handles are closed, and the
# rewritten file is completely on disk, before the model is loaded from this directory.
config_path = "/content/drive/MyDrive/indic_model ner/config.json"

with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [None]:
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/indic_model ner")

In [None]:
model

In [None]:
def get_ner(sentence):
    """Tag every space-separated word of ``sentence`` with a predicted NER label.

    The sentence is split on single spaces and the words are passed to the
    tokenizer as pre-split input, so the word ids reported by the tokenizer
    index directly into that word list. Passing the raw string instead lets the
    tokenizer choose its own word boundaries (e.g. splitting off punctuation),
    which shifts every label after the first such word.

    Each word receives the label predicted for its first sub-token. Uses the
    notebook-level ``tokenizer``, ``model`` and ``torch``.

    Args:
        sentence: Text whose words are separated by single spaces.

    Returns:
        A list of ``(word, label)`` tuples, one per element of
        ``sentence.split(' ')``. Elements that produce no sub-token (empty
        strings from repeated spaces) and words cut off by truncation are
        labelled ``'O'``.
    """
    words = sentence.split(' ')
    labels = ['O'] * len(words)

    # Empty strings are not sent to the tokenizer; `positions` maps the tokenizer's
    # word index back to the index of that word in `words`.
    positions = [i for i, word in enumerate(words) if word]
    if not positions:
        return list(zip(words, labels))

    tok_sentence = tokenizer(
        [words[i] for i in positions],
        is_split_into_words=True,
        truncation=True,
        max_length=512,  # same limit as tokenize_and_align_labels
        return_tensors='pt',
    )

    with torch.no_grad():
        # Predicted class id for every sub-token of the (single) sentence in the batch.
        predicted_ids = model(**tok_sentence).logits.argmax(-1)[0]

    previous_word_id = None
    for token_index, word_id in enumerate(tok_sentence.word_ids()):
        # Special tokens have word id None; later sub-tokens of a word repeat its id.
        if word_id is not None and word_id != previous_word_id:
            class_id = predicted_ids[token_index].item()
            labels[positions[word_id]] = model.config.id2label[class_id]
        previous_word_id = word_id

    return list(zip(words, labels))

In [None]:
import  torch
# let us try with some example sentences here
sentence = '૯મી ઓગસ્ટ ૨૦૧૬ના રોજ આદિવાસી વિકાસ સંગઠન દ્વારા આદિવાસી ભવન ખાતે ખૂબ જ ઉત્સાહભેર ઉજવણી કરવામાં આવશે.'

predicted_labels = get_ner(sentence=sentence)
predicted_labels

[('૯મી', 'O'),
 ('ઓગસ્ટ', 'O'),
 ('૨૦૧૬ના', 'O'),
 ('રોજ', 'O'),
 ('આદિવાસી', 'B-ORG'),
 ('વિકાસ', 'I-ORG'),
 ('સંગઠન', 'I-ORG'),
 ('દ્વારા', 'O'),
 ('આદિવાસી', 'B-LOC'),
 ('ભવન', 'I-ORG'),
 ('ખાતે', 'O'),
 ('ખૂબ', 'O'),
 ('જ', 'O'),
 ('ઉત્સાહભેર', 'O'),
 ('ઉજવણી', 'O'),
 ('કરવામાં', 'O'),
 ('આવશે.', 'O')]

# **Question 4 Model Prediction**

In [None]:
# Run get_ner (which uses the model loaded above) on every line of the questions file,
# treating each line as one sentence.
indicNer_list_Q4 = []
with open('/Q1_Questions.txt', "r", encoding="utf-8") as file:
    for line in file:
        # Remove the trailing newline and any surrounding whitespace before tagging.
        line_temp = line.strip()
        tokens_new = get_ner(line_temp)
        # get_ner returns a list of (word, label) tuples for the sentence.
        indicNer_list_Q4.append(tokens_new)
# Prints the complete nested list of predictions for all sentences.
print(indicNer_list_Q4)

[[('૯મી', 'O'), ('ઓગસ્ટ', 'O'), ('૨૦૧૬ના', 'O'), ('રોજ', 'O'), ('આદિવાસી', 'B-ORG'), ('વિકાસ', 'I-ORG'), ('સંગઠન', 'I-ORG'), ('દ્વારા', 'O'), ('આદિવાસી', 'B-LOC'), ('ભવન', 'I-ORG'), ('ખાતે', 'O'), ('ખૂબ', 'O'), ('જ', 'O'), ('ઉત્સાહભેર', 'O'), ('ઉજવણી', 'O'), ('કરવામાં', 'O'), ('આવશે.', 'O')], [('આ', 'O'), ('મામલા', 'O'), ('સાથે', 'O'), ('જોડાયેલી', 'O'), ('મોટી', 'O'), ('જાણકારી', 'O'), ('એ', 'O'), ('છે', 'O'), ('કે', 'O'), ('પીએનબીની', 'B-ORG'), ('બ્રૈડી', 'I-ORG'), ('ફોર્ડ', 'I-ORG'), ('બ્રાંચ', 'I-ORG'), ('જે', 'O'), ('પૂર્વ', 'O'), ('ડેપ્યુટી', 'O'), ('મેનેજર', 'O'), ('ગોકુલનાથ', 'B-PER'), ('શેટ્ટીની', 'I-PER'), ('શનિવારે', 'O'), ('ધરપકડ', 'O'), ('કરવામાં', 'O'), ('આવી', 'O'), ('છે.', 'O')], [('‘જ્ઞાન', 'O'), ('સાથે', 'O'), ('ગમ્મત’', 'O'), ('પર', 'O'), ('શેર', 'O'), ('કરેલી', 'O'), ('આ', 'O'), ('માહિતી', 'O'), ('જો', 'O'), ('તમને', 'O'), ('ઉપયોગી', 'O'), ('લાગી', 'O'), ('હોય', 'O'), ('તો', 'O'), ('બીજા', 'O'), ('મિત્રો', 'O'), ('સાથે', 'O'), ('જરૂર', 'O'), ('શેર', 'O'), ('કરજો.', 

In [None]:
# Keep only the predicted tag of every (word, tag) pair, sentence by sentence.
second_elements = [
    [tag for _word, tag in sentence_pairs]
    for sentence_pairs in indicNer_list_Q4
]

print(second_elements)


[['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O'], ['O', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O'], ['B-ORG', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-LOC', 'O', 'O', 'B-LOC', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O