<h1><b>POS Tagging</b></h1>
The notebook is divided into three section -  <i>Data Preparation</i>, <i>Model Definition</i> and the actual <i>Training and Evaluation</i>.
Following these three sections is one more sectionb - <i>Using Langauge Embeddings</i> - where words are converted to word embeddings and concatenated with language vectors.

In [2]:
!pip install -q conllu
!pip install -q datasets
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

<h1>Data Preparation</h1>

<h2>Data Fetching, Cleaning and Storing</h2>
In this section, we fetch the data from the URL: 'https://raw.githubusercontent.com/singhakr/Bhojpuri-Magahi-and-Maithili-Linguistic-Resources/main/bhojpuri/pos-tagged/bhojpuri-pos-tagged-ver-1.3.txt'. The data is cleaned and converted to required format and the output is stored in a file "output.txt". You can download the obtained file in your local storage and use it for further procssing, thus eliminating the need of re-reunning this section again.

In [3]:
import urllib.request

In [4]:
dataurl = 'https://raw.githubusercontent.com/singhakr/Bhojpuri-Magahi-and-Maithili-Linguistic-Resources/main/bhojpuri/pos-tagged/bhojpuri-pos-tagged-ver-1.3.txt'
filepath = 'bhojpuri-pos-tagged-ver-1.3.txt'
urllib.request.urlretrieve(dataurl, filepath)

('bhojpuri-pos-tagged-ver-1.3.txt',
 <http.client.HTTPMessage at 0x7ca18cb1eaa0>)

In [5]:
with open(filepath, mode="r", encoding="utf-8") as data:
    annotations = data.read()

In [6]:
i=0
sents=[]
while(i<len(annotations)):
  sent = ""
  while i<len(annotations):
    if annotations[i]!='\n':
      sent = sent + annotations[i]
      i+=1
    else:
      i+=1
      break
  sents.append(sent)

#############
lines = []
for sent in sents:
  if(len(sent)>0 and sent[0]=='<'):
    continue
  elif "document" in sent:
    continue
  else:
    lines.append(sent)

#############
all = ""
for x in lines:
  all = all + '\n' + x

In [7]:
if "<fs af=',,,,,,,'>" in all:
    all = all.replace("<fs af=',,,,,,,'>", '')
if "\t))" in all:
  all = all.replace("\t))", "100\t))\tSYM")
elif "))" in all:
  all = all.replace("))", "100\t))\tSYM")
if "NNP:?" in all:
  all = all.replace("NNP:?", "NNP")

In [8]:
all = all[1:]

In [9]:
outputfile = open('output.txt', 'w')
outputfile.writelines(all)
outputfile.close()

<h2>Dataset Preparation</h2>

In [11]:
from conllu import parse
with open("output.txt", mode="r", encoding="utf-8-sig") as data_file:
    data = data_file.read()
sentences = parse(data)

In [12]:
print(type(sentences))
print(sentences[1])

<class 'conllu.models.SentenceList'>
TokenList<नेवता, देखते, मन, चटक, गइल, ।>


In [13]:
sents=[]
postags=[]
for sentence in sentences:
  sent = ['hi']
  postag = ['-ID-']
  for token in sentence:
    sent.append(token['form'])
    try:
      postag.append(token['lemma'])
    except:
      postag.append("NA")
  sents.append(sent)
  postags.append(postag)

In [14]:
print(type(sents))
print(sents[1])

<class 'list'>
['hi', 'नेवता', 'देखते', 'मन', 'चटक', 'गइल', '।']


In [15]:
print(type(postags))
print(postags[1])

<class 'list'>
['-ID-', 'NN', 'VM', 'NN', 'VM', 'VAUX', 'SYM']


<h3> Get Tag Set</h3>

In [16]:
tags = []
for postag in postags:
  for tag in postag:
    tags.append(tag)

tags = set(tags)
tag2int = {}
int2tag = {}

for i, tag in enumerate(sorted(tags)):
    tag2int[tag] = i+1
    int2tag[i+1] = tag

tag2int['-PAD-'] = 0
int2tag[0] = '-PAD-'

In [17]:
tag2int

{'-': 1,
 '-ID-': 2,
 'CC': 3,
 'CCP': 4,
 'CL': 5,
 'DEM': 6,
 'ECH': 7,
 'INJ': 8,
 'INTF': 9,
 'JJ': 10,
 'NA': 11,
 'NEG': 12,
 'NEGP': 13,
 'NN': 14,
 'NNP': 15,
 'NP': 16,
 'NST': 17,
 'PRP': 18,
 'PSP': 19,
 'QC': 20,
 'QF': 21,
 'QO': 22,
 'RB': 23,
 'RDP': 24,
 'RP': 25,
 'SYM': 26,
 'UNK': 27,
 'V': 28,
 'VAUX': 29,
 'VGF': 30,
 'VM': 31,
 'WQ': 32,
 '-PAD-': 0}

<h3>Convert to Hugging Face DatasetDict</h3>

In [52]:
from datasets import Dataset
import pandas as pd

data_list = []
for i in range(0, len(sents)-1):
  temp = {'form':sents[i], 'postag':[tag2int[tag] for tag in postags[i]]}
  data_list.append(temp)
dataset = Dataset.from_pandas(pd.DataFrame(data=data_list))

In [53]:
from datasets import DatasetDict
train_testvalid = dataset.train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [54]:
dataset

DatasetDict({
    train: Dataset({
        features: ['form', 'postag'],
        num_rows: 3436
    })
    test: Dataset({
        features: ['form', 'postag'],
        num_rows: 430
    })
    valid: Dataset({
        features: ['form', 'postag'],
        num_rows: 429
    })
})

<h2>Tokenization</h2>

In [86]:
from transformers import AutoTokenizer

model_checkpoint = "ai4bharat/IndicBERTv2-SS"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenized_inputs = tokenizer(sents, truncation=True, is_split_into_words=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [87]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

In [88]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['form'], truncation=True, is_split_into_words=True
    )
    all_labels = examples['postag']
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [89]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

Map:   0%|          | 0/3436 [00:00<?, ? examples/s]

Map:   0%|          | 0/430 [00:00<?, ? examples/s]

Map:   0%|          | 0/429 [00:00<?, ? examples/s]

<h2>Data Collation</h2>


In [90]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)

In [91]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["valid"], collate_fn=data_collator, batch_size=8
)

<h1>Model Definition</h1>

<h2>Model</h2>

In [92]:
from transformers import AutoModelForTokenClassification

model_checkpoint = "ai4bharat/IndicBERTv2-SS"
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=int2tag,
    label2id=tag2int,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-SS and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<h2>Optimiser and Scheduler</h2>

In [93]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [94]:
!pip install -q accelerate

In [95]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [96]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

<h2>Preprocessor and Evaluator</h2>

In [97]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[int2tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [int2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [98]:
def create_confusion_matrix(predicted, actual, tagset):
  n = len(tagset)

  int2tag_temp = {}
  tag2int_temp = {}
  for i, j in enumerate(tagset):
    int2tag_temp[i] = j
    tag2int_temp[j] = i

  confusion_matrix = [[0 for i in range(n)] for j in range(n)]

  m = len(predicted)
  for i in range(m):
    l = len(predicted[i])
    for j in range(l):
      confusion_matrix[tag2int_temp[predicted[i][j]]][tag2int_temp[actual[i][j]]] += 1

  return confusion_matrix

In [99]:
def print_confusion_matrix(true_predictions, true_labels, tagset):
  confusion_matrix = create_confusion_matrix(true_predictions, true_labels, tagset)
  n = len(confusion_matrix)
  print(confusion_matrix)



In [100]:
#Note: We calculate micro-average scores.
def evaluator(predicted, actual, tagset):
  #confusion matrix is a 2-D square matrix
  confusion_matrix = create_confusion_matrix(predicted, actual, tagset)
  TN, TP, FN, FP = 0, 0, 0, 0
  n = len(confusion_matrix)
  sum = 0
  for i in range(n):
    TP = TP + confusion_matrix[i][i]

    for j in range(n):
      sum+= confusion_matrix[i][j]
      if j!=i:
        FP = FP + confusion_matrix[i][j]
        FN = FN + confusion_matrix[j][i]

  TN = sum - TP - FP - FN

  accuracy = (TP+TN)/sum
  precision = TP/(TP+FP)
  recall = TP/(TP+FN)
  f1score = (2*precision*recall)/(precision+recall)

  results = {'accuracy': accuracy,
             'precision': precision,
             'recall': recall,
             'f1score': f1score}
  return results


<h1>Training and Evaluation</h1>

In [101]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    true_predictions = []
    true_labels = []
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_prediction, true_label = postprocess(predictions_gathered, labels_gathered)

        for x in true_prediction:
          true_predictions.append(x)
        for y in true_label:
          true_labels.append(y)

    tagset = [tag for tag in tag2int]

    results = evaluator(true_predictions, true_labels, tagset)
    print(
        f"epoch {epoch}:",
        {
            key: results[f"{key}"]
            for key in ["precision", "recall", "f1score", "accuracy"]
        },
    )


  0%|          | 0/1290 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0: {'precision': 0.8080452545568825, 'recall': 0.8080452545568825, 'f1score': 0.8080452545568825, 'accuracy': 0.616090509113765}
epoch 1: {'precision': 0.8250157133878064, 'recall': 0.8250157133878064, 'f1score': 0.8250157133878064, 'accuracy': 0.6500314267756129}
epoch 2: {'precision': 0.8274041483343809, 'recall': 0.8274041483343809, 'f1score': 0.8274041483343809, 'accuracy': 0.6548082966687618}


<h1>Using Language Embeddings</h1>

<h2>For One Word</h2>
We get a word embedding for one word, and concatenate language vector to it.

<h3>Getting Word Embedding</h3>

In [102]:
import torch
from transformers import AutoModel
from transformers import AutoTokenizer

tokenizer_embed = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-SS")
model_embed = AutoModel.from_pretrained("ai4bharat/IndicBERTv2-SS")

# get the embedding vector for the word "example"
example_token_id = tokenizer_embed.convert_tokens_to_ids(['मन'])[0]
example_embedding = model_embed.embeddings.word_embeddings(torch.tensor([example_token_id]))

Some weights of BertModel were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-SS and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
print(type(example_embedding))
print(example_embedding.shape)

<class 'torch.Tensor'>
torch.Size([1, 768])


In [104]:
example_embedding_np = example_embedding.detach().numpy()

In [105]:
print(type(example_embedding_np))
print(example_embedding_np.shape)

<class 'numpy.ndarray'>
(1, 768)


<h3>Concatenate langvec</h3>

In [106]:
!pip3 install -q lang2vec

In [107]:
import lang2vec.lang2vec as l2v
features = l2v.get_features("hin", "geo")

In [108]:
print(type(features['hin']))
print(features['hin'][0:2])
print(len(features['hin']))

<class 'list'>
[0.6082000136375427, 0.690500020980835]
299


In [109]:
import numpy as np
concatenated_embedding = np.concatenate((example_embedding_np[0], features['hin']))

In [110]:
concatenated_embedding.shape

(1067,)

In [111]:
concatenated_embedding_tensor = torch.from_numpy(concatenated_embedding)

<h3>Into one Function</h3>

In [112]:
def word_to_concatenated_embedding(word, langvec):
  temp = []
  temp.append(word)
  example_token_id = tokenizer_embed.convert_tokens_to_ids(temp)[0]
  example_embedding = model_embed.embeddings.word_embeddings(torch.tensor([example_token_id]))
  example_embedding_np = example_embedding.detach().numpy()
  concatenated_embedding = np.concatenate((example_embedding_np[0], langvec))
  return concatenated_embedding

<h2>For the entire Dataset</h2>

In [113]:
langvec = features['hin']
sents_embedding = []
for sent in sents:
  sent_embedding = [concatenated_embedding]
  for word in sent:
    embedding = word_to_concatenated_embedding(word, langvec)
    sent_embedding.append(np.array(embedding))
  sents_embedding.append(np.array(sent_embedding))

sents_embedding = np.array(sents_embedding)

  sents_embedding = np.array(sents_embedding)


In [114]:
def sentences_to_embedding(sentences, language_code):
  features = l2v.get_features(language_code, "geo")
  langvec = features[language_code]

  sents_embedding = []
  for sent in sentences:
    sent_embedding = [concatenated_embedding]
    for word in sent:
      embedding = word_to_concatenated_embedding(word, langvec)
      sent_embedding.append(np.array(embedding))
    sents_embedding.append(np.array(sent_embedding))

  sents_embedding = np.array(sents_embedding)

  return sents_embedding

In [115]:
print(type(sents_embedding))
print(sents_embedding.shape)

<class 'numpy.ndarray'>
(4296,)


In [116]:
print(type(sents_embedding[0]))
print(sents_embedding[0].shape)

<class 'numpy.ndarray'>
(6, 1067)
