In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval
!pip3 install accelerate

In [None]:
from sklearn.model_selection import train_test_split
import json
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support,f1_score
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

In [None]:
# Import all the necessary classes and initialize the tokenizer and model.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Build the 7-class NER configuration once and load the model from it, so the
# label count and `finetuning_task` set here are the ones the model actually uses
# (previously `config` was created and then ignored).
config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=7, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', config=config)

In [None]:
model.config #checking configuration

AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers

In [None]:
# Dataset configuration name passed to `load_dataset`; the sample sentences shown
# further down are Marathi.
lang='mr'

# Fetches the train/test/validation splits (see the DatasetDict printed in the next cell).
data = load_dataset('ai4bharat/naamapadam', lang) #loading data online

In [None]:
data #data format

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 455248
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1080
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2300
    })
})

In [None]:
# Tag inventory; a tag's position in this list is its integer id.
# NOTE(review): this order is assumed to match the dataset's integer `ner_tags`
# (0 = "O", 1 = "B-PER", ...) - confirm against the dataset card.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG","B-LOC", "I-LOC"]

# Every mapping is derived from `label_list` so they cannot drift apart.
# (`label2id`/`id2label` used to be hand-written in alphabetical order, which
# contradicted `label_to_index`; `id2label` also had string keys.)
label_to_index = {label: index for index, label in enumerate(label_list)}
index_to_label = {index: label for index, label in enumerate(label_list)}
label2id = dict(label_to_index)
id2label = dict(index_to_label)
num_labels = len(label_list)

In [None]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [None]:
# Sample at most 1,00,000 entries from the training dataset. The cap is clamped to
# the split size so a smaller language configuration does not raise on `select`.
TRAIN_SAMPLE_SIZE = 100000
train_dataset_sampled = data['train'].shuffle(seed=53).select(range(min(TRAIN_SAMPLE_SIZE, len(data['train']))))

In [None]:
train_dataset_sampled['tokens'][0]

['मी',
 'आज',
 'या',
 'शुभ',
 'प्रसंगी',
 ',',
 'या',
 'भव्य',
 'आयोजनासाठी',
 'समितीला',
 'शुभेच्छा',
 'देतो',
 ',',
 'पूज्य',
 'महाराज',
 'साहेबांना',
 'वंदन',
 'करतो',
 'आणि',
 'भगवान',
 'महावीरांच्या',
 'चरणी',
 'प्रार्थना',
 'करतो',
 'की',
 'अशा',
 'आचार्य',
 'भगवंतांना',
 'अशी',
 'आचार्य',
 'शक्ती',
 'दे',
 ',',
 'अशी',
 'दिव्यता',
 'दे',
 'की',
 'येणाऱ्या',
 'शतकापर्यंत',
 'मानवजातीच्या',
 'कल्याणासाठी',
 'त्यांचा',
 'मार्ग',
 'आमच्या',
 'कामी',
 'येईल',
 '.']

In [None]:
max_seq_length = max(len(seq) for seq in train_dataset_sampled['tokens'])

In [None]:
max_seq_length

100

In [None]:
# model=model.to("cuda")

In [None]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples, max_length=512):
    """Tokenize a batch of pre-split sentences and label every sub-word token.

    Args:
        examples: batch dict as passed by `Dataset.map(batched=True)`. The
            `text_column_name` column holds one list of words per sentence and
            the `label_column_name` column holds the matching per-word tag ids.
        max_length: maximum number of tokens kept per sentence. Longer
            sentences are truncated; the default equals the model's
            `max_position_embeddings` (512).

    Returns:
        The tokenizer output with an extra "labels" entry: one list per
        sentence, aligned with its tokens. Special tokens get -100 so the loss
        ignores them; every other token gets the tag of the word it came from.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        # No padding here: the data collator pads each batch dynamically.
        truncation=True,
        max_length=max_length,
        # The texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # Special tokens have a word id of None. All pieces of a word share
        # that word's label (first piece and continuation pieces alike).
        # NOTE(review): a "B-xxx" word split into several pieces therefore
        # yields several consecutive "B-xxx" labels, which seqeval in
        # `compute_metrics` will read as separate entities - confirm this is
        # intended.
        labels.append(
            [-100 if word_idx is None else label[word_idx] for word_idx in word_ids]
        )
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize the sampled training split and attach token-level labels.
train_dataset = train_dataset_sampled.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=500,
    num_proc=32,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

Running tokenizer on train dataset (num_proc=32):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
train_dataset['tokens'][3]

['मी', 'त्यांचे', 'समर्थन', '.']

In [None]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][3][:20])

['[CLS]', '▁म', '▁तय', 'च', '▁समर', 'थन', '▁.', '[SEP]']

In [None]:
train_dataset['labels'][3][:10]

[-100, 0, 0, 0, 0, 0, 0, -100]

In [None]:
train_dataset['tokens'][3]

['मी', 'त्यांचे', 'समर्थन', '.']

In [None]:
# Tokenize the validation split and attach token-level labels.
eval_dataset = data["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=8,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
)

Running tokenizer on Validation dataset (num_proc=4):   0%|          | 0/2300 [00:00<?, ? examples/s]

In [None]:
eval_dataset['labels'][0][:10]

[-100, 5, 5, 0, 0, 0, 0, 0, 0, -100]

**Create Data Collator, Metrics**

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    """Score a batch of token-classification predictions with the seqeval metric.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-token class
            scores (the class axis is axis 2) and `labels` the gold ids, with
            -100 marking positions to ignore.

    Returns:
        A flat dict of the metric results; nested per-entity dicts are
        flattened to "<entity>_<metric>" keys.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions with a real gold label, then turn ids into tag names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting so every value is reported under its own key.
    final_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            final_results[key] = value
            continue
        for n, v in value.items():
            final_results[f"{key}_{n}"] = v
    return final_results

**Set Training Arguments**

In [None]:
# Training hyper-parameters; checkpoints are written to Drive every 5000 steps.
args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/cs689/a2/",
    num_train_epochs=5,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=8,
    weight_decay=1e-6,
    save_steps=5000,
    save_safetensors=False,
)


**Training**

In [None]:
# # Load the last saved checkpoint
# checkpoint_path = '/content/gdrive/MyDrive/cs689/a2/checkpoint-10000/pytorch_model.bin'
# checkpoint = torch.load(checkpoint_path,map_location=torch.device('cpu'))
# # checkpoint = torch.load(checkpoint_path)

# # Load the model weights from the checkpoint
# model.load_state_dict(checkpoint)

<All keys matched successfully>

In [None]:
# checkpoint.keys()

In [None]:
# Initialize our Trainer
# early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)
# args.metric_for_best_model = "f1"
# args.load_best_model_at_end = True
# args.evaluation_strategy = IntervalStrategy.STEPS
# args.eval_steps = args.save_steps
# args.greater_is_better = True

# Wire the model, tokenized splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.args

In [None]:
train_result = trainer.train()


Step,Training Loss
500,0.5943
1000,0.3983
1500,0.3616
2000,0.3406
2500,0.3089
3000,0.3077
3500,0.3046
4000,0.2978
4500,0.2695
5000,0.2655


In [None]:
metrics = train_result.metrics

In [None]:
metrics = trainer.evaluate()

In [None]:
trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        5.0
  eval_LOC_f1             =     0.7822
  eval_LOC_number         =       3110
  eval_LOC_precision      =     0.7608
  eval_LOC_recall         =     0.8048
  eval_ORG_f1             =     0.6935
  eval_ORG_number         =       2577
  eval_ORG_precision      =     0.6811
  eval_ORG_recall         =     0.7062
  eval_PER_f1             =     0.7963
  eval_PER_number         =       3945
  eval_PER_precision      =     0.7939
  eval_PER_recall         =     0.7987
  eval_loss               =      0.309
  eval_overall_accuracy   =     0.9032
  eval_overall_f1         =     0.7641
  eval_overall_precision  =     0.7526
  eval_overall_recall     =      0.776
  eval_runtime            = 0:00:09.95
  eval_samples_per_second =    231.132
  eval_steps_per_second   =     28.942


In [None]:
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/metrics/val_metrics.json', 'w') as f:
    json.dump(metrics, f)

In [None]:
metrics = trainer.evaluate(train_dataset)


In [None]:
trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        5.0
  eval_LOC_f1             =     0.8633
  eval_LOC_number         =     134763
  eval_LOC_precision      =     0.8426
  eval_LOC_recall         =     0.8851
  eval_ORG_f1             =     0.7999
  eval_ORG_number         =     105539
  eval_ORG_precision      =      0.809
  eval_ORG_recall         =     0.7909
  eval_PER_f1             =     0.8664
  eval_PER_number         =     164191
  eval_PER_precision      =     0.8615
  eval_PER_recall         =     0.8714
  eval_loss               =      0.174
  eval_overall_accuracy   =      0.943
  eval_overall_f1         =     0.8483
  eval_overall_precision  =     0.8418
  eval_overall_recall     =      0.855
  eval_runtime            = 0:07:53.08
  eval_samples_per_second =    211.378
  eval_steps_per_second   =     26.422


In [None]:
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/metrics/train_metrics.json', 'w') as f:
    json.dump(metrics, f)

## Evaluate the Trained Model

Let us now evaluate the trained model on the test set of the selected language (`lang`).

We need to first tokenize the test sets

In [None]:
# Tokenize the test split and attach token-level labels.
tokenized_test_set = data['test'].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=32,
    load_from_cache_file=True,
)

Map (num_proc=32):   0%|          | 0/1080 [00:00<?, ? examples/s]

Run prediction on the test set and extract the overall `Precision`, `Recall` and `F-Score`.

In [None]:
tokenized_test_set['ner_tags'][0]

[3, 4, 4, 0, 5, 0, 0, 0, 0]

In [None]:
# Predict on the tokenized test split and keep only the three overall scores.
predictions, labels, metrics = trainer.predict(tokenized_test_set)

# (substring looked for in the metric key, name used in the report)
report_names = (
  ('overall_precision', 'Precision'),
  ('overall_recall', 'Recall'),
  ('overall_f1', 'F1'),
)
lang_specific_results = {}
for key in metrics:
  for needle, report_name in report_names:
    if needle in key:
      lang_specific_results[report_name] = metrics[key]
      break
final_metrics = lang_specific_results

In [None]:
(predictions[0][0])

array([ 1.5927824 , -0.3106748 , -0.7261397 , -0.36029962, -0.6925054 ,
       -0.03230964, -0.4041909 ], dtype=float32)

In [None]:
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/metrics/final_metrics.json', 'w') as f:
    json.dump(final_metrics, f)

Print the results for the evaluated language.

In [None]:
import pandas as pd

combined_results = pd.DataFrame.from_dict(
            final_metrics, orient="index"
        )

print(combined_results)

                  0
Precision  0.814007
Recall     0.761814
F1         0.787046


In [None]:
combined_results.to_csv('/content/gdrive/MyDrive/cs689/a2/saved_models/metrics/combined_results.csv')

In [None]:
model=model.to("cpu")

In [None]:
model.config.label2id={"O": 0, "B-PER": 1, "I-PER": 2, "B-ORG": 3, "I-ORG": 4,"B-LOC": 5, "I-LOC": 6}
model.config.id2label={0: "O" , 1: "B-PER" , 2: "I-PER" , 3: "B-ORG" , 4: "I-ORG" ,5: "B-LOC" , 6: "I-LOC" }


In [None]:
def get_predictions( sentence, tokenizer, model, default_label=0 ):
  """Predict one label id per word of a sentence.

  The sentence is handed to the tokenizer as an explicit list of words
  (`is_split_into_words=True`), so the result always has exactly one entry
  per word. Previously the raw string was re-split by the tokenizer, which
  could return a different number of labels than there were words.

  Args:
    sentence: either a string (split on whitespace into words) or an
      already-split sequence of words.
    tokenizer: fast tokenizer; its output must provide `word_ids()`.
    model: token-classification model. It is switched to eval mode, and the
      inputs are moved to `model.device` before the forward pass.
    default_label: id returned for a word that has no token in the model
      input (a word that produces no sub-word piece, or one cut off by
      truncation). Defaults to 0.

  Returns:
    A list of int label ids, one per word; each is the prediction for the
    word's first sub-word token.
  """
  words = sentence.split() if isinstance(sentence, str) else list(sentence)
  if not words:
    return []

  # Tokenize word by word so that word_ids() indexes directly into `words`.
  tok_sentence = tokenizer(
      words,
      is_split_into_words=True,
      truncation=True,
      max_length=model.config.max_position_embeddings,
      return_tensors='pt',
  )
  word_ids = tok_sentence.word_ids()

  model.eval()  # make sure dropout is disabled for inference
  with torch.no_grad():
    inputs = {name: tensor.to(model.device) for name, tensor in tok_sentence.items()}
    # Highest-scoring class id for every token of the (single) sentence.
    predicted_ids = model(**inputs).logits.argmax(-1)[0].tolist()

  # The named-entity label of a word is the label of its head sub-word;
  # special tokens (word id None) and following sub-words are skipped.
  predicted_labels = [default_label] * len(words)
  seen_words = set()
  for position, word_id in enumerate(word_ids):
    if word_id is None or word_id in seen_words:
      continue
    seen_words.add(word_id)
    predicted_labels[word_id] = predicted_ids[position]

  return predicted_labels

In [None]:
# Let us try the model on an example sentence.
example_words = [
  'उल्हासनगर',
  'रेल्वे',
  'स्थानक',
  'हे',
  'ठाणे',
  'जिल्ह्यातील',
  'शहर',
  'आहे',
  '.',
]
sentence = ' '.join(example_words)
predicted_labels = get_predictions(sentence=sentence, tokenizer=tokenizer, model=model)

# Print each word next to its predicted label id, all on one line.
for index, word in enumerate(sentence.split(' ')):
  print( word , '\t' , predicted_labels[index] ,end=' ')

उल्हासनगर 	 5 रेल्वे 	 0 स्थानक 	 0 हे 	 0 ठाणे 	 5 जिल्ह्यातील 	 0 शहर 	 0 आहे 	 0 . 	 0 

In [None]:
index_to_label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [None]:
train_actual_labels = []
train_predicted_labels = []
test_actual_labels = []
test_predicted_labels = []
validation_actual_labels = []
validation_predicted_labels = []

In [None]:
# Gold tags and model predictions for the sampled training sentences.
# Both lists are rebuilt from scratch so re-running this cell cannot append duplicates.
train_tags = train_dataset_sampled['ner_tags']
train_toks = train_dataset_sampled['tokens']
train_actual_labels = list(train_tags)
train_predicted_labels = [
  get_predictions(sentence=' '.join(words), tokenizer=tokenizer, model=model)
  for words in train_toks
]

In [None]:
# Gold tags and model predictions for the validation split.
# Both lists are rebuilt from scratch so re-running this cell cannot append
# duplicates, and each row is read once instead of being indexed twice.
validation_actual_labels = []
validation_predicted_labels = []
for example in data['validation']:
  validation_actual_labels.append(example['ner_tags'])
  validation_predicted_labels.append(get_predictions(sentence=' '.join(example['tokens']), tokenizer=tokenizer, model=model))

In [None]:
# Gold tags and model predictions for the test split.
# Both lists are rebuilt from scratch so re-running this cell cannot append
# duplicates, and each row is read once instead of being indexed twice.
test_actual_labels = []
test_predicted_labels = []
for example in data['test']:
  test_actual_labels.append(example['ner_tags'])
  test_predicted_labels.append(get_predictions(sentence=' '.join(example['tokens']), tokenizer=tokenizer, model=model))

In [None]:
data_to_save = {
    "train_actual_labels": train_actual_labels,
    "train_predicted_labels": train_predicted_labels,
    "validation_actual_labels": validation_actual_labels,
    "validation_predicted_labels": validation_predicted_labels,
    "test_actual_labels": test_actual_labels,
    "test_predicted_labels": test_predicted_labels,
}


# Save the data to a JSON file
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/bert_predictions/prediction_ttv_indicbert.json', 'w') as f:
    json.dump(data_to_save, f)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics_sentencewise(actual_tags, predicted_tags, labels):
    """Per-class precision/recall/F1 computed per sentence, then averaged.

    Args:
        actual_tags: list of gold tag-id lists, one per sentence.
        predicted_tags: list of predicted tag-id lists, parallel to `actual_tags`.
        labels: the class ids to score, in the order used for the returned arrays.

    Returns:
        `(precision, recall, f1, macro_f1)`. The first three are arrays with
        one value per entry of `labels`, averaged over the scored sentences.
        `macro_f1` is the average over sentences of each sentence's mean
        class F1. If no sentence can be scored, all values are zero.

    Sentences whose gold and predicted lists differ in length cannot be
    scored and are left out; a warning reports how many were dropped.
    """
    import warnings

    precision_list = []
    recall_list = []
    f1_list = []
    macro_f1 = []
    skipped = 0
    for i in range(len(actual_tags)):
        if i % 1000 == 0:
            print(i)  # coarse progress indicator
        if len(actual_tags[i]) != len(predicted_tags[i]):
            skipped += 1
            continue
        # Class-wise scores for this sentence; absent classes count as 0.
        precision, recall, f1, _ = precision_recall_fscore_support(
            actual_tags[i], predicted_tags[i], labels=labels, average=None, zero_division=0)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        macro_f1.append(sum(f1) / len(f1))

    if skipped:
        warnings.warn(
            '%d of %d sentences skipped: gold and predicted tag counts differ'
            % (skipped, len(actual_tags)))

    # Nothing scored (empty input or every sentence misaligned): report zeros
    # instead of dividing by zero.
    if not f1_list:
        return np.zeros(len(labels)), np.zeros(len(labels)), np.zeros(len(labels)), 0.0

    precision_list = np.sum(precision_list, axis=0) / len(precision_list)
    recall_list = np.sum(recall_list, axis=0) / len(recall_list)
    f1_list = np.sum(f1_list, axis=0) / len(f1_list)
    macro_f1 = sum(macro_f1) / len(macro_f1)
    return precision_list, recall_list, f1_list, macro_f1


In [None]:
def calculate_metrics_overall(actual_tags, predicted_tags, labels):
    """Per-class and micro-averaged precision/recall/F1 over all words pooled together.

    Args:
        actual_tags: list of gold tag-id lists, one per sentence.
        predicted_tags: list of predicted tag-id lists, parallel to `actual_tags`.
        labels: the class ids to score, in the order used for the returned arrays.

    Returns:
        A 7-tuple `(precision, recall, f1, macro_f1, precision_all,
        recall_all, f1_all)`. The first three are arrays with one value per
        entry of `labels`, `macro_f1` is the mean of the class F1 values, and
        the last three are the micro-averaged scores. If no sentence can be
        scored, all values are zero.

    Sentences whose gold and predicted lists differ in length are left out;
    a warning reports how many were dropped.
    """
    import warnings

    at = []
    pt = []
    skipped = 0
    for i in range(len(actual_tags)):
        if i % 1000 == 0:
            print(i)  # coarse progress indicator
        if len(actual_tags[i]) != len(predicted_tags[i]):
            skipped += 1
            continue
        at.extend(actual_tags[i])
        pt.extend(predicted_tags[i])

    if skipped:
        warnings.warn(
            '%d of %d sentences skipped: gold and predicted tag counts differ'
            % (skipped, len(actual_tags)))

    # Nothing to score (empty input or every sentence misaligned).
    if not at:
        zeros = np.zeros(len(labels))
        return zeros, zeros.copy(), zeros.copy(), 0.0, 0.0, 0.0, 0.0

    # Class-wise scores over the pooled words; absent classes count as 0.
    precision_list, recall_list, f1_list, _ = precision_recall_fscore_support(
        at, pt, labels=labels, average=None, zero_division=0)
    macro_f1 = sum(f1_list) / len(f1_list)
    precision_all, recall_all, f1_all, _ = precision_recall_fscore_support(
        at, pt, labels=labels, average='micro', zero_division=0)
    return precision_list, recall_list, f1_list, macro_f1, precision_all, recall_all, f1_all

In [None]:
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/bert_predictions/prediction_ttv_indicbert.json', 'r') as f:
    data_ttv = json.load(f)

In [None]:
data_ttv['train_predicted_labels'][26]

[0, 4, 4, 4, 4, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
train_dataset_sampled['tokens'][26]

['मुंबईतील',
 'वरळी',
 'येथील',
 'ग्रीनलॉन्स',
 'स्कूलच्या',
 'फरझान',
 'भरुचाने',
 '99.20',
 'टक्के',
 'गुणांसह',
 'देशातून',
 'दुसरा',
 'क्रमांक',
 'मिळवला',
 'आहे',
 '.']

In [None]:
train_dataset_sampled['ner_tags'][26]

[3, 4, 4, 4, 4, 1, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0]

In [None]:

# Compute both metric flavours for train, test and validation in one pass, so a
# fresh "Run All" defines every *_sentwise_stats / *_overall_stats variable the
# report cell needs (no manual editing and re-running of this cell).
labels = [0, 1, 2, 3, 4, 5, 6]

sentwise_stats_by_split = {}
overall_stats_by_split = {}
for split_name in ('train', 'test', 'validation'):
    actual_tags = data_ttv[split_name + '_actual_labels']
    predicted_tags = data_ttv[split_name + '_predicted_labels']

    # Calculate metrics
    precision_list, recall_list, f1_list, macro_f1 = calculate_metrics_sentencewise(actual_tags, predicted_tags, labels)
    sentwise_stats_by_split[split_name] = [precision_list, recall_list, f1_list, macro_f1]

    # calculate_metrics_overall returns seven values: the four class-wise/macro
    # statistics followed by micro-averaged precision, recall and F1.
    (precision_list_overall, recall_list_overall, f1_list_overall, macro_f1_overall,
     precision_micro_overall, recall_micro_overall, f1_micro_overall) = calculate_metrics_overall(actual_tags, predicted_tags, labels)
    overall_stats_by_split[split_name] = [precision_list_overall, recall_list_overall, f1_list_overall, macro_f1_overall]

    print('=====', split_name, '=====')
    print('precision_classwise_sentencewisecalculated: ',precision_list,'\nrecall_classwise_sentencewisecalculated: ',recall_list,
          '\nf1_classwise_sentencewisecalculated: ',f1_list,'\nmacro_f1_sentencewisecalculated: ',macro_f1)

    print('precision_classwise_overall: ',precision_list_overall,'\nrecall_classwise_overall: ',recall_list_overall,
          '\nf1_classwise_overall: ',f1_list_overall,'\nmacro_f1_overall: ',macro_f1_overall)

    print('precision_micro_overall: ',precision_micro_overall,'\nrecall_micro_overall: ',recall_micro_overall,
          '\nf1_micro_overall: ',f1_micro_overall)

train_sentwise_stats = sentwise_stats_by_split['train']
test_sentwise_stats = sentwise_stats_by_split['test']
validation_sentwise_stats = sentwise_stats_by_split['validation']
train_overall_stats = overall_stats_by_split['train']
test_overall_stats = overall_stats_by_split['test']
validation_overall_stats = overall_stats_by_split['validation']



0
1000
2000
0
1000
2000
precision_classwise_sentencewisecalculated:  [0.9500628  0.39874562 0.30215505 0.17809408 0.06241443 0.23042532
 0.04286852] 
recall_classwise_sentencewisecalculated:  [0.95565131 0.39976674 0.29934272 0.17941615 0.06043918 0.23802657
 0.04321805] 
f1_classwise_sentencewisecalculated:  [0.94893187 0.39547    0.29646542 0.17518223 0.05907925 0.23043962
 0.042     ] 
macro_f1_sentencewisecalculated:  0.30679548259164824
precision_classwise_overall:  [0.95603712 0.84308365 0.8723545  0.70574713 0.64154104 0.78812861
 0.60816327] 
recall_classwise_overall:  [0.96051575 0.84911894 0.83639822 0.70737327 0.56573117 0.82556131
 0.54181818] 
f1_classwise_overall:  [0.9582712  0.84609053 0.85399806 0.70655926 0.60125589 0.8064108
 0.57307692] 
macro_f1_overall:  0.7636660952994229


In [None]:
train_overall_stats[0]

array([0.97074165, 0.89355861, 0.91015029, 0.84381716, 0.844869  ,
       0.86020977, 0.77656561])

In [None]:
def _format_split_stats(title, stats):
    """Render one split's macro-F1 and per-class precision/recall/F1 table as text.

    `stats` is `[precision, recall, f1, macro_f1]`; the first three are indexed
    per class in the column order of the header line (7 classes).
    """
    lines = [
        title + '\n',
        'MACRO-f1  : %1f\n' % (stats[3]),
        'Metric    :     O    :   B-PER  :   I-PER  :   B-ORG  :   I-ORG  :   B-LOC  :   I-LOC\n',
    ]
    for row_name, row in (('Precision', stats[0]), ('Recall   ', stats[1]), ('f1       ', stats[2])):
        lines.append(row_name + ' : ' + ' : '.join('%1f' % row[k] for k in range(7)) + '\n')
    return ''.join(lines)


def _format_section(train_stats, test_stats, validation_stats):
    """Render the TRAINING / TESTING / VALIDATION tables, separated by a blank line."""
    return '\n'.join([
        _format_split_stats('TRAINING', train_stats),
        _format_split_stats('TESTING', test_stats),
        _format_split_stats('VALIDATION', validation_stats),
    ])


# Write the plain-text report: first the scores over all predicted words, then
# the scores averaged over sentences. The file content is the same as the
# earlier line-by-line version produced.
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/bert_predictions/stats_ttv_indicbert.txt', 'w') as f3:
    f3.write('IndicBERT\n\n')
    f3.write('Over all the predicted words\n\n')
    f3.write(_format_section(train_overall_stats, test_overall_stats, validation_overall_stats))
    f3.write('\n\n------------------------------------------------------------------------------------------------------------------------------------------\n\n')
    f3.write('Averaged over sentences (Not very important, just for stats)\n\n')
    f3.write(_format_section(train_sentwise_stats, test_sentwise_stats, validation_sentwise_stats))

In [None]:
import re
#converting sentences to proper format for classification
def sen_to_list(sentence):
  """Split a raw sentence into word, number and punctuation tokens.

  Steps, in order:
    1. insert a space at every boundary between a listed letter (Devanagari
       consonant or lowercase ASCII letter) and a digit (ASCII or Devanagari),
       in either direction;
    2. surround each listed punctuation character with spaces so that it
       becomes a token of its own;
    3. turn non-breaking spaces (U+00A0) into ordinary spaces;
    4. split on single spaces and drop the empty strings.

  Args:
    sentence: the text to tokenise (str).

  Returns:
    list of str: the tokens in their original order; an empty list when the
    input is empty or contains only spaces.
  """
  # Only the characters listed here take part in the letter/digit split;
  # anything else (uppercase ASCII, vowel signs, ...) is left attached.
  letters = 'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहabcdefghijklmnopqrstuvwxyz'
  digits = '1234567890०१२३४५६७८९'
  boundary = r'([%s])(?=[%s])|([%s])(?=[%s])' % (letters, digits, digits, letters)

  # Exactly one of the two groups matches; the other expands to ''.
  words = re.sub(boundary, r'\g<1>\g<2> ', sentence)
  words = re.sub(r'[‘’@#$%&~`=,<>\+\-\?\|\(\)\'\*\.]', r' \g<0> ', words)
  words = words.replace('\xa0', ' ')
  # Splitting on a single space leaves '' between consecutive spaces; drop those.
  return [item for item in words.strip().split(' ') if item]

In [None]:
# Read the whole `_gpt` text file into one string. UTF-8 is requested
# explicitly so that decoding does not depend on the platform's default
# text encoding.
with open('/content/gdrive/MyDrive/cs689/a2/q1_cs689_a2_gpt.txt', 'r', encoding='utf-8') as f2:
  gpt = f2.read()

In [None]:
# Read the whole q1 text file into one string. UTF-8 is requested explicitly
# so that decoding does not depend on the platform's default text encoding.
with open('/content/gdrive/MyDrive/cs689/a2/q1_cs689_a2.txt', 'r', encoding='utf-8') as f3:
  orig = f3.read()

In [None]:
# Turn each raw file dump into a list of its newline-delimited lines.
orig, gpt = orig.split('\n'), gpt.split('\n')

In [None]:
# Discard empty lines; filter(None, ...) keeps only the non-empty strings.
orig = list(filter(None, orig))
gpt = list(filter(None, gpt))

In [None]:
# orig

In [None]:
# Four independent, initially empty accumulators for the q1 comparison:
# tokenised sentences, reference tags, model predictions and GPT tags.
q1_list, q1_actual_labels, q1_predicted_labels, q1_gpt_labels = ([] for _ in range(4))

In [None]:
# `orig` is read as alternating lines: a sentence line followed by the line
# holding its whitespace-separated tags. `gpt` is indexed with the same line
# offsets, so both files must share that layout.
for line_no in range(0, len(orig) - 1, 2):
  # The first three characters of the sentence line are dropped before
  # tokenising (presumably a numbering prefix such as "1. " - TODO confirm).
  q1_list.append(sen_to_list(orig[line_no][3:]))
  # split() without an argument ignores repeated or trailing whitespace, so a
  # stray extra space cannot produce an empty '' tag (which the later
  # label_to_index_q1 lookup would reject with a KeyError).
  q1_actual_labels.append(orig[line_no + 1].split())
  q1_gpt_labels.append(gpt[line_no + 1].split())

In [None]:
# Tag inventory for the q1 comparison; a tag's position in this list is its
# integer id. Add more labels as needed (append to keep existing ids stable).
_Q1_TAGS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

# Forward (tag -> id) and reverse (id -> tag) lookups built from the same list.
label_to_index_q1 = {tag: idx for idx, tag in enumerate(_Q1_TAGS)}
index_to_label_q1 = dict(enumerate(_Q1_TAGS))

In [None]:
# Replace every tag string with its integer id, in place, for both the
# reference tags and the GPT tags.
# NOTE(review): positions are driven by the reference tag list, so each GPT
# tag list is assumed to be at least as long - confirm against the data.
for sent_idx, gold_tags in enumerate(q1_actual_labels):
  for pos in range(len(gold_tags)):
    gold_tags[pos] = label_to_index_q1[gold_tags[pos]]
    q1_gpt_labels[sent_idx][pos] = label_to_index_q1[q1_gpt_labels[sent_idx][pos]]

In [None]:
 # Re-join each tokenised sentence with single spaces, pass it to
 # get_predictions, and collect the result per sentence.
 for tokens in q1_list:
   q1_predicted_labels.append(
       get_predictions(sentence=' '.join(tokens), tokenizer=tokenizer, model=model))

In [None]:

# Label ids scored in this comparison (0..8, the ids in label_to_index_q1).
labels = list(range(9))


def _print_overall_stats(stats, suffix=''):
  """Print the seven overall metrics, one per line.

  Args:
    stats: the seven values returned by calculate_metrics_overall, in order.
    suffix: text appended to every metric name (e.g. '2' for the second run).
  """
  names = ('precision_classwise_overall', 'recall_classwise_overall',
           'f1_classwise_overall', 'macro_f1_overall', 'precision_all_overall',
           'recall_all_overall', 'f1_all_overall')
  pieces = []
  for name, value in zip(names, stats):
    # Every name after the first starts on a new line.
    pieces.append(('\n' if pieces else '') + name + suffix + ': ')
    pieces.append(value)
  print(*pieces)


# Calculate metrics: model predictions vs. the reference tags ...
bert_vs_manual_overall_stats = list(calculate_metrics_overall(q1_actual_labels, q1_predicted_labels, labels))
(precision_list_overall, recall_list_overall, f1_list_overall, macro_f1_overall,
 precision_all_overall, recall_all_overall, f1_all_overall) = bert_vs_manual_overall_stats

# ... and GPT tags vs. the reference tags.
gpt_vs_manual_overall2_stats = list(calculate_metrics_overall(q1_actual_labels, q1_gpt_labels, labels))
(precision_list_overall2, recall_list_overall2, f1_list_overall2, macro_f1_overall2,
 precision_all_overall2, recall_all_overall2, f1_all_overall2) = gpt_vs_manual_overall2_stats

_print_overall_stats(bert_vs_manual_overall_stats)
_print_overall_stats(gpt_vs_manual_overall2_stats, suffix='2')



0
0
precision_classwise_overall:  [0.81108312 0.6        0.75       0.68421053 0.65217391 0.5
 0.25       0.         0.        ] 
recall_classwise_overall:  [0.95266272 0.9        0.75       0.56521739 0.71428571 0.5
 1.         0.         0.        ] 
f1_classwise_overall:  [0.87619048 0.72       0.75       0.61904762 0.68181818 0.5
 0.4        0.         0.        ] 
macro_f1_overall:  0.5052284752284754 
precision_all_overall:  0.7815126050420168 
recall_all_overall:  0.7815126050420168 
f1_all_overall:  0.7815126050420168
precision_classwise_overall2:  [0.75555556 0.57142857 0.5        0.         0.         0.
 0.         1.         1.        ] 
recall_classwise_overall2:  [0.98421053 0.4        0.33333333 0.         0.         0.
 0.         0.04       0.025     ] 
f1_classwise_overall2:  [0.85485714 0.47058824 0.4        0.         0.         0.
 0.         0.07692308 0.04878049] 
macro_f1_overall2:  0.20568321587546834 
precision_all_overall2:  0.7413127413127413 
recall_all_ove

In [None]:
def _write_overall_section(out, title, stats):
    """Write one block of the comparison report to an open text file.

    Args:
        out: writable text file object.
        title: text written first, including its own newlines.
        stats: sequence laid out as [per-class precision, per-class recall,
            per-class f1, macro f1, overall precision, overall recall,
            overall f1]; the per-class entries are indexed 0..8.
    """
    out.write(title)
    # '%1f' is a minimum width of 1 with the default six decimals, which gives
    # eight-character values for numbers below 10.
    out.write('MACRO-f1  : %1f\n' % (stats[3]))
    out.write('Metric    :     O    :   B-PER  :   I-PER  :   B-ORG  :   I-ORG  :   B-LOC  :   I-LOC  :  B-MISC  :  I-MISC\n')
    rows = (('Precision : ', stats[0]), ('Recall    : ', stats[1]), ('f1        : ', stats[2]))
    for prefix, per_class in rows:
        cells = ['%1f' % (per_class[k],) for k in range(9)]
        out.write(prefix + ' : '.join(cells) + '\n')
    out.write('Prec_all  : %1f\n' % (stats[4]))
    out.write('Recall_all: %1f\n' % (stats[5]))
    out.write('f1_all    : %1f\n' % (stats[6]))


# Save the model-vs-reference and GPT-vs-reference metrics side by side.
with open('/content/gdrive/MyDrive/cs689/a2/saved_models/bert_predictions/bert_vs_manual.txt', 'w') as f3:
    f3.write('IndicBERT\n\n')
    f3.write('Over all the predicted words over all 9 labels\n\n')
    _write_overall_section(f3, 'BERT Model\n', bert_vs_manual_overall_stats)
    _write_overall_section(f3, '\nGPT\n', gpt_vs_manual_overall2_stats)