# Problem-4:

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset
import json
import time
from sklearn.metrics import f1_score
import numpy as np

## Function to return predicted 'ner' labels for each tokenizer and Model:

In [2]:
def get_predictions( sentence, tokenizer, model ):
    tok_sentence = tokenizer(sentence, return_tensors='pt')
    
    with torch.no_grad():
        logits = model(**tok_sentence).logits.argmax(-1)

        predicted_tokens_classes = [model.config.id2label[t.item()] for t in logits[0]]

        predicted_labels = []
        previous_token_id = 0
        word_ids = tok_sentence.word_ids()
        for word_index in range(len(word_ids)):
            if word_ids[word_index] == None:
                previous_token_id = word_ids[word_index]
            elif word_ids[word_index] == previous_token_id:
                previous_token_id = word_ids[word_index]
            else:
                predicted_labels.append( predicted_tokens_classes[ word_index ] )
                previous_token_id = word_ids[word_index]
        return predicted_labels

### Function to return required accuracy matrics:

In [3]:
from sklearn.metrics import precision_score, recall_score, f1_score
def calculate_metrics(true_labels, predicted_labels):
    
    true_labels_flat = [label for sublist in true_labels for label in sublist]
    predicted_labels_flat = [label for sublist in predicted_labels for label in sublist]
    
    #because manual labelling and different tokenizer can can generate different length token lists 
    minlen = min(len(true_labels_flat),len(predicted_labels_flat))
    true_labels_flat = true_labels_flat[:minlen]
    predicted_labels_flat = predicted_labels_flat[:minlen]
    
    precision = precision_score(true_labels_flat, predicted_labels_flat, average='weighted')
    recall = recall_score(true_labels_flat, predicted_labels_flat, average='weighted')
    f1 = f1_score(true_labels_flat, predicted_labels_flat, average='weighted')

    return precision, recall, f1

### List 'sentences' contains all 25 sentences assigned to me:

In [4]:
file_path = 'raw_sentences_file.txt' 
sentences = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if len(line)>0:
            sentences.append(line[3:])

###  List 'manual_labels' contains 'ner' labels manually assigned to these 25 sentence:

In [5]:
file_path = 'manual_ner_file.txt'  
manual_labels =[]
NER_labels = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        labels = [tag.strip().strip("'") for tag in line.split(' ')]  
        labels = [tag for tag in labels if tag in NER_labels]
        if labels:
            manual_labels.append(labels)
print(manual_labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### Scores For Finetuned Indic_ner Model:

In [6]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_path = "finetunedINDIC_ner_model"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Printing first sentence
print("First senetence:")
print(sentences[0])

predicted_labels = []
for sentence in sentences:
    pred_labels = get_predictions(sentence=sentence, tokenizer=tokenizer,model=model)
    predicted_labels.append(pred_labels)

print("Predicted labels for first sentence:")
print(predicted_labels[0])

print("Comparison in manual labels and predicted labels:")
print("="*50)
precision, recall, f1 = calculate_metrics(manual_labels, predicted_labels)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Macro-F1: {f1:.4f}')

First senetence:
आपात्काल मे धेेैर्य , अभ्युदय मे क्षमा , सदन मे वाक्पटुता , युद्ध के समय बहादुरी , यशमे अभिरूचि , ज्ञान का व्यसन ये सब चीजे महापुरूषोंमे नैसर्गिक रूपसे पायी जाती हैं ।
Predicted labels for first sentence:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Comparison in manual labels and predicted labels:
Precision: 0.8731, Recall: 0.8588, Macro-F1: 0.8655


  _warn_prf(average, modifier, msg_start, len(result))


### Scores For Finetuned Indic_BERT Model:

In [7]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

model_path = "finetunedBERT_ner_model"
model = AutoModelForTokenClassification.from_pretrained(model_path,num_labels=7)
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Third senetence:")
print(sentences[2])

predicted_labels = []
for sentence in sentences:
    pred_labels = get_predictions(sentence=sentence, tokenizer=tokenizer,model=model)
    predicted_labels.append(pred_labels)

print("Predicted labels for third sentence:")
print(predicted_labels[2])

print("Comparison in manual labels and predicted labels:")
print("="*50)
precision, recall, f1 = calculate_metrics(manual_labels, predicted_labels)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Macro-F1: {f1:.4f}')

Third senetence:
कहने की जरूरत नहीं कि मुकेश चौरसिया मीडियाखबर डॉट कॉम नाम के एक पोर्टल का सहयोगी है और मीडियाखबर नामक पोर्टल का मालिक है पुष्‍कर पुष्‍प।
Predicted labels for third sentence:
['O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Comparison in manual labels and predicted labels:
Precision: 0.8182, Recall: 0.8128, Macro-F1: 0.8155


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Predictions from ChatGPT, (collected in a text file namely 'ChatGPT_ner_file.txt')

In [8]:
file_path = 'ChatGPT_ner_file.txt' 
predicted_labels =[]
NER_labels = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        labels = [tag.strip().strip("'") for tag in line.split(' ')]  
        labels = [tag for tag in labels if tag in NER_labels]
        if labels:
            predicted_labels.append(labels)
        
print("Comparison in manual labels and predicted labels:")
print("="*50)
precision, recall, f1 = calculate_metrics(manual_labels, predicted_labels)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Macro-F1: {f1:.4f}')

Comparison in manual labels and predicted labels:
Precision: 0.8435, Recall: 0.8148, Macro-F1: 0.8285


  _warn_prf(average, modifier, msg_start, len(result))


## Example of Prediction for question 5, (img attached in pdf):

### Example for Indic_ner

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
tokenizer = AutoTokenizer.from_pretrained("finetunedINDIC_ner_model")
model = AutoModelForTokenClassification.from_pretrained("finetunedINDIC_ner_model")

In [10]:
sentence = 'अभिनेत्री सोहा अली खान से उनकी मां अभिनेत्री शर्मिला टैगोर खासी नाराज़ हैं.'

predicted_labels = get_predictions(sentence=sentence, 
                                   tokenizer=tokenizer,
                                   model=model
                                   )
print(predicted_labels)
for index in range(len(sentence.split(' '))):
  print( sentence.split(' ')[index] + '\t' + predicted_labels[index] )
predicted_labels = [i.strip(',') for i in predicted_labels]

['O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O']
अभिनेत्री	O
सोहा	B-PER
अली	I-PER
खान	I-PER
से	O
उनकी	O
मां	O
अभिनेत्री	O
शर्मिला	B-PER
टैगोर	I-PER
खासी	O
नाराज़	O
हैं.	O


### Example for IndicBERT :

In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
tokenizer = AutoTokenizer.from_pretrained("finetunedBERT_ner_model")
model = AutoModelForTokenClassification.from_pretrained("finetunedBERT_ner_model")

In [12]:
sentence = 'लोकसभा में कांग्रेस नेता मलिक्कार्जुन खड़गे ने कहा कि प्रधानमंत्री नरेंद्र मोदी भारत के साथ वही करना चाहते हैं'

predicted_labels = get_predictions(sentence=sentence, 
                                   tokenizer=tokenizer,
                                   model=model
                                   )
print(predicted_labels)
for index in range(len(sentence.split(' '))):
  print( sentence.split(' ')[index] + '\t' + predicted_labels[index] )
predicted_labels = [i.strip(',') for i in predicted_labels]

['O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O']
लोकसभा	O
में	O
कांग्रेस	B-LOC
नेता	O
मलिक्कार्जुन	B-PER
खड़गे	I-PER
ने	O
कहा	O
कि	O
प्रधानमंत्री	O
नरेंद्र	B-PER
मोदी	I-PER
भारत	B-ORG
के	O
साथ	O
वही	O
करना	O
चाहते	O
हैं	O
