In [19]:
import spacy
import random
import json
import pickle
import pandas as pd

In [20]:
data = pd.read_csv('./dataset/hi-train.csv', encoding='utf-8')

In [21]:
data.head()

Unnamed: 0,label,text
0,india,मेट्रो की इस लाइन के चलने से दक्षिणी दिल्ली से...
1,pakistan,नेटिजन यानि इंटरनेट पर सक्रिय नागरिक अब ट्विटर...
2,news,इसमें एक फ़्लाइट एटेनडेंट की मदद की गुहार है औ...
3,india,"प्रतीक खुलेपन का, आज़ाद ख्याली का और भीड़ से अ..."
4,india,ख़ासकर पिछले 10 साल तक प्रधानमंत्री रहे मनमोहन...


In [22]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

In [23]:
model = AutoModelForTokenClassification.from_pretrained("MichaelHuang/muril_base_cased_hindi_ner")
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

# Define the labels dictionary
labels_dict = {
    0: "B-FESTIVAL",
    1: "B-GAME",
    2: "B-LANGUAGE",
    3: "B-LITERATURE",
    4: "B-LOCATION",
    5: "B-MISC",
    6: "B-NUMEX",
    7: "B-ORGANIZATION",
    8: "B-PERSON",
    9: "B-RELIGION",
    10: "B-TIMEX",
    11: "I-FESTIVAL",
    12: "I-GAME",
    13: "I-LANGUAGE",
    14: "I-LITERATURE",
    15: "I-LOCATION",
    16: "I-MISC",
    17: "I-NUMEX",
    18: "I-ORGANIZATION",
    19: "I-PERSON",
    20: "I-RELIGION",
    21: "I-TIMEX",
    22: "O"
}

In [24]:
def ner_predict(sentence, model, tokenizer, labels_dict):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = predicted_labels.squeeze().tolist()
    predicted_labels = [labels_dict[label] for label in labels]
    result = list(zip(tokens, predicted_labels))
    return result

In [25]:
# Pickle the model for future use for fast inference and getting the same results and usage of the model
pickle.dump(model, open('./models/NERTransmodel.pkl', 'wb'))
pickle.dump(tokenizer, open('./models/NERTranstokenizer.pkl', 'wb'))
pickle.dump(labels_dict, open('./models/NERTranslabels_dict.pkl', 'wb'))

# Load the model and tokenizer
model = pickle.load(open('./models/NERTransmodel.pkl', 'rb'))
tokenizer = pickle.load(open('./models/NERTranstokenizer.pkl', 'rb'))
labels_dict = pickle.load(open('./models/NERTranslabels_dict.pkl', 'rb'))

# Test the model
sentence = "इस वर्ष के बाद भारत के लिए बहुत ही शुभ और खुशनुमा साबित होगा।"
result = ner_predict(sentence, model, tokenizer, labels_dict)
print(result)
# Conver the result to string in the format of word/tag
result = [f"{word}/{tag}" for word, tag in result]
print(type(result))
# conver the list to string
result = " ".join(result)
print(result)


[('[CLS]', 'O'), ('इस', 'O'), ('वर्ष', 'O'), ('के', 'O'), ('बाद', 'O'), ('भारत', 'B-LOCATION'), ('के', 'O'), ('लिए', 'O'), ('बहुत', 'O'), ('ही', 'O'), ('शुभ', 'O'), ('और', 'O'), ('खुश', 'O'), ('##नुमा', 'O'), ('साबित', 'O'), ('होगा', 'O'), ('।', 'O'), ('[SEP]', 'O')]
<class 'list'>
[CLS]/O इस/O वर्ष/O के/O बाद/O भारत/B-LOCATION के/O लिए/O बहुत/O ही/O शुभ/O और/O खुश/O ##नुमा/O साबित/O होगा/O ।/O [SEP]/O


In [26]:
import json 
result = json.dumps(result, ensure_ascii=False)
print(result)

"[CLS]/O इस/O वर्ष/O के/O बाद/O भारत/B-LOCATION के/O लिए/O बहुत/O ही/O शुभ/O और/O खुश/O ##नुमा/O साबित/O होगा/O ।/O [SEP]/O"


In [27]:
test_sentence = "अकबर ईद पर टेनिस खेलता है"
predictions = ner_predict(test_sentence, model, tokenizer, labels_dict)

for token, label in predictions:
    print(f"{token}: {label}")

[CLS]: O
अकबर: B-PERSON
ईद: O
पर: O
टेनिस: B-GAME
खेलता: O
है: O
[SEP]: O


In [33]:
def response_parse(data):
    data = data.encode('utf-8').decode('unicode_escape')
    data = data.replace('"', '')
    data = data.replace('[', '')
    data = data.replace(']', '')
    data = data[1:-1]
    data = data.split('}, {')
    return data

In [35]:
import requests
url = "http://127.0.0.1:8000/get_ner_tags"
payload = json.dumps({
    "sentence": "अकबर ईद पर टेनिस खेलता है"
    })

headers = {
    'Content-Type': 'application/json'
    }
response = requests.request("POST", url,headers=headers, data=payload)
print(response_parse(response.text))


['erm: CLS, label: O,term: à¤\x85à¤\x95à¤¬à¤°, label: B-PERSON,term: à¤\x88à¤¦, label: O,term: à¤ªà¤°, label: O,term: à¤\x9fà¥\x87à¤¨à¤¿à¤¸, label: B-GAME,term: à¤\x96à¥\x87à¤²à¤¤à¤¾, label: O,term: à¤¹à¥\x88, label: O,term: SEP, label: ']
