### **Stop words**

In [1]:
stop_words  = open("stopwords.txt", "r")

In [2]:
stop_words_list = []
for line in stop_words:
    line = line.strip('\n').split()
    stop_words_list.append(line[0])

#### Statements to be imported

In [3]:
import inltk
from inltk.inltk import tokenize, get_embedding_vectors, predict_next_words, get_sentence_encoding, get_sentence_similarity, get_similar_sentences

In [4]:
import pyiwn
iwn = pyiwn.IndoWordNet()

2023-08-22:21:59:27,390 INFO     [iwn.py:43] Loading hindi language synsets...


### **Preprocessing Data**

In [5]:
def preprocess_sentence(sentence):
    tokenized_ips = tokenize(sentence, 'hi')
    for i in range(len(tokenized_ips)):
        tokenized_ips[i] = tokenized_ips[i][1:]
    preprocessed_ips = ""
    for token in tokenized_ips:
        if token not in stop_words_list:
            preprocessed_ips += (" "+token)
    return preprocessed_ips

In [6]:
import pandas as pd
from googletrans import Translator

import nltk
import string
import re

from nltk.stem import WordNetLemmatizer

from englisttohindi.englisttohindi import EngtoHindi

In [7]:
def HindiLemmetizer(word):
    temp=translator.translate(word).text
    updated_temp=lemmatizer.lemmatize(temp)
    return (EngtoHindi(updated_temp)).convert

### **pyiwn**

#### Data Gathering

In [8]:
import operator

def generate_synonyms_set(root_word, input_sentence, lst):
    # amb_word = HindiLemmetizer(amb_word)
    synsets = iwn.synsets(root_word)
    syn_head_sim = {}
    tst = []
    
    for synset in synsets:
        head = synset.head_word()
        ratio = get_sentence_similarity(synset.examples()[0], input_sentence ,'hi')
        print(head, ":", synset.examples()[0], ratio)
        
        if (head not in syn_head_sim):
            syn_head_sim[head] = ratio
        elif (head in syn_head_sim) and syn_head_sim[head] < ratio:
            syn_head_sim[head] = ratio
            
        if head == root_word and len(synset.lemma_names()) > 1:
            tst.append((synset.lemma_names()[1], ratio))
        
    print("Before:", syn_head_sim)
    syn_head_sim = dict(sorted(syn_head_sim.items(), key=operator.itemgetter(1),reverse=True))
    synonyms_list = []
    
    print("Printing dict:", syn_head_sim)
    for x in list(syn_head_sim)[0:7]:
        if x == root_word:
            for i, j in tst:
                print(i, j)
                print(syn_head_sim[x])
                if j == syn_head_sim[x]:
                    print("True")
                    lst.append(i)
        synonyms_list.append(x)
    
    print(synonyms_list)
    print("lst:", lst)
    return synonyms_list

### **Cosine Similarity**

In [9]:
def generate_sense_replaced_sentences(synonyms_set, input_sentence, amb_word):
    sentences_dict = {}
    for syn in synonyms_set:
        dummy = input_sentence
        sentences_dict[syn] = dummy.replace(amb_word, syn)
    return sentences_dict

In [10]:
def generate_highly_similar_sentences(sentences_dict):
    high_sim_sent = {}
    for sense, sent in sentences_dict.items():
        sim = get_sentence_similarity(sent, input_sentence, 'hi')
        if sim >= 0.9:
            print(">0.9->", sim, sent)
            high_sim_sent[sense] = sent
    return high_sim_sent

### **MuRIL**

#### Model

In [11]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance

In [12]:
def get_model(model_url, max_seq_length):
    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
    )

    muril_layer = hub.KerasLayer(model_url, trainable=True)
    outputs = muril_layer(inputs)

    assert 'sequence_output' in outputs
    assert 'pooled_output' in outputs
    assert 'encoder_outputs' in outputs
    assert 'default' in outputs
    return tf.keras.Model(inputs=inputs,outputs=outputs["pooled_output"]), muril_layer

In [13]:
def create_input(input_strings, tokenizer, max_seq_length):
    input_ids_all, input_mask_all, input_type_ids_all = [], [], []
    for input_string in input_strings:
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequence_length = min(len(input_ids), max_seq_length)
    
        if len(input_ids) >= max_seq_length:
            input_ids = input_ids[:max_seq_length]
        else:
            input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

        input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

        input_ids_all.append(input_ids)
        input_mask_all.append(input_mask)
        input_type_ids_all.append([0] * max_seq_length)
  
    return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)

In [14]:
def encode(input_text):
    input_ids, input_mask, input_type_ids = create_input(input_text, 
                                                       tokenizer, 
                                                       max_seq_length)
    inputs = dict(
      input_word_ids=input_ids,
      input_mask=input_mask,
      input_type_ids=input_type_ids,
    )
    return muril_model(inputs)

In [16]:
max_seq_length = 128
muril_model, muril_layer = get_model(model_url="https://tfhub.dev/google/MuRIL/1", max_seq_length=max_seq_length)
     

vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

2023-08-22:22:00:49,24 INFO     [resolver.py:419] Downloading TF-Hub Module 'https://tfhub.dev/google/MuRIL/1'.
2023-08-22:22:01:08,453 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 33.02MB
2023-08-22:22:01:24,509 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 73.02MB
2023-08-22:22:01:39,966 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 103.02MB
2023-08-22:22:01:56,803 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 133.02MB
2023-08-22:22:02:16,723 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 173.02MB
2023-08-22:22:02:37,728 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 213.02MB
2023-08-22:22:02:56,823 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 253.02MB
2023-08-22:22:03:12,691 INFO     [resolver.py:157] Downloading https://tfhub.dev/google/MuRIL/1: 283.02MB
2023-08-22:22:03:29,48 INFO     [resolver.

#### Returning best Euclidean distance sentence

In [17]:
def tokenize_sentences(ppc_high_sim_sent):
    tokenized_sentences = []
    for sentence in ppc_high_sim_sent:
        s = sentence.strip()
        tokenized_sentences.append(s.split(" "))
    return tokenized_sentences

In [18]:
def generate_muril_output(sent):
    embeddings = encode(sent)
    tot_dist = 0
    print("->", sent)
    for index in range(len(sent)):
        tot_dist += distance.euclidean(np.array(embeddings[index]), np.array(embeddings[(index+1)%len(sent)]))
        print(sent[index], "\t", sent[(index+1)%len(sent)], "\t:", distance.euclidean(np.array(embeddings[index]), np.array(embeddings[(index+1)%len(sent)])))
    print("avg : ", tot_dist/len(sent), end = "\n\n")
    return tot_dist/len(sent)

### **WSD Model**

In [19]:
def wsd_model(input_str, amb_word, root_word):    
    lst = []
    synonyms_set = generate_synonyms_set(root_word, input_str, lst)
    print("Synonym_set:",synonyms_set)
    sentences_dict = generate_sense_replaced_sentences(synonyms_set, input_sentence, amb_word)
    print("Sentence Dict:",sentences_dict)
    
    high_sim_sent = generate_highly_similar_sentences(sentences_dict)
    print("Hig_similar Sentence:",high_sim_sent)
    senses = []
    for sense in high_sim_sent.keys():
        senses.append(sense)
    
    ppc_high_sim_sent = []
    for sent in high_sim_sent.values():
        ppc_high_sim_sent.append(preprocess_sentence(sent))
        
    tokenized_sentences = tokenize_sentences(ppc_high_sim_sent)
    
    euclidean_dist = {}
    for index in range(len(tokenized_sentences)):
        euclidean_dist[senses[index]] = generate_muril_output(tokenized_sentences[index])
        
    if bool(euclidean_dist) == False:
        return input_str
    best_sense = list(euclidean_dist.keys())[0]
    min_dist = list(euclidean_dist.values())[0]
    min_dist
    for sense, dist in euclidean_dist.items():
        if min_dist > dist:
            min_dist = dist
            best_sense = sense
            
    if best_sense == root_word and len(lst):
        best_sense = lst[0]
    
    input_str = input_str.replace(amb_word, best_sense)
    return input_str

### Input

In [30]:
input_sentence = input("Enter input sentence:")
print(input_sentence)

Enter input sentence: ज़्यादा खाना मत परोसिए, बस कीजिए ।


ज़्यादा खाना मत परोसिए, बस कीजिए ।


In [31]:
amb_word = input("Enter ambiguous word:")
print(amb_word)

Enter ambiguous word: बस


बस


In [32]:
root_word = input("Enter root word:")
print(root_word)

Enter root word: बस


बस


### Output

In [23]:
wsd_model(input_sentence, amb_word, root_word)

क़ब्ज़ा : अब किले पर सैनिकों का क़ब्ज़ा है । 0.42797377705574036


केवल : इस समय केवल भगवान ही उसकी सहायता कर सकते हैं । 0.5418449640274048


बस : बस, ट्रेन आदि आम जनता के यातायात के सर्वोत्तम साधन हैं । 0.4440802335739136


बस : ज़्यादा खाना मत परोसिए, बस कीजिए । 1.0


अकेला : वह घर पर अकेला है। 0.2919095456600189


बस : बस सभी संजाल यंत्रों को जोड़ने के लिए बसबार का उपयोग करता है । 0.5846896767616272


सिर्फ़ : दीदी की सगाई में सिर्फ़ दस लोग आए थे । 0.4237293303012848


सिर्फ़ : मैं उनसे सिर्फ़ कह सकता था बाकी उनकी मर्ज़ी । 0.48274025321006775
Before: {'क़ब्ज़ा': 0.42797377705574036, 'केवल': 0.5418449640274048, 'बस': 1.0, 'अकेला': 0.2919095456600189, 'सिर्फ़': 0.48274025321006775}
Printing dict: {'बस': 1.0, 'केवल': 0.5418449640274048, 'सिर्फ़': 0.48274025321006775, 'क़ब्ज़ा': 0.42797377705574036, 'अकेला': 0.2919095456600189}
अलम् 1.0
1.0
True
बस टोपोलोजी 0.5846896767616272
1.0
['बस', 'केवल', 'सिर्फ़', 'क़ब्ज़ा', 'अकेला']
lst: ['अलम्']
Synonym_set: ['बस', 'केवल', 'सिर्फ़', 'क़ब्ज़ा', 'अकेला']
Sentence Dict: {'बस': 'ज़्यादा खाना मत परोसिए, बस कीजिए ।', 'केवल': 'ज़्यादा खाना मत परोसिए, केवल कीजिए ।', 'सिर्फ़': 'ज़्यादा खाना मत परोसिए, सिर्फ़ कीजिए ।', 'क़ब्ज़ा': 'ज़्यादा खाना मत परोसिए, क़ब्ज़ा कीजिए ।', 'अकेला': 'ज़्यादा खाना मत परोसिए, अकेला कीजिए ।'}


>0.9-> 1.0 ज़्यादा खाना मत परोसिए, बस कीजिए ।


Hig_similar Sentence: {'बस': 'ज़्यादा खाना मत परोसिए, बस कीजिए ।'}
-> ['ज़्यादा', 'खाना', 'मत', '', 'िए', '', 'बस', 'कीजिए']
ज़्यादा 	 खाना 	: 0.01463143341243267
खाना 	 मत 	: 0.01074183452874422
मत 	  	: 0.010076195932924747
 	 िए 	: 0.01757214590907097
िए 	  	: 0.01757214590907097
 	 बस 	: 0.00843694806098938
बस 	 कीजिए 	: 0.012659317813813686
कीजिए 	 ज़्यादा 	: 0.013389192521572113
avg :  0.013134901761077344



'ज़्यादा खाना मत परोसिए, अलम् कीजिए ।'

### Gradio

In [24]:
import gradio as gr

In [25]:
outputs = gr.outputs.Textbox()
app = gr.Interface(fn=wsd_model, inputs=['text','text','text'], outputs=outputs,description="Hindi Word Sense Disambiguation")

In [26]:
app.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


