In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer

from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def normalize_text(text):
    """Return a canonical, lower-cased, single-spaced form of ``text``.

    The string is NFKC-normalised (compatibility characters are folded to
    their canonical equivalents), lower-cased, and every run of whitespace
    is collapsed to one space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        The normalised string; an empty or whitespace-only input yields "".
    """
    folded = unicodedata.normalize("NFKC", text).lower()
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as a single separator.
    return " ".join(folded.split())


In [None]:
# Precomposed nukta letters -> bare consonant. Written with escapes so the
# table does not depend on how an editor happened to store the literals.
_NUKTA_PRECOMPOSED = str.maketrans({
    "\u0958": "\u0915",  # QA   -> KA
    "\u0959": "\u0916",  # KHHA -> KHA
    "\u095A": "\u0917",  # GHHA -> GA
    "\u095B": "\u091C",  # ZA   -> JA
    "\u095C": "\u0921",  # DDDHA -> DDA
    "\u095D": "\u0922",  # RHA  -> DDHA
    "\u095E": "\u092B",  # FA   -> PHA
    "\u0931": "\u0930",  # RRA  -> RA
    "\u095F": "\u092F",  # YYA  -> YA
})

# The same nine consonants followed by a combining nukta (U+093C). This is the
# form produced by NFC/NFKC for U+0958..U+095F.
_NUKTA_DECOMPOSED_RE = re.compile(
    "([\u0915\u0916\u0917\u091C\u0921\u0922\u092B\u0930\u092F])\u093C"
)

# Homorganic nasal + virama + stop  ->  anusvara + stop.
_NASAL_MAP = {
    'ङ्ग': 'ंग', 'ङ्घ': 'ंघ', 'ङ्क': 'ंक', 'ङ्ख': 'ंख',
    'ञ्च': 'ंच', 'ञ्छ': 'ंछ', 'ञ्ज': 'ंज', 'ञ्झ': 'ंझ',
    'ण्ड': 'ंड', 'ण्ठ': 'ंठ', 'ण्ट': 'ंट', 'ण्ढ': 'ंढ',
    'न्द': 'ंद', 'न्ध': 'ंध', 'न्त': 'ंत', 'न्थ': 'ंथ',
    'म्प': 'ंप', 'म्भ': 'ंभ',
}

# Any character repeated two or more times in a row.
_REPEAT_RE = re.compile(r'(.)\1+')

# Honorific / epithet tokens that are dropped when they stand alone.
_MODIFIERS = ['जी', 'दाइ', 'चोर', 'भाइ']

# Python's ``\w`` (and therefore ``\b``) does not cover Devanagari combining
# signs (vowel signs, virama, anusvara, nukta ...), so ``\b`` reports a "word
# boundary" in the middle of a Devanagari word. Token edges are therefore
# spelled out explicitly: ``\w`` plus the Devanagari sign ranges and ZWNJ/ZWJ.
_WORD_CHARS = r"\w\u0900-\u0903\u093A-\u094F\u0951-\u0957\u0962\u0963\u200C\u200D"
_MODIFIER_RE = re.compile(
    r"(?<![" + _WORD_CHARS + r"])"
    r"(?:" + "|".join(re.escape(m) for m in _MODIFIERS) + r")"
    r"(?![" + _WORD_CHARS + r"])"
)


def devanagari_lexical_normalization(text):
    """Reduce common Devanagari spelling variants of ``text`` to one form.

    Steps, in order:

    1. Nukta normalisation: the nine nukta consonants are replaced by their
       plain consonant, whether they arrive precomposed (e.g. U+0958) or as
       consonant + U+093C.
    2. Nasal normalisation: nasal + virama + stop clusters listed in
       ``_NASAL_MAP`` are rewritten with an anusvara.
    3. Any character repeated consecutively is collapsed to one occurrence.
       NOTE: the pattern is ``(.)`` so this also collapses repeated digits
       and punctuation, not only letters.
    4. The standalone modifier tokens in ``_MODIFIERS`` are removed. A
       modifier is only removed when it is a whole token; occurrences inside
       a longer word are left alone.
    5. Whitespace runs are collapsed and the result is stripped.

    Parameters
    ----------
    text : str
        Input string (expected to be already lower-cased / NFKC-normalised
        by the caller, but neither is required).

    Returns
    -------
    str
        The normalised string. May be empty if the input consisted only of
        modifiers and/or whitespace.
    """
    # 1. Nukta normalization (both encodings of the same letter).
    text = text.translate(_NUKTA_PRECOMPOSED)
    text = _NUKTA_DECOMPOSED_RE.sub(r"\1", text)

    # 2. Nasal normalization.
    for cluster, replacement in _NASAL_MAP.items():
        text = text.replace(cluster, replacement)

    # 3. Collapse repeated characters.
    text = _REPEAT_RE.sub(r"\1", text)

    # 4. Drop standalone entity modifiers.
    text = _MODIFIER_RE.sub("", text)

    # 5. Collapse whitespace last so that gaps left by removed modifiers do
    #    not survive as double spaces.
    return " ".join(text.split())


In [None]:
# The CSV is read without a header row; the four column names are supplied here.
dataset_path = 'dataset/training_dataset.csv'
df = pd.read_csv(
    dataset_path,
    header=None,
    names=["comment", "target", "aspect", "sentiment"],
)

# Every comment is cast to str, then passed through the generic text
# normaliser followed by the Devanagari-specific one.
comments = [
    devanagari_lexical_normalization(normalize_text(raw_comment))
    for raw_comment in df["comment"].astype(str)
]

print("Total comments:", len(comments))


Total comments: 4414


In [8]:
# Checkpoint used for both the tokenizer and the token-classification model.
ner_model_name = "Davlan/xlm-roberta-base-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
model = AutoModelForTokenClassification.from_pretrained(ner_model_name)

# "simple" aggregation makes the pipeline return grouped entity spans
# (the downstream loop reads "entity_group", "word" and "score" from them).
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 199/199 [00:01<00:00, 140.63it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mXLMRobertaForTokenClassification LOAD REPORT[0m from: Davlan/xlm-roberta-base-ner-hrl
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [None]:
# comment_entities[i] holds the accepted entities for comments[i];
# entity_frequency counts how often each normalised surface form was accepted.
comment_entities = []
entity_frequency = defaultdict(int)

# Entities whose pipeline score is below this value are discarded.
CONFIDENCE_THRESHOLD = 0.65

for comment in comments:
    detected = ner_pipeline(comment)

    filtered = []
    for ent in detected:
        if ent["score"] < CONFIDENCE_THRESHOLD:
            continue

        # Apply the same two normalisers that were applied to the comments so
        # that surface forms are comparable.
        word = devanagari_lexical_normalization(normalize_text(ent["word"]))

        # Normalisation can reduce a span to nothing (e.g. a span that is
        # only a modifier token or whitespace). An empty string is not a
        # usable entity and would otherwise be counted, embedded and
        # clustered like a real name.
        if not word:
            continue

        filtered.append({
            "word": word,
            "label": ent["entity_group"],
            "score": float(ent["score"]),
        })
        entity_frequency[word] += 1

    # One list per comment (possibly empty) keeps the index aligned with `comments`.
    comment_entities.append(filtered)

print("NER extraction complete")
print("Unique raw entities:", len(entity_frequency))


NER extraction complete
Unique raw entities: 530


In [11]:
# Distinct normalised entity strings, in first-seen (dict insertion) order.
unique_entities = list(entity_frequency)

print("Sample entities:")
print(unique_entities[:20])


Sample entities:
['सिंहदरबार', 'बालेन', 'सुदन गुरुङ', 'ह', 'र्के', 'ओली', 'देउवा', 'प्रचण्ड', 'बाल', 'रवि नमि', 'हर्क सम्', 'सु', 'दन गुरुङ', 'झा', 'झापा', 'कुकुर', 'के पी ओली', 'केपी शर्मा ओली', 'केपी ओलीको', 'नेपाल']


In [12]:
# Sentence-embedding checkpoint used to embed each unique entity string.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# One embedding per entry of `unique_entities`, in the same order.
entity_embeddings = embedding_model.encode(unique_entities)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 263.65it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [13]:
# Linkage distance above which clusters are not merged; with n_clusters=None
# the number of clusters is determined by this threshold.
ENTITY_DISTANCE_THRESHOLD = 0.35

clustering = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=ENTITY_DISTANCE_THRESHOLD,
    metric="cosine",
    linkage="average",
)

# cluster_ids[i] is the cluster label assigned to entity_embeddings[i].
cluster_ids = clustering.fit_predict(entity_embeddings)


In [14]:
# cluster id -> list of entity strings assigned to that cluster.
cluster_map = defaultdict(list)

for position, entity in enumerate(unique_entities):
    cluster_map[cluster_ids[position]].append(entity)

print("Total clusters:", len(cluster_map))


Total clusters: 71


In [15]:
# variant -> canonical form, where the canonical form of a cluster is its most
# frequently detected variant (the first one wins on a tie).
canonical_map = {}

for variants in cluster_map.values():
    canonical = max(variants, key=entity_frequency.__getitem__)
    canonical_map.update(dict.fromkeys(variants, canonical))


In [16]:
# Same shape as `comment_entities`, with each entity's surface form replaced
# by its canonical form (falling back to the surface form when unmapped).
normalized_comment_entities = [
    [
        {
            "canonical": canonical_map.get(ent["word"], ent["word"]),
            "label": ent["label"],
            "score": ent["score"],
        }
        for ent in ents
    ]
    for ents in comment_entities
]


In [17]:
# Collect every distinct canonical entity across all comments, then sort.
distinct_canonicals = set()
for entities in normalized_comment_entities:
    distinct_canonicals.update(ent["canonical"] for ent in entities)

dropdown_targets = sorted(distinct_canonicals)

print("\nFINAL DROPDOWN TARGETS:")
print(dropdown_targets)
print("Total dropdown targets:", len(dropdown_targets))



FINAL DROPDOWN TARGETS:
['आन्तरिक राजस्व विभाग', 'उद्योग', 'एमसीसी', 'एमाले', 'एसजीबी', 'एस्पालि', 'ओली', 'कांग्रेस', 'काङ्ग्रेस', 'किरण', 'कृषि', 'के पी ओली', 'के पी चोर', 'केदार न्यौपाने', 'केवलकार', 'गिरि बन्द', 'चन्द्र सर', 'चाटु मिडिया', 'चिन तिब्बत', 'चिनकाजी महर्जन', 'चीन', 'जिन्दाबाद', 'जु', 'ज्ञान', 'डिल्ली बजार', 'तीन', 'दक्षिण कोरिया', 'दरबारमार्ग कन्सर्ट', 'दिल भुषण पाठक', 'दुर्गा प्रसाई', 'देउबा', 'धादिंग ढोला', 'ध्रुब राठी', 'निर्मला', 'नेपाल', 'नेपाल राष्ट्र बैंकले', 'पशुपतिनाथ', 'पिएचडी', 'बंगलादेश', 'बम', 'बल', 'बहादुर', 'बाल', 'बालेन', 'बि', 'बुढीगण्डकी जलविद्युत आयोजना', 'भारत', 'माओवादी', 'माफ', 'मिटरब्याजी', 'मोदी', 'रमेश लेखक', 'रवि दाई जय घण्टी', 'रवि लामिछाने', 'राजा', 'राम', 'रावण', 'रास्वपा', 'लनाथ बादलको', 'विद्युत प्राधिकरण', 'श्रीलंका', 'सत्य', 'सम्पत्ति शुद्धीकरण', 'सर्वोच्च', 'साउदी', 'सिंहदरबार', 'सुन', 'स्विच बै', 'स्विजरल्याण्ड', 'हरे', 'हर्के माचिक्नी']
Total dropdown targets: 71
