In [2]:
import pandas as pd

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.cluster import AgglomerativeClustering


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data = {
    "comment": [
        "किरणजीतले सेवा राम्रो गरेको छ",
        "केपी ओलीको नीति धेरै राम्रो छैन",
        "रवी लामिछाने राम्रो काम गरेका छन्"
    ]
}
df = pd.DataFrame(data)

In [3]:
ner_model_name = "xlm-roberta-base-finetuned-conll03-english"  # baseline multilingual NER
tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
model = AutoModelForTokenClassification.from_pretrained(ner_model_name)

OSError: xlm-roberta-base-finetuned-conll03-english is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [None]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [None]:
all_comments = df['comment'].tolist()

comment_entities = []  # store entities per comment
for comment in all_comments:
    entities = ner_pipeline(comment)
    # Filter low confidence entities
    filtered = [e for e in entities if e['score'] >= 0.8]
    # Each entity: text, label, score
    comment_entities.append(filtered)

# ---- Step 3: Extract unique entities ----
unique_entities = set()
for ents in comment_entities:
    for e in ents:
        unique_entities.add(e['word'].lower().replace(" ", ""))

unique_entities = list(unique_entities)
print(f"Unique entities before clustering: {unique_entities}")

# ---- Step 4: Embed entities using multilingual sentence embeddings ----
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
entity_embeddings = embedding_model.encode(unique_entities, convert_to_tensor=True)

# ---- Step 5: Cluster embeddings to merge similar entities ----
# Example: Agglomerative clustering with cosine distance
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.35, affinity='cosine', linkage='average')
cluster_ids = clustering.fit_predict(entity_embeddings.cpu().numpy())  # numpy array

# ---- Step 6: Map variants to canonical target ----
cluster_map = {}
for idx, cluster_id in enumerate(cluster_ids):
    cluster_map.setdefault(cluster_id, []).append(unique_entities[idx])

canonical_targets = {}
for cluster_id, variants in cluster_map.items():
    canonical_name = max(variants, key=len)  # can change to most frequent if needed
    for v in variants:
        canonical_targets[v] = canonical_name

# ---- Step 7: Replace entity mentions in comments with canonical targets ----
normalized_comment_entities = []
for ents in comment_entities:
    normalized = []
    for e in ents:
        key = e['word'].lower().replace(" ", "")
        canonical = canonical_targets.get(key, key)
        normalized.append({"canonical": canonical, "score": e['score'], "label": e['entity_group']})
    normalized_comment_entities.append(normalized)

# ---- Step 8: Prepare final dropdown list for GUI ----
dropdown_targets = list(set([e['canonical'] for ents in normalized_comment_entities for e in ents]))
print("Dropdown targets:", dropdown_targets)