In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load Excel file (Install openpyxl for loading (!pip install openpyxl))
df = pd.read_excel("D:\\IndiaAI\\Champions Code\\Dataset\\Champion_Data.xlsx")
df[['Gims_CategoryName','GrievanceDescription', 'FinalReply']].head(15)
#df.shape

Unnamed: 0,Gims_CategoryName,GrievanceDescription,FinalReply
0,UAM/Udyam Registration/Certificate related issues,Please find the attachment of Letter Head,"Sir,&#13;&#10; Please upload the clear im..."
1,UAM/Udyam Registration/Certificate related issues,Udyam no Udyam-br-31-0038425 My address i...,"Sir,&#13;&#10;As per your request you are advi..."
2,UAM/Udyam Registration/Certificate related issues,"To, The Udyog Aadhaar Registration Authority,...",&#2350;&#2361;&#2379;&#2342;&#2351;/&#2350;&#2...
3,UAM/Udyam Registration/Certificate related issues,"This is regarding Udyam Registration, even aft...",With the provided PAN the registered Udyam is ...
4,UAM/Udyam Registration/Certificate related issues,The grievance is regarding my Udyam Registrati...,With the provided PAN the registered Udyam is ...
5,UAM/Udyam Registration/Certificate related issues,"DEAR SIR, I AM A PROPRIETOR HAVING PAN: GZDPD7...","Madam,&#13;&#10;This is regarding your grievan..."
6,UAM/Udyam Registration/Certificate related issues,When I apply for udyam registration.. It shows...,"Sir, the following information is available in..."
7,UAM/Udyam Registration/Certificate related issues,सर / मैडम मैं सुभम अग्रवाल पुत्र फते चन्द्र ...,"Sir,&#13;&#10; As per our records an UAM is re..."
8,UAM/Udyam Registration/Certificate related issues,I am running a Cement Business and want to get...,Your Udyam registration Number is UDYAM-RJ-17...
9,Related to DCMSME Scheme,Sir mera pm vishwakarma yojna ka OTP nhi aa rh...,The Tool kit will be distributed through India...


In [4]:
# Print categories in Gims_CategoryName column and their counts
df['Gims_CategoryName'].value_counts()

Gims_CategoryName
UAM/Udyam Registration/Certificate related issues                    7663
Delayed Payment/MSEFC related issues                                  524
Loans from Banks                                                      194
Related to Scheme of KVIC                                             105
Related to MSME Scheme                                                 89
Related to  MSME-DFO                                                   72
MSME Act 2006 related Matters                                          48
Any other banking issue                                                42
Related to Public Procurement by PSUs                                  28
Related to DCMSME Scheme                                               28
Related to GST                                                         25
Related to various State Government                                    24
Insurance Claim related issues                                         16
Definition of MSMEs 

In [5]:
# Translating GrievanceDescription & FinalReply to English (Hindi/Hinglish/other → English) including lowercasing
# Uses existing libraries: deep-translator (GoogleTranslator) + langdetect for language detection.
# Note: Translation requires internet access and may be rate-limited by the provider.
# Install langauge detection and translation libraries if not already installed:(!pip install deep-translator langdetect)
import re
from functools import lru_cache
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
# Column names
CATEGORY_COL = "Gims_CategoryName"
DESC_COL = "GrievanceDescription"
REPLY_COL = "FinalReply"

# Quick script detection for Hindi (Devanagari block)
DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")

def _safe_str(x) -> str:
    if x is None or (isinstance(x, float) and pd.isna(x)) or pd.isna(x):
        return ""
    return str(x)

@lru_cache(maxsize=50000)
def _translate_cached(text: str) -> str:
    # GoogleTranslator auto-detects source when source='auto'
    return GoogleTranslator(source="auto", target="en").translate(text)

def to_english(text: str) -> str:
    text = _safe_str(text).strip()
    if not text:
        return ""
    # If the text contains Hindi characters, translate.
    if DEVANAGARI_RE.search(text):
        try:
            return _translate_cached(text)
        except Exception:
            return text
    # Otherwise, detect language; translate if not English.
    try:
        lang = detect(text)
    except LangDetectException:
        lang = "unknown"
    if lang and lang != "en" and lang != "unknown":
        try:
            return _translate_cached(text)
        except Exception:
            return text
    # Hinglish may be detected as 'en' sometimes; you can force-translate later if needed.
    return text

# Defensive checks
missing_cols = [c for c in [DESC_COL, REPLY_COL] if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected columns in df: {missing_cols}. Available columns: {list(df.columns)}")

# Translate into English columns
df[CATEGORY_COL] = df[CATEGORY_COL].apply(to_english)
df[DESC_COL] = df[DESC_COL].apply(to_english)
df[REPLY_COL] = df[REPLY_COL].apply(to_english)

# Optional: standardize to lowercase after translation (English output)

df[CATEGORY_COL] = df[CATEGORY_COL].str.lower()
df[DESC_COL] = df[DESC_COL].str.lower()
df[REPLY_COL] = df[REPLY_COL].str.lower()

# Preview
df[[CATEGORY_COL, DESC_COL, REPLY_COL]].head(10)

Unnamed: 0,Gims_CategoryName,GrievanceDescription,FinalReply
0,uam/udyam registration/certificate related issues,please find the attachment of letter head,"sir,&#13;&#10; please upload the clear im..."
1,uam/udyam registration/certificate related issues,udyam no udyam-br-31-0038425 my address i...,"sir,&#13;&#10;as per your request you are advi..."
2,uam/udyam registration/certificate related issues,"to, the udyog aadhaar registration authority,...",&#2350;&#2361;&#2379;&#2342;&#2351;/&#2350;&#2...
3,uam/udyam registration/certificate related issues,"this is regarding udyam registration, even aft...",with the provided pan the registered udyam is ...
4,uam/udyam registration/certificate related issues,the grievance is regarding my udyam registrati...,with the provided pan the registered udyam is ...
5,uam/udyam registration/certificate related issues,"dear sir, i am a proprietor having pan: gzdpd7...","madam,&#13;&#10;this is regarding your grievan..."
6,uam/udyam registration/certificate related issues,when i apply for udyam registration.. it shows...,"sir, the following information is available in..."
7,uam/udyam registration/certificate related issues,sir/madam i am subham aggarwal son of fateh ch...,"sir,&#13;&#10; as per our records an uam is re..."
8,uam/udyam registration/certificate related issues,i am running a cement business and want to get...,your udyam registration number is udyam-rj-17...
9,related to dcmsme scheme,"sir, i have not received the otp of pm vishwak...",the tool kit will be distributed through india...


In [6]:
# Advanced preprocessing for clustering (English-focused)
# Applies to GrievanceDescription & FinalReply:
# 1) remove stopwords
# 2) spelling correction
# 3) chat/slang treatment via contractions expansion (library)
# 4) URL removal
# 5) remove punctuation
# 6) remove numbers
# 7) handle repeated characters (e.g., 'pleeeease' -> 'pleease')

import string
import nltk
from nltk.corpus import stopwords
import contractions
from spellchecker import SpellChecker

# Ensure NLTK stopwords are available
try:
    _ = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

EN_STOPWORDS = set(stopwords.words("english"))

# Whitelist: domain terms/acronyms we do NOT want spell-corrected
DOMAIN_KEEP = {
    "msme", "udyam", "urn", "nic", "gst", "gstin", "pan", "aadhaar", "aadhar", "upi",
    "llp", "otp", "ifsc", "gov", "india", "portal",
}

URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
NUM_RE = re.compile(r"\b\d+\b")
REPEATED_RE = re.compile(r"(.)\1{2,}")  # 3+ repeats -> reduce

PUNCT_TABLE = str.maketrans({ch: " " for ch in string.punctuation})

spell = SpellChecker(distance=1)

def _safe_str(x) -> str:
    if x is None or (isinstance(x, float) and pd.isna(x)) or pd.isna(x):
        return ""
    return str(x)

def _only_englishish_token(tok: str) -> bool:
    # Restrict spell correction to simple alphabetic tokens
    return tok.isalpha() and tok.isascii()

@lru_cache(maxsize=200000)
def _correct_token(tok: str) -> str:
    # Keep domain terms as-is
    if tok in DOMAIN_KEEP:
        return tok
    # Spell-correct only for reasonably sized words
    if len(tok) <= 3:
        return tok
    if not _only_englishish_token(tok):
        return tok
    corrected = spell.correction(tok)
    return corrected if corrected else tok

def preprocess_text(text: str) -> str:
    text = _safe_str(text).strip()
    if not text:
        return ""

    # Lowercase (English output expected after translation cell)
    text = text.lower()

    # 4) URL removal
    text = URL_RE.sub(" ", text)

    # 3) slang/chat treatment using existing library (contractions expansion)
    # Examples: "can't" -> "cannot", "i'm" -> "i am"
    # (This does not expand SMS shortcuts like u->you; add a small map only if needed later.)
    try:
        text = contractions.fix(text)
    except Exception:
        pass

    # 7) handle repeated characters (e.g., 'pleeeease' -> 'pleease')
    # Reduce 3+ repeats to 2 repeats (keeps some emphasis but normalizes)
    text = REPEATED_RE.sub(r"\1\1", text)

    # 6) remove standalone numbers
    text = NUM_RE.sub(" ", text)

    # 5) remove punctuation
    text = text.translate(PUNCT_TABLE)

    # Tokenize (simple whitespace)
    tokens = [t for t in text.split() if t]

    # 2) spelling correction (token-wise, cached)
    corrected = [_correct_token(t) for t in tokens]

    # 1) stopword removal
    filtered = [t for t in corrected if t not in EN_STOPWORDS]

    return " ".join(filtered).strip()

# Defensive checks
missing_cols = [c for c in [DESC_COL, REPLY_COL] if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected columns in df: {missing_cols}. Available columns: {list(df.columns)}")

df[DESC_COL] = df[DESC_COL].apply(preprocess_text)
df[CATEGORY_COL] = df[CATEGORY_COL].apply(preprocess_text)
df[REPLY_COL] = df[REPLY_COL].apply(preprocess_text)

# Preview
df[[CATEGORY_COL, DESC_COL, REPLY_COL]].head(40)

Unnamed: 0,Gims_CategoryName,GrievanceDescription,FinalReply
0,uam udyam registration certificate related issues,please find attachment letter head,sir please upload clear image aadhar card pan ...
1,uam udyam registration certificate related issues,udyam udyam br address wrong want cancel prere...,sir per request advised send details signed co...
2,uam udyam registration certificate related issues,udyog aadhaar registration authority dear sir ...,grievance change registered mobile number requ...
3,uam udyam registration certificate related issues,regarding udyam registration even multiple att...,provided pan registered udyam udyam cg satan k...
4,uam udyam registration certificate related issues,grievance regarding udyam registration even mu...,provided pan registered udyam udyam cg satan k...
5,uam udyam registration certificate related issues,dear sir proprietor pan gzdpd7433l aadhar numb...,madam regarding grievance registered champions...
6,uam udyam registration certificate related issues,apply udyam registration shows udyam registrat...,sir following information available database d...
7,uam udyam registration certificate related issues,sir madam subham aggarwal son fate chandra for...,sir per records uam registered given pan detai...
8,uam udyam registration certificate related issues,running cement business want get registered ms...,udyam registration number udyam rj verify udya...
9,related dcmsme scheme,sir received otp pm vishwakarma yojana take to...,tool kit distributed india post per guideline ...


In [7]:
df['ReplyStatus'].value_counts()

ReplyStatus
Replied    8878
Pending      60
Name: count, dtype: int64

In [8]:
# Filter and display FinalReply for pending entries : Might useful further for analysis or follow-up
pending_df = df[df['ReplyStatus'] == 'Pending']
pending_df[['GrievanceDescription', 'FinalReply', 'ReplyStatus','Gims_CategoryName']].head(15)

Unnamed: 0,GrievanceDescription,FinalReply,ReplyStatus,Gims_CategoryName
181,kerala state electricity board awarded rate co...,,Pending,related various state government
739,trademark application natraj class application...,,Pending,related various ministries
1191,writing formally file grievance regarding reje...,,Pending,loans banks
1232,tender id ape procurement portal participated ...,,Pending,related various state government
1292,pan card issue,,Pending,related tool rooms
1525,issue occurring every time try submit applicat...,,Pending,related tool rooms damn matters
1687,till date application converted case applicati...,,Pending,delayed payment msefc related issues
1801,getting proper response customs unnecessarily ...,,Pending,related various state government
1843,r sir today checking firm status portal show f...,,Pending,related tool rooms
2465,director msme dfo ministry small medium enterp...,,Pending,related msme dfo


In [11]:
# separating category which name is 'uam udyam registration certificate related issues' -  main category which have largest number of entries
dfx = df[df[CATEGORY_COL]=='uam udyam registration certificate related issues']
dfx.head(15)
#dfx.shape

Unnamed: 0,GrievanceNo,Gims_CategoryName,State,CreatedDate,ReplyStatus,GrievanceDescription,FinalReply,FinalReplyDate
0,UP00135828,uam udyam registration certificate related issues,UTTAR PRADESH,2024-12-30,Replied,please find attachment letter head,sir please upload clear image aadhar card pan ...,2024-12-31
1,BR00135827,uam udyam registration certificate related issues,BIHAR,2024-12-30,Replied,udyam udyam br address wrong want cancel prere...,sir per request advised send details signed co...,2024-12-31
2,MH00135826,uam udyam registration certificate related issues,MAHARASHTRA,2024-12-30,Replied,udyog aadhaar registration authority dear sir ...,grievance change registered mobile number requ...,2025-01-01
3,CG00135825,uam udyam registration certificate related issues,CHHATTISGARH,2024-12-30,Replied,regarding udyam registration even multiple att...,provided pan registered udyam udyam cg satan k...,2025-01-03
4,CG00135824,uam udyam registration certificate related issues,CHHATTISGARH,2024-12-30,Replied,grievance regarding udyam registration even mu...,provided pan registered udyam udyam cg satan k...,2025-01-03
5,JH00135823,uam udyam registration certificate related issues,JHARKHAND,2024-12-30,Replied,dear sir proprietor pan gzdpd7433l aadhar numb...,madam regarding grievance registered champions...,2025-01-02
6,OD00135822,uam udyam registration certificate related issues,ODISHA,2024-12-30,Replied,apply udyam registration shows udyam registrat...,sir following information available database d...,2024-12-31
7,UP00135821,uam udyam registration certificate related issues,UTTAR PRADESH,2024-12-30,Replied,sir madam subham aggarwal son fate chandra for...,sir per records uam registered given pan detai...,2024-12-31
8,RJ00135820,uam udyam registration certificate related issues,RAJASTHAN,2024-12-30,Replied,running cement business want get registered ms...,udyam registration number udyam rj verify udya...,2025-01-01
10,TN00135818,uam udyam registration certificate related issues,TAMIL NADU,2024-12-30,Replied,dear sir madam great day uan number tn16a00914...,data updated successfully requested,2025-01-02


In [12]:
pip install bertopic sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [15]:
from bertopic import BERTopic
model = BERTopic(verbose=True, embedding_model="all-MiniLM-L6-v2")
topics, probs = model.fit_transform(dfx[DESC_COL].tolist())

results = model.get_topic_info()
results = pd.DataFrame(results)
print("Summary of the Clusters:",results)


2025-12-25 19:37:35,689 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/240 [00:00<?, ?it/s]

2025-12-25 19:38:35,771 - BERTopic - Embedding - Completed ✓
2025-12-25 19:38:35,772 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-25 19:38:37,861 - BERTopic - Dimensionality - Completed ✓
2025-12-25 19:38:37,865 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-25 19:38:38,120 - BERTopic - Cluster - Completed ✓
2025-12-25 19:38:38,130 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-25 19:38:38,337 - BERTopic - Representation - Completed ✓


Summary of the Clusters:      Topic  Count                                               Name  \
0       -1   2805                   -1_registration_number_pan_udyam   
1        0    818                0_certificate_udyam_download_mobile   
2        1    192                1_cancellation_cancel_pending_still   
3        2    166                          2_msme_certificate_id_sir   
4        3    141                 3_turnover_fy_financial_investment   
..     ...    ...                                                ...   
102    101     11  101_change_rpandidurai1993gmail_tn17a00272_kha...   
103    102     11                         102_mob_udyag_gals_mallick   
104    103     10            103_avapj0831f_ramavath_rukmini_udayami   
105    104     10                  104_dptpr0433d_udayam_trying_msme   
106    105     10                    105_business_one_week_cancelled   

                                        Representation  \
0    [registration, number, pan, udyam, aadhar, mob.

In [26]:
model.get_topic(0) # define the keywrods in the first cluster

[('certificate', np.float64(0.02887765227745697)),
 ('udyam', np.float64(0.013940060230735274)),
 ('download', np.float64(0.011771178898900244)),
 ('mobile', np.float64(0.011458672378353083)),
 ('id', np.float64(0.010851105191239429)),
 ('lost', np.float64(0.010198404781978199)),
 ('number', np.float64(0.009803228633376007)),
 ('email', np.float64(0.00948424261099637)),
 ('mail', np.float64(0.009001073555572626)),
 ('please', np.float64(0.008632637640609914))]

In [25]:
from sklearn.metrics import silhouette_score
import numpy as np

# To calculate the score, we need the document embeddings and the topic labels (clusters).
# Note: We must filter out the outlier topic (-1) as it skews the score.

# 1. Get the document embeddings by re-encoding the documents.
# This is the most reliable way to get the embeddings in the correct format.
embeddings = model.embedding_model.encode(dfx[DESC_COL].tolist(), show_progress_bar=True)

# 2. Create a DataFrame with only the documents that are NOT outliers.
# The `topics` variable comes from the `fit_transform` cell.
non_outlier_df = pd.DataFrame({
    'doc_index': range(len(topics)),
    'topic': topics
}).query('topic != -1')

# 3. Get the embeddings and topic labels for the non-outlier documents.
non_outlier_embeddings = embeddings[non_outlier_df['doc_index']]
non_outlier_topics = non_outlier_df['topic']

# 4. Check if more than one cluster was found (score requires at least 2 clusters).
if len(non_outlier_topics.unique()) > 1:
    # Calculate the Silhouette Score
    score = silhouette_score(non_outlier_embeddings, non_outlier_topics)
    print(f"Silhouette Score: {score:.4f}")
else:
    print("Cannot calculate Silhouette Score: Only one topic was found (excluding outliers).")

AttributeError: 'SentenceTransformerBackend' object has no attribute 'encode'

In [None]:
model.visualize_heatmap(n_clusters=10, top_n_topics=10)

### Visualize Document-Topic Heatmap
This heatmap shows the similarity between topics. A brighter square indicates that two topics are more related, helping you spot meta-clusters.

In [None]:
# Shows the top 8 topics by size
model.visualize_barchart(top_n_topics=8)

### Visualize Topic Keywords
This creates a bar chart showing the most important words for each topic, making it easy to compare them.

In [None]:
# This plot is interactive! You can hover over topics.
model.visualize_topics()

### Visualize Topics in 2D Space
This plot is interactive. You can hover over topics to see their details. Topics that are closer together are more semantically similar.