In [27]:
import pandas as pd
import numpy as np
import string
import itertools
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import yake
print("done")

done


In [28]:
import csv

input_file = 'clustering_data.csv'   # your original CSV
output_file = 'clustering_data_clean.csv'  # cleaned CSV

with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8', newline='') as outfile:
    
    reader = csv.reader(infile)
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    
    # Read header
    try:
        header = next(reader)
    except StopIteration:
        print("CSV is empty!")
        exit()
    
    if len(header) != 4:
        print(f"Warning: Header has {len(header)} columns, expected 4")
    
    writer.writerow(header)  # write header to cleaned file
    
    for row in reader:
        # Skip empty rows
        if not row or all(cell.strip() == '' for cell in row):
            continue
        
        # Only keep rows with exactly 4 columns
        if len(row) != 4:
            print(f"Skipping malformed row: {row}")
            continue
        
        # Write row safely (wrap text in quotes)
        writer.writerow(row)

print(f"Cleaned CSV saved as: {output_file}")


Cleaned CSV saved as: clustering_data_clean.csv


In [29]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


In [30]:
df = pd.read_csv('clustering_data_clean.csv')
df.head()


Unnamed: 0,id,complaint_text,category,timestamp
0,1,Room lights malfunction every week and nobody ...,Maintenance,2025-08-06
1,2,Mess food is bad.,Hostel,2025-07-27
2,3,Lectures are rescheduled last minute without n...,Faculty,2025-09-02
3,4,Mess staff misbehaves and ignores requests.,Hostel,2025-07-22
4,5,The water supply is highly irregular especiall...,Maintenance,2025-08-17


In [31]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['complaint_text'].apply(preprocess)
df.head()


Unnamed: 0,id,complaint_text,category,timestamp,clean_text
0,1,Room lights malfunction every week and nobody ...,Maintenance,2025-08-06,room light malfunction every week nobody attends
1,2,Mess food is bad.,Hostel,2025-07-27,mess food bad
2,3,Lectures are rescheduled last minute without n...,Faculty,2025-09-02,lecture rescheduled last minute without notifi...
3,4,Mess staff misbehaves and ignores requests.,Hostel,2025-07-22,mess staff misbehaves ignores request
4,5,The water supply is highly irregular especiall...,Maintenance,2025-08-17,water supply highly irregular especially morning


In [32]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [34]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = text.split()
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [35]:
df['clean_text'] = df['complaint_text'].apply(preprocess)

# Print first 5 rows to check
print(df[['complaint_text', 'clean_text']].head())


                                      complaint_text  \
0  Room lights malfunction every week and nobody ...   
1                                  Mess food is bad.   
2  Lectures are rescheduled last minute without n...   
3        Mess staff misbehaves and ignores requests.   
4  The water supply is highly irregular especiall...   

                                          clean_text  
0   room light malfunction every week nobody attends  
1                                      mess food bad  
2  lecture rescheduled last minute without notifi...  
3              mess staff misbehaves ignores request  
4   water supply highly irregular especially morning  


In [36]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
print(" Model loaded successfully!")


 Model loaded successfully!


In [37]:
embeddings = model.encode(df['clean_text'].tolist(), convert_to_numpy=True)

os.makedirs('embeddings', exist_ok=True)
np.save('embeddings/embeddings.npy', embeddings)

print(f"✅ Embeddings generated! Shape: {embeddings.shape}")

✅ Embeddings generated! Shape: (500, 384)


In [38]:
# Cell 5: Compute Similarity Matrix
from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(embeddings)
print(sim_matrix.shape)


(500, 500)


In [39]:
# Cell 6: Define Custom Similarity + Final Score
def jaccard_similarity(text1, text2):
    """
    Compute Jaccard similarity between two texts based on word overlap.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    return len(set1 & set2) / len(set1 | set2)

def final_score(i, j):
    """
    Compute the final similarity score between complaint i and complaint j.
    Combines:
    - Cosine similarity of embeddings
    - Jaccard keyword overlap
    - Category match (if available)
    """
    cosine_sim = sim_matrix[i][j]
    keyword_overlap = jaccard_similarity(df['clean_text'][i], df['clean_text'][j])
    category_match = 1 if df['category'][i] == df['category'][j] else 0
    return (0.7 * cosine_sim) + (0.2 * keyword_overlap) + (0.1 * category_match)


In [40]:
# Cell 6: Define Dynamic Threshold Function
import numpy as np

def dynamic_threshold(i, base=0.25, scale=0.2):
    """
    Compute a dynamic threshold for each complaint i based on
    the mean and standard deviation of its similarity distribution.
    This makes the threshold adaptive per complaint.
    """
    sims = sim_matrix[i]  # Ensure sim_matrix is already defined
    mean_sim = np.mean(sims)
    std_sim = np.std(sims)
    return base + scale * std_sim + 0.1 * mean_sim  # more context-sensitive threshold


In [41]:
# Cell 7: Clustering Using Dynamic Threshold
visited = set()
clusters = []

for i in range(len(df)):
    if i in visited:
        continue
    
    current_cluster = [i]
    visited.add(i)
    thr = dynamic_threshold(i)
    
    for j in range(len(df)):
        if j in visited:
            continue
        if final_score(i, j) > thr:
            current_cluster.append(j)
            visited.add(j)
    
    clusters.append(current_cluster)

# Show number of clusters formed
print(f"Clusters formed: {len(clusters)}")


Clusters formed: 34


In [54]:
import pandas as pd

results = []
threshold = 0.3  # show only pairs with final score > 0.3 (tune as needed)

for i in range(len(df)):
    for j in range(i+1, len(df)):  # avoid duplicate pairs (i,j) and (j,i)
        score = final_score(i, j)
        if score > threshold:
            results.append({
                "i": i,
                "j": j,
                "Complaint i": df['clean_text'][i],
                "Complaint j": df['clean_text'][j],
                "Cosine Similarity": round(sim_matrix[i][j], 4),
                "Jaccard Overlap": round(jaccard_similarity(df['clean_text'][i], df['clean_text'][j]), 4),
                "Category Match": 1 if df['category'][i] == df['category'][j] else 0,
                "Final Score": round(score, 4)
            })

# Sort results by final score (descending)
results_df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False)

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 50)  # show first 50 results (to avoid huge output)
display(results_df)


Unnamed: 0,i,j,Complaint i,Complaint j,Cosine Similarity,Jaccard Overlap,Category Match,Final Score
3851,64,78,sport field inaccessible due poor drainage rain,sport field waterlogged inaccessible due poor drainage,0.8796,0.7500,1,0.8657
1618,31,167,mess staff hygiene standard worrying raising food safety concern,mess staff hygiene practice questionable impacting food safety,0.9614,0.4167,1,0.8563
9057,167,181,mess staff hygiene practice questionable impacting food safety,mess staff sometimes neglect hygiene compromising food safety,0.9225,0.4545,1,0.8367
1390,28,106,sport equipment storage room often left unlocked causing security concern,sport equipment room often left unlocked raising safety concern,0.8817,0.5833,1,0.8338
283,6,20,broken chair gym unsafe use,rodded chair gym unsafe use,0.8479,0.6667,1,0.8268
...,...,...,...,...,...,...,...,...
392,7,493,water facility sport complex,sport team uniform ordering process slow size frequently unavailable,0.2620,0.0833,1,0.3000
12273,250,338,campus shuttle service frequently delayed causing student miss class,campus water fountain frequently order produce warm water,0.2450,0.1429,1,0.3000
1496,30,70,faculty feedback assignment lack detail making improvement difficult,faculty office hour clash lecture timing,0.2637,0.0769,1,0.3000
458,9,205,internet working hostel room day,unpleasant odor hostel basement due drainage issue,0.2598,0.0909,1,0.3000


In [42]:
#Check Cluster Sizes
cluster_sizes = [len(c) for c in clusters]
print("Number of clusters:", len(clusters))
print("Average cluster size:", sum(cluster_sizes)/len(cluster_sizes))
print("Clusters with size 1:", sum(1 for c in clusters if len(c) == 1))


Number of clusters: 34
Average cluster size: 14.705882352941176
Clusters with size 1: 6


In [43]:
#Calculating confidense score for clusters in detail clustering
import itertools
import numpy as np
def cluster_confidence(cluster, sim_matrix):
    if len(cluster) < 2:
        return 1.0  # single complaint = perfect confidence
    pairwise_sims = [sim_matrix[i][j] for i, j in itertools.combinations(cluster, 2)]
    return np.mean(pairwise_sims)

# Calculate confidence for all clusters
cluster_confidences = [cluster_confidence(c, sim_matrix) for c in clusters]

for i, (c, conf) in enumerate(zip(clusters, cluster_confidences)):
    print(f"Cluster {i+1} | Size: {len(c)} | Confidence: {conf:.2f}")


Cluster 1 | Size: 70 | Confidence: 0.32
Cluster 2 | Size: 41 | Confidence: 0.43
Cluster 3 | Size: 66 | Confidence: 0.39
Cluster 4 | Size: 34 | Confidence: 0.36
Cluster 5 | Size: 56 | Confidence: 0.34
Cluster 6 | Size: 30 | Confidence: 0.30
Cluster 7 | Size: 68 | Confidence: 0.36
Cluster 8 | Size: 1 | Confidence: 1.00
Cluster 9 | Size: 4 | Confidence: 0.24
Cluster 10 | Size: 3 | Confidence: 0.44
Cluster 11 | Size: 5 | Confidence: 0.39
Cluster 12 | Size: 8 | Confidence: 0.36
Cluster 13 | Size: 10 | Confidence: 0.41
Cluster 14 | Size: 7 | Confidence: 0.41
Cluster 15 | Size: 11 | Confidence: 0.35
Cluster 16 | Size: 11 | Confidence: 0.26
Cluster 17 | Size: 2 | Confidence: 0.27
Cluster 18 | Size: 14 | Confidence: 0.29
Cluster 19 | Size: 11 | Confidence: 0.25
Cluster 20 | Size: 5 | Confidence: 0.40
Cluster 21 | Size: 7 | Confidence: 0.40
Cluster 22 | Size: 1 | Confidence: 1.00
Cluster 23 | Size: 14 | Confidence: 0.37
Cluster 24 | Size: 3 | Confidence: 0.40
Cluster 25 | Size: 2 | Confidence: 0

In [44]:
def representative_complaint(cluster, df):
    # Option 1: Pick the longest complaint
    return max(cluster, key=lambda idx: len(df['complaint_text'][idx]))

# Example: pick representative for first cluster
rep_idx = representative_complaint(clusters[0], df)
print("Representative Complaint:", df['complaint_text'][rep_idx])


Representative Complaint: Hostel community kitchen appliances are frequently broken and not replaced promptly.


In [45]:
for i, cluster in enumerate(clusters, start=1):
    print(f"\n🔹 Cluster {i} | Size: {len(cluster)} | Confidence: {cluster_confidences[i-1]:.2f}")
    for idx in cluster:
        print(f"   - {df['complaint_text'][idx]} ({df['category'][idx]})")



🔹 Cluster 1 | Size: 70 | Confidence: 0.32
   - Room lights malfunction every week and nobody attends. (Maintenance)
   - The water supply is highly irregular especially in the mornings. (Maintenance)
   - The classroom fans are broken and have not been fixed for weeks. (Maintenance)
   - The HVAC system in the main library often fails leaving students uncomfortable. (Maintenance)
   - Frequent power outages affect online classes with no timely solutions. (Maintenance)
   - Water pressure in hostel bathrooms drops drastically in evening hours. (Maintenance)
   - Air conditioning in lecture halls not functioning properly. (Maintenance)
   - Common room lights flicker constantly disturbing students. (Maintenance)
   - Heating system in hostel rooms unreliable leading to cold nights. (Maintenance)
   - Frequent water leaks in hostel block C causing hazards. (Maintenance)
   - Hostel lights frequently go off at night causing safety concerns. (Hostel)
   - Repeated complaints about noisy ai

In [46]:
cluster_texts = []

for c in clusters:
    # Join all complaints in the cluster into one string
    cluster_text = " ".join(df['complaint_text'][i] for i in c)
    cluster_texts.append(cluster_text)


In [47]:
cluster_summaries = []

for text in cluster_texts:
    # Split by period
    sentences = text.split('.')
    # Take first 2–3 sentences for short summary
    short_summary = '. '.join(sentences[:3]).strip()
    cluster_summaries.append(short_summary)


In [48]:
import yake

kw_extractor = yake.KeywordExtractor(top=5, stopwords=None)

final_cluster_summaries = []

for idx, text in enumerate(cluster_texts):
    sentences = text.split('.')
    short_summary = '. '.join(sentences[:3]).strip()
    
    # Extract keywords from cleaned text
    cluster_clean_text = " ".join(df['clean_text'][i] for i in clusters[idx])
    keywords = [kw for kw, score in kw_extractor.extract_keywords(cluster_clean_text)]
    
    final_summary = f"{short_summary} | Keywords: {', '.join(keywords)}"
    final_cluster_summaries.append(final_summary)


In [49]:
print(f"Number of clusters: {len(clusters)}")
print(f"First 5 clusters: {clusters[:5]}")


Number of clusters: 34
First 5 clusters: [[0, 4, 18, 21, 25, 29, 33, 37, 41, 45, 51, 53, 57, 65, 69, 71, 75, 83, 95, 99, 101, 103, 107, 111, 115, 121, 129, 133, 136, 137, 141, 145, 153, 157, 161, 165, 169, 171, 175, 183, 187, 195, 198, 203, 207, 211, 215, 219, 225, 233, 237, 241, 245, 249, 254, 258, 262, 268, 274, 298, 310, 350, 354, 358, 382, 384, 404, 416, 452, 466], [1, 3, 23, 31, 43, 47, 55, 63, 67, 73, 81, 85, 89, 93, 97, 105, 113, 123, 127, 135, 143, 147, 155, 159, 167, 173, 177, 181, 185, 189, 193, 197, 201, 213, 217, 223, 231, 235, 243, 247, 340], [2, 8, 22, 26, 34, 42, 46, 54, 62, 66, 70, 72, 76, 80, 84, 88, 96, 100, 104, 108, 116, 122, 126, 130, 134, 142, 146, 154, 158, 162, 166, 170, 172, 180, 184, 196, 200, 204, 208, 212, 220, 222, 226, 234, 238, 242, 250, 251, 259, 267, 271, 283, 315, 335, 339, 343, 347, 375, 379, 383, 387, 399, 451, 459, 483, 499], [5, 15, 30, 38, 50, 58, 112, 120, 150, 176, 188, 192, 216, 230, 246, 255, 275, 279, 287, 299, 323, 327, 355, 371, 391, 407, 4

In [50]:
print(df.head())
print(df.columns)

   id                                     complaint_text     category  \
0   1  Room lights malfunction every week and nobody ...  Maintenance   
1   2                                  Mess food is bad.       Hostel   
2   3  Lectures are rescheduled last minute without n...      Faculty   
3   4        Mess staff misbehaves and ignores requests.       Hostel   
4   5  The water supply is highly irregular especiall...  Maintenance   

    timestamp                                         clean_text  
0  2025-08-06   room light malfunction every week nobody attends  
1  2025-07-27                                      mess food bad  
2  2025-09-02  lecture rescheduled last minute without notifi...  
3  2025-07-22              mess staff misbehaves ignores request  
4  2025-08-17   water supply highly irregular especially morning  
Index(['id', 'complaint_text', 'category', 'timestamp', 'clean_text'], dtype='object')


In [51]:
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['complaint_text'].apply(preprocess)


In [52]:
import yake

kw_extractor = yake.KeywordExtractor(top=5, stopwords=None)

final_cluster_summaries = []

for idx, c in enumerate(clusters):
    if not c:
        final_cluster_summaries.append("Empty cluster")
        continue
    
    # Combine all complaints
    cluster_text = " ".join(df['complaint_text'][i] for i in c)
    
    # Take first 2 sentences (simple split)
    sentences = cluster_text.split('.')
    short_summary = '. '.join(sentences[:2]).strip()
    
    # Keywords from cleaned text
    cluster_clean_text = " ".join(df['clean_text'][i] for i in c)
    keywords = [kw for kw, score in kw_extractor.extract_keywords(cluster_clean_text)]
    
    summary = f"{short_summary} | Keywords: {', '.join(keywords)}"
    final_cluster_summaries.append(summary)
for i, s in enumerate(final_cluster_summaries):
    print(f"Cluster {i} summary: {s}\n")


Cluster 0 summary: Room lights malfunction every week and nobody attends.  The water supply is highly irregular especially in the mornings | Keywords: hostel common room, leaving student uncomfortable, study room air, common study room, common room light

Cluster 1 summary: Mess food is bad.  Mess staff misbehaves and ignores requests | Keywords: mess food portion, mess food hygiene, concern mess food, mess staff hygiene, mess staff behavior

Cluster 2 summary: Lectures are rescheduled last minute without notifications.  Delay in releasing project grades is affecting internships | Keywords: teaching time faculty, scheduled office hour, miss scheduled office, class hour faculty, class faculty delay

Cluster 3 summary: Professors are rarely available for doubt clearance.  Faculty feedback is not constructive | Keywords: faculty grading criterion, feedback assignment lack, research space allocation, student faculty feedback, faculty research space

Cluster 4 summary: Broken chairs in the 