In [1]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter
from tabulate import tabulate

# Load dataset
df = pd.read_csv("customer_complaints_1.csv")

# Text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.strip()
    return text

df['cleaned_text'] = df['text'].astype(str).apply(preprocess_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(df['cleaned_text'])

# Clustering using KMeans
k = 3
km_tfidf = KMeans(n_clusters=k, random_state=42)
km_tfidf.fit(X_tfidf)
y_pred_tfidf = km_tfidf.predict(X_tfidf)

# Show first 10 results
print("TF-IDF Clustering Results:\n")
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc[:60] + "...", cluster] for doc, cluster in zip(df['text'], y_pred_tfidf)])
print(tabulate(table_data[:10], headers="firstrow"))

# Top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km_tfidf.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print()

# Mock purity
cluster_counts = Counter(y_pred_tfidf)
purity = max(cluster_counts.values()) / len(y_pred_tfidf)
print("Purity:", purity)


TF-IDF Clustering Results:

Document                                                           Predicted Cluster
---------------------------------------------------------------  -------------------
I used to love Comcast. Until all these constant updates. My...                    1
I'm so over Comcast! The worst internet provider. I'm taking...                    1
If I could give them a negative star or no stars on this rev...                    2
I've had the worst experiences so far since install on 10/4/...                    0
Check your contract when you sign up for Comcast as their ad...                    0
Thank God. I am changing to Dish. They gave me awesome prici...                    1
I Have been a long time customer and only have Xfinity as my...                    1
There is a malfunction on the DVR manager which is preventin...                    0
Charges overwhelming. Comcast service rep was so ignorant an...                    1

Top terms per cluster:
Cluster 0:
 s