In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from tabulate import tabulate
from collections import Counter 

In [3]:
# Load the dataset
df = pd.read_csv("customer_complaints_1.csv")   

df.head()

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...


In [5]:
# Preprocessing - Vectorizing the text column using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

# Apply KMeans clustering
k = 3  # Defining 3 clusters for example, adjust as needed
km = KMeans(n_clusters=k, random_state=42)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Prepare the output table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[text, cluster] for text, cluster in zip(df['text'], y_pred)])

# Display the table
table_result = tabulate(table_data, headers="firstrow", tablefmt="grid")
table_result

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:  # Print the top 10 terms for each cluster
        print(f' {terms[ind]}')
    print()

# Step 5: Evaluate results - Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]

purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)




Top terms per cluster:
Cluster 0:
 years
 second
 boxes
 rude
 contract
 adding
 joke
 customer
 investigating
 malfunction

Cluster 1:
 internet
 service
 comcast
 xfinity
 mbps
 rep
 tech
 speed
 told
 cable

Cluster 2:
 service
 day
 deal
 use
 billing
 just
 change
 services
 contract
 comcast

Purity: 0.5789473684210527
