In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [2]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [3]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [5]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Top terms per cluster:
Cluster 0:
 to
 and
 movies
 books
 camping
 enjoy
 hiking
 in
 like
 watch

Cluster 1:
 playing
 on
 football
 weekends
 sports
 prefer
 over
 video
 games
 the



In [6]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


In [9]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [10]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [11]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)

In [12]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in dataset])

In [13]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

In [14]:
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1


In [15]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


In [16]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize and remove stopwords
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)


In [17]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np

# Apply preprocessing
processed_dataset = [preprocess_text(doc) for doc in dataset]
tokenized_dataset = [doc.split() for doc in processed_dataset]

# Word2Vec
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)
X = np.array([np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv], axis=0) for doc in tokenized_dataset])

# KMeans Clustering
k = 2
km = KMeans(n_clusters=k)
km.fit(X)
y_pred = km.predict(X)

# Purity Calculation
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity after Word2Vec + preprocessing:", purity)


Purity after Word2Vec + preprocessing: 0.8


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Apply preprocessing
processed_dataset = [preprocess_text(doc) for doc in dataset]

# TF-IDF + KMeans
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_dataset)

k = 2
km = KMeans(n_clusters=k)
km.fit(X)
y_pred = km.predict(X)

# Purity Calculation
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity after TF-IDF + preprocessing:", purity)


Purity after TF-IDF + preprocessing: 0.6


In [21]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter
from tabulate import tabulate

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv('customer_complaints_1.csv')
texts = df['text'].dropna().astype(str).tolist()

# Preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Preprocess all texts
processed_texts = [preprocess(t) for t in texts]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_texts)

# Clustering with KMeans
k = 5  # You can change this based on your needs
model = KMeans(n_clusters=k, random_state=42)
model.fit(X)
labels = model.predict(X)

# Attach results to original DataFrame
df['Cluster'] = labels

# Print example of clustered data
sample_table = [["Text", "Cluster"]]
sample_table += [[row['text'][:100] + "...", row['Cluster']] for _, row in df.head(10).iterrows()]
print(tabulate(sample_table, headers="firstrow"))

# (Optional) Show top terms per cluster
print("\nTop terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}: ", ", ".join([terms[ind] for ind in order_centroids[i, :10]]))


Text                                                                                                       Cluster
-------------------------------------------------------------------------------------------------------  ---------
I used to love Comcast. Until all these constant updates. My internet and cable crash a lot at night...          1
I'm so over Comcast! The worst internet provider. I'm taking online classes and multiple times was l...          1
If I could give them a negative star or no stars on this review I would. I have never worked with an...          2
I've had the worst experiences so far since install on 10/4/16. Nothing but problems. Two no shows o...          0
Check your contract when you sign up for Comcast as their advertised offers do not match the contrac...          2
Thank God. I am changing to Dish. They gave me awesome pricing and super people to deal with. You ca...          1
I Have been a long time customer and only have Xfinity as my ISP for a while now

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vishnuram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vishnuram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
