In [87]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [88]:
dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

In [89]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [90]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
        print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Top terms per cluster:
Cluster 0:
 to

 and

 read

 watch

 movies

 like

 books

 concerts

 going

 music

Cluster 1:
 playing

 the

 weekends

 on

 football

 video

 sports

 prefer

 over

 games



In [91]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


In [119]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [121]:
dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

In [129]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,window=5, min_count=1, workers=4)

In [135]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in dataset])

In [137]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0




In [117]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


In [155]:
import pandas as pd
import re
import string
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from collections import Counter
from tabulate import tabulate
import numpy as np

# Load dataset
df = pd.read_csv("customer_complaints_1.csv")

# Preprocessing for Word2Vec
def preprocess_and_tokenize(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = text.strip().split()
    return tokens

df['tokens'] = df['text'].astype(str).apply(preprocess_and_tokenize)

# Train Word2Vec model
w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4, sg=1, seed=42)

# Average word vectors for each document
def average_word_vector(tokens, model, vector_size):
    valid_tokens = [token for token in tokens if token in model.wv]
    if not valid_tokens:
        return np.zeros(vector_size)
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

vector_size = 100
X_w2v = np.array([average_word_vector(tokens, w2v_model, vector_size) for tokens in df['tokens']])

# KMeans clustering
k = 3
km_w2v = KMeans(n_clusters=k, random_state=42)
y_pred_w2v = km_w2v.fit_predict(X_w2v)

# Show first 10 results
print("Word2Vec Clustering Results:\n")
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc[:60] + "...", cluster] for doc, cluster in zip(df['text'], y_pred_w2v)])
print(tabulate(table_data[:10], headers="firstrow"))

# Mock purity (based on majority class size)
cluster_counts = Counter(y_pred_w2v)
purity = max(cluster_counts.values()) / len(y_pred_w2v)
print("\nPurity:", purity)


Word2Vec Clustering Results:

Document                                                           Predicted Cluster
---------------------------------------------------------------  -------------------
I used to love Comcast. Until all these constant updates. My...                    0
I'm so over Comcast! The worst internet provider. I'm taking...                    2
If I could give them a negative star or no stars on this rev...                    0
I've had the worst experiences so far since install on 10/4/...                    0
Check your contract when you sign up for Comcast as their ad...                    0
Thank God. I am changing to Dish. They gave me awesome prici...                    0
I Have been a long time customer and only have Xfinity as my...                    1
There is a malfunction on the DVR manager which is preventin...                    0
Charges overwhelming. Comcast service rep was so ignorant an...                    1

Purity: 0.5263157894736842


