LAB 8 TUTORIAL


In [3]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter


In [6]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [15]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
 print("Cluster %d:" % i)
 for ind in order_centroids[i, :10]:
     print(' %s' % terms[ind])
 print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games

Cluster 1:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music



In [18]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


Part 2 using word2vec

In [60]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [62]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)


In [64]:

X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in dataset])



In [66]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)



In [68]:
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1


In [70]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Purity: 0.8


Exercise Part 1 (Preprocess + Calculating Purity without Vectorization)

In [89]:
lower_case = [sentence.lower() for sentence in dataset]

print(lower_case)

['i love playing football on the weekends', 'i enjoy hiking and camping in the mountains', 'i like to read books and watch movies', 'i prefer playing video games over sports', 'i love listening to music and going to concerts']


In [91]:
import nltk
from nltk.tokenize import word_tokenize

tokenized_dataset = [word_tokenize(sentence) for sentence in lower_case]

print(tokenized_dataset)


[['i', 'love', 'playing', 'football', 'on', 'the', 'weekends'], ['i', 'enjoy', 'hiking', 'and', 'camping', 'in', 'the', 'mountains'], ['i', 'like', 'to', 'read', 'books', 'and', 'watch', 'movies'], ['i', 'prefer', 'playing', 'video', 'games', 'over', 'sports'], ['i', 'love', 'listening', 'to', 'music', 'and', 'going', 'to', 'concerts']]


In [118]:
k = 2 

km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(tokenized_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                                                       Predicted Cluster
---------------------------------------------------------------------------  -------------------
['i', 'love', 'playing', 'football', 'on', 'the', 'weekends']                                  0
['i', 'enjoy', 'hiking', 'and', 'camping', 'in', 'the', 'mountains']                           1
['i', 'like', 'to', 'read', 'books', 'and', 'watch', 'movies']                                 1
['i', 'prefer', 'playing', 'video', 'games', 'over', 'sports']                                 1
['i', 'love', 'listening', 'to', 'music', 'and', 'going', 'to', 'concerts']                    1




In [130]:
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8


Do the Purity differ when applying text preprocessing before vectorization? Yes

Exercise Part 2 using TF-IDF

In [132]:
from collections import Counter 
import pandas as pd

file_path = "customer_complaints_1.csv"
df = pd.read_csv(file_path)


In [139]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter


In [172]:
#this part is used to extract the data in the "text" column + converting the word to become string
text_data = df['text'].astype(str)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)

In [175]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)


In [180]:
y_pred = km.predict(X)

In [184]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(text_data, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
 print("Cluster %d:" % i)
 for ind in order_centroids[i, :10]:
     print(' %s' % terms[ind])
 print()


Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [187]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8947368421052632


Exercise Part using Word2Vec

In [191]:
from collections import Counter 
import pandas as pd

file_path = "customer_complaints_1.csv"
df = pd.read_csv(file_path)


In [194]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [204]:
text_data = df['text'].astype(str)
word2vec_model = Word2Vec(sentences=text_data, vector_size=100,
window=5, min_count=1, workers=4)

In [212]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

def sentence_vector(tokens):
    valid_words = [word for word in tokens if word in word2vec_model.wv]
    if not valid_words:
        return np.zeros(100)
    return np.mean([word2vec_model.wv[word] for word in valid_words], axis=0)

df['vector'] = df['text'].apply(sentence_vector)

# Step 5: Convert all vectors into a 2D array for clustering or ML
X = np.vstack(df['vector'].values)

# Optional: Check shape of final matrix
print(X.shape)

(19, 100)


In [215]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)




In [218]:
y_pred = km.predict(X)

In [221]:
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(text_data, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [224]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.9473684210526315
