TEXT CLUSTERING USING TF-IDF VECTORIZER

In [None]:
'''
Do the Purity differ when applying text preprocessing before vectorization?
Answer: yes, for TF-IDF, text preprecessing caused purity to increase from 0.6 to 0.8. However,
for Word2Vec, text preprocessing casued purity to decrease from 0.8 to 0.6.
'''

In [13]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [15]:
# Create the documents
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

In [33]:
# Text preprocessing on list of documents

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define preprocessing functions

def clean_text(text):
    return text.lower()

def remove_stopwords(text):
    words = word_tokenize(text)
    return [word for word in words if word not in stop_words]

def stem_words(words):
    return [stemmer.stem(word) for word in words]

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_words(words):
    pos_tags = pos_tag(words)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

# Now apply the functions step-by-step to each document
all_tokens = []

for doc in dataset:
    text = clean_text(doc)
    words = remove_stopwords(text)
    words = stem_words(words)
    words = lemmatize_words(words)
    tokens = words  # Already tokenized
    all_tokens.append(tokens)

# Display the tokenized documents
for idx, tokens in enumerate(all_tokens):
    print(f"{tokens}")

['love', 'play', 'footbal', 'weekend']
['enjoy', 'hike', 'camp', 'mountain']
['like', 'read', 'book', 'watch', 'movi']
['prefer', 'play', 'video', 'game', 'sport']
['love', 'listen', 'music', 'go', 'concert']


In [47]:
# Vectorize the dataset
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tokens)

In [49]:
# Perform clustring, and define the number of clusters.
k = 2
km = KMeans(n_clusters=k)
km.fit(X)

In [51]:
# Predict the clusters for each document
y_pred = km.predict(X)

In [53]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    0


In [55]:
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
print()


Top terms per cluster:
Cluster 0:
 music
 love
 go
 concert
 listen
Cluster 1:
 listen
 music
 love
 go
 concert



In [57]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.8


TEXT CLUSTERING USING WORD2VEC VECTORIZER

In [63]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [65]:
# Train Word2Vec model
tokenized_dataset = [doc.split() for doc in tokens]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)

In [67]:
# Create document embeddings
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in tokens])

In [69]:
# Perform clustering, and define the number of clusters
k = 2
km = KMeans(n_clusters=k)
km.fit(X)



In [71]:
# Predict the clusters for each document
y_pred = km.predict(X)

In [73]:
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(tokens, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document      Predicted Cluster
----------  -------------------
love                          1
listen                        0
music                         1
go                            0
concert                       1


In [75]:
# Evaluate results. Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6
