# Text Clustering using TF-IDF Vectorizer 

In [1]:
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter 
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

In [14]:
def preprocessing_data(sentence):
    # Remove HTML tags
    modified_sentence = re.sub(r'<.*?>', ' ', sentence)
    
    # Remove punctuation
    modified_sentence = ''.join([i if i not in string.punctuation else ' ' for i in modified_sentence])
    
    # Remove digits
    modified_sentence = re.sub(r'\d+', ' ', modified_sentence)
     # Remove extra whitespaces
    modified_sentence = re.sub(r'\s+', ' ', modified_sentence)
    
    # Convert to lowercase
    modified_sentence = modified_sentence.lower()
    
    # Tokenization
    tokens = word_tokenize(modified_sentence)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    modified_sentence = ' '.join(lemmatized_tokens)
    return modified_sentence

In [15]:
preprocessed_data = [preprocessing_data(sentence) for sentence in dataset]

In [16]:
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(preprocessed_data) 

In [17]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Display the document and its predicted cluster in a table 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_data, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 

# Print top terms per cluster 
print("\nTop terms per cluster:") 
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
terms = vectorizer.get_feature_names_out() 
for i in range(k): 
    print("Cluster %d:" % i) 
    for ind in order_centroids[i, :10]: 
        print(' %s' % terms[ind]) 
    print() 

  super()._check_params_vs_input(X, default_n_init=10)


Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         0
enjoy hiking camping mountain                         1
like read book watch movie                            0
prefer playing video game sport                       0
love listening music going concert                    0

Top terms per cluster:
Cluster 0:
 love
 playing
 football
 weekend
 going
 sport
 music
 concert
 video
 game

Cluster 1:
 camping
 enjoy
 hiking
 mountain
 weekend
 listening
 concert
 football
 game
 going



In [18]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.8


# Text Cluster using WORD2VEC Vectorizer

In [19]:
import numpy as np 
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter 
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [20]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

In [22]:
tokenized_dataset = [doc.split() for doc in preprocessed_data] 
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4) 

In [23]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in preprocessed_data]) 

In [24]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 

# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Tabulate the document and predicted cluster 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(preprocessed_data, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 

  super()._check_params_vs_input(X, default_n_init=10)


Document                              Predicted Cluster
----------------------------------  -------------------
love playing football weekend                         1
enjoy hiking camping mountain                         0
like read book watch movie                            0
prefer playing video game sport                       1
love listening music going concert                    0


In [25]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.6
