In [14]:
import numpy as np 
import string
import pandas as pd
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter 

In [15]:
# Toy corpus: five short first-person sentences, one document per list entry.
data = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

In [16]:
# Function that removes punctuation characters from a document
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` without punctuation characters; an empty input yields "".
    """
    # A single join avoids building the result by repeated string concatenation.
    return "".join(char for char in text if char not in string.punctuation)

In [18]:
# Build two independent views of the raw sentences: one with punctuation
# stripped and one lower-cased (both are derived directly from `data`).
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data))

# Put the original and processed variants side by side in a table.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  
0          i love playing football on the weekends  
1      i enjoy hiking and camping in the mountains  
2            i like to read books and watch movies  
3         i prefer playing video games over sports  
4  i love listening to music and going to concerts  


In [19]:
import re


In [20]:
# Function that removes digits (\d) and hyphens (-) from a document
def remove_numbers(text):
    """Return ``text`` with all digit and hyphen characters deleted.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with every match of ``[\\d-]`` replaced by the empty string.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.sub(r"[\d-]", '', text)

In [23]:
# Strip digits and hyphens from each raw sentence.
data_remove_number = list(map(remove_numbers, data))


In [26]:
# Preprocessing chain, each stage feeding the next:
# raw -> punctuation removed -> lower-cased -> digits/hyphens removed.
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data_no_punctuation))
data_remove_number = list(map(remove_numbers, data_lower))

# One column per stage so the effect of every step can be compared.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
    ('Text without numbers', data_remove_number),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  \
0          i love playing football on the weekends   
1      i enjoy hiking and camping in the mountains   
2            i like to read books and watch movies   
3         i prefer playing video games over sports   
4  i love listening to music and going to concerts   

                         

In [27]:
import nltk
# Download the 'punkt' NLTK data package (presumably the tokenizer models
# needed by word_tokenize in the later cells — confirm for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
from nltk.tokenize import word_tokenize

In [31]:
# Preprocessing chain, each stage feeding the next:
# raw -> punctuation removed -> lower-cased -> digits/hyphens removed -> tokenized.
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data_no_punctuation))
data_remove_number = list(map(remove_numbers, data_lower))
data_token = list(map(word_tokenize, data_remove_number))

# One column per stage so the effect of every step can be compared.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
    ('Text without numbers', data_remove_number),
    ('Text tokenized', data_token),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  \
0          i love playing football on the weekends   
1      i enjoy hiking and camping in the mountains   
2            i like to read books and watch movies   
3         i prefer playing video games over sports   
4  i love listening to music and going to concerts   

                         

In [32]:
# Download the 'stopwords' NLTK data package; the next cell reads the English
# list from nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
# English stopword list from NLTK; remove_stopwords filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')

In [34]:
# Function that removes stopwords from tokenized text
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords, in original order.

    Args:
        text: Iterable of word tokens (e.g. the list produced by a tokenizer).
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``stopwords`` list is used, as before.

    Returns:
        list: The tokens from ``text`` that do not appear in the stopword collection.
    """
    if stop_words is None:
        # Default to the global list so existing one-argument calls keep working.
        stop_words = stopwords
    # A set gives constant-time membership tests instead of scanning a list per token.
    blocked = frozenset(stop_words)
    return [token for token in text if token not in blocked]

In [35]:
# Preprocessing chain, each stage feeding the next:
# raw -> punctuation removed -> lower-cased -> digits/hyphens removed
#     -> tokenized -> stopwords removed.
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data_no_punctuation))
data_remove_number = list(map(remove_numbers, data_lower))
data_token = list(map(word_tokenize, data_remove_number))
data_remove_stopword = list(map(remove_stopwords, data_token))

# One column per stage so the effect of every step can be compared.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
    ('Text without numbers', data_remove_number),
    ('Text tokenized', data_token),
    ('Text without stopword', data_remove_stopword),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  \
0          i love playing football on the weekends   
1      i enjoy hiking and camping in the mountains   
2            i like to read books and watch movies   
3         i prefer playing video games over sports   
4  i love listening to music and going to concerts   

                         

In [36]:
from nltk.stem.porter import PorterStemmer

In [37]:
# Shared PorterStemmer instance; the stemming() function calls its .stem() method.
porter_stemmer = PorterStemmer()

In [38]:
# Function that stems every token of a tokenized document
def stemming(text):
    """Stem each token in ``text`` with the shared ``porter_stemmer``.

    Args:
        text: Iterable of word tokens.

    Returns:
        list: The stemmed tokens, in the same order as the input.
    """
    return [porter_stemmer.stem(token) for token in text]

In [39]:
# Preprocessing chain, each stage feeding the next:
# raw -> punctuation removed -> lower-cased -> digits/hyphens removed
#     -> tokenized -> stopwords removed -> stemmed.
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data_no_punctuation))
data_remove_number = list(map(remove_numbers, data_lower))
data_token = list(map(word_tokenize, data_remove_number))
data_remove_stopword = list(map(remove_stopwords, data_token))
data_stem = list(map(stemming, data_remove_stopword))

# One column per stage so the effect of every step can be compared.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
    ('Text without numbers', data_remove_number),
    ('Text tokenized', data_token),
    ('Text without stopword', data_remove_stopword),
    ('Text Stem', data_stem),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  \
0          i love playing football on the weekends   
1      i enjoy hiking and camping in the mountains   
2            i like to read books and watch movies   
3         i prefer playing video games over sports   
4  i love listening to music and going to concerts   

                         

In [40]:
# Download the 'wordnet' NLTK data package (presumably the corpus backing
# WordNetLemmatizer, which is imported in the next cell).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

In [42]:
# Notebook-level WordNetLemmatizer instance, created once for lemmatization.
wordnet_lemmatizer = WordNetLemmatizer()

In [47]:
# Function for lemmatization
def lemmatizer(text):
    """Lemmatize each token and return the result as one space-separated string.

    Args:
        text: Iterable of word tokens (e.g. a list of strings).

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    # Reuse the notebook-level `wordnet_lemmatizer` rather than constructing a new
    # WordNetLemmatizer on every call; this also removes the local variable that
    # shadowed this function's own name.
    return ' '.join(wordnet_lemmatizer.lemmatize(word) for word in text)

In [48]:
# Full preprocessing chain, each stage feeding the next:
# raw -> punctuation removed -> lower-cased -> digits/hyphens removed
#     -> tokenized -> stopwords removed -> stemmed -> lemmatized (joined to a string).
data_no_punctuation = list(map(remove_punctuation, data))
data_lower = list(map(str.lower, data_no_punctuation))
data_remove_number = list(map(remove_numbers, data_lower))
data_token = list(map(word_tokenize, data_remove_number))
data_remove_stopword = list(map(remove_stopwords, data_token))
data_stem = list(map(stemming, data_remove_stopword))
data_lemm = list(map(lemmatizer, data_stem))

# One column per stage so the effect of every step can be compared.
df = pd.DataFrame(dict([
    ('Original Text', data),
    ('Text Without Punctuation', data_no_punctuation),
    ('Text in Lowercase', data_lower),
    ('Text without numbers', data_remove_number),
    ('Text tokenized', data_token),
    ('Text without stopword', data_remove_stopword),
    ('Text Stem', data_stem),
    ('Text Lemm', data_lemm),
]))

print(df)

                                     Original Text  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                          Text Without Punctuation  \
0          I love playing football on the weekends   
1      I enjoy hiking and camping in the mountains   
2            I like to read books and watch movies   
3         I prefer playing video games over sports   
4  I love listening to music and going to concerts   

                                 Text in Lowercase  \
0          i love playing football on the weekends   
1      i enjoy hiking and camping in the mountains   
2            i like to read books and watch movies   
3         i prefer playing video games over sports   
4  i love listening to music and going to concerts   

                         

In [66]:
# Apply TF-IDF vectorizer to lemmatized data
# Each string in data_lemm is passed as one document; X is the matrix returned
# by fit_transform, and its shape is printed below.
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(data_lemm)

print(X.shape)

(5, 21)


In [67]:
k = 2  # Number of clusters to fit
# KMeans starts from random centroids, so without a fixed seed the cluster
# assignments (and everything printed from them below) can change between runs.
# random_state pins the seed; n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)

In [68]:
# Predict the clusters for each document 
# X is the same matrix km was fitted on, so y_pred holds one cluster index per
# row of X (i.e. per entry of data_lemm).
y_pred = km.predict(X) 

In [69]:
# Show each preprocessed document next to the cluster it was assigned to.
# The first row is the header row consumed by tabulate(headers="firstrow").
table_data = [["Document", "Predicted Cluster"]]
table_data += [list(pair) for pair in zip(data_lemm, y_pred)]
print(tabulate(table_data, headers="firstrow"))

Document                        Predicted Cluster
----------------------------  -------------------
love play footbal weekend                       1
enjoy hike camp mountain                        0
like read book watch movi                       0
prefer play video game sport                    0
love listen music go concert                    1


In [70]:
# Print top terms per cluster
print("\nTop terms per cluster:")
# For every centroid, sort the feature indices by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    # List the ten highest-weighted terms of this cluster.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    # One blank line after each cluster's list (previously printed after every term).
    print()


Top terms per cluster:
Cluster 0:
 camp

 mountain

 hike

 enjoy

 video

 sport

 prefer

 game

 book

 read

Cluster 1:
 love

 footbal

 weekend

 go

 music

 concert

 listen

 play

 sport

 camp



In [71]:
# Calculate purity 
# NOTE(review): Counter(y_pred) tallies how many documents landed in each
# predicted cluster, and the list holds only that single Counter. The value
# below is therefore (size of the largest cluster) / (number of documents).
# No ground-truth class labels are involved, so this is not the usual
# clustering purity metric — TODO: supply true labels and count, per cluster,
# the most common true label instead.
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.6
