In [24]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Path to the folder containing text files
folder_path = 'data'

# Initialize lists to store file names and text data
file_names = []
text_data = []

# Read all text files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            file_names.append(file_name)
            text_data.append(file.read())

# Create a DataFrame
data = pd.DataFrame({'file_name': file_names, 'text': text_data})

# Preprocessing function
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
data['processed_text'] = data['text'].apply(preprocess)
print(data.head())

           file_name                                               text  \
0      badminton.txt  Badminton is a racquet sport played using racq...   
1   barack obama.txt  Barack Hussein Obama II (born August 4, 1961) ...   
2       baseball.txt  Baseball is a bat-and-ball game played between...   
3   lee quan yew.txt  Lee Kuan Yew, GCMG, CH, SPMJ (born Harry Lee K...   
4  narendra modi.txt  Narendra Damodardas Modi (born 17 September 19...   

                                      processed_text  
0  badminton racquet sport play use racquet hit s...  
1  barack hussein obama ii ( born august 4 , 1961...  
2  basebal bat-and-bal game play two team nine pl...  
3  lee kuan yew , gcmg , ch , spmj ( born harri l...  
4  narendra damodarda modi ( born 17 septemb 1950...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['processed_text'])

# Calculate Cosine Similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

[[1.         0.0019422  0.13301019 0.0242031  0.0279295  0.01378345
  0.00588617 0.23820629]
 [0.0019422  1.         0.04499789 0.10135787 0.05576688 0.02060439
  0.08394575 0.01821279]
 [0.13301019 0.04499789 1.         0.021111   0.02106339 0.01497158
  0.02474223 0.09300259]
 [0.0242031  0.10135787 0.021111   1.         0.10885693 0.0310234
  0.11210892 0.05343237]
 [0.0279295  0.05576688 0.02106339 0.10885693 1.         0.02281268
  0.17992368 0.03642488]
 [0.01378345 0.02060439 0.01497158 0.0310234  0.02281268 1.
  0.03316937 0.02457381]
 [0.00588617 0.08394575 0.02474223 0.11210892 0.17992368 0.03316937
  1.         0.0157528 ]
 [0.23820629 0.01821279 0.09300259 0.05343237 0.03642488 0.02457381
  0.0157528  1.        ]]


In [28]:
from sklearn.decomposition import TruncatedSVD

# Apply LSA
lsa = TruncatedSVD(n_components=100)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

# Calculate Cosine Similarity in LSA space
lsa_similarity_matrix = cosine_similarity(lsa_matrix)
print(lsa_similarity_matrix)

[[1.         0.0019422  0.13301019 0.0242031  0.0279295  0.01378345
  0.00588617 0.23820629]
 [0.0019422  1.         0.04499789 0.10135787 0.05576688 0.02060439
  0.08394575 0.01821279]
 [0.13301019 0.04499789 1.         0.021111   0.02106339 0.01497158
  0.02474223 0.09300259]
 [0.0242031  0.10135787 0.021111   1.         0.10885693 0.0310234
  0.11210892 0.05343237]
 [0.0279295  0.05576688 0.02106339 0.10885693 1.         0.02281268
  0.17992368 0.03642488]
 [0.01378345 0.02060439 0.01497158 0.0310234  0.02281268 1.
  0.03316937 0.02457381]
 [0.00588617 0.08394575 0.02474223 0.11210892 0.17992368 0.03316937
  1.         0.0157528 ]
 [0.23820629 0.01821279 0.09300259 0.05343237 0.03642488 0.02457381
  0.0157528  1.        ]]


In [30]:
from sklearn.cluster import KMeans

# Number of clusters
num_clusters = 5

# Apply K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Add cluster labels to the DataFrame
data['cluster'] = kmeans.labels_
print(data[['file_name', 'cluster']])



             file_name  cluster
0        badminton.txt        2
1     barack obama.txt        4
2         baseball.txt        1
3     lee quan yew.txt        4
4    narendra modi.txt        0
5  queen elizabeth.txt        3
6       shinzo abe.txt        0
7     table tennis.txt        2
