In [None]:
from sklearn.datasets import fetch_20newsgroups
# Pull the complete 20 Newsgroups corpus (subset='all'), with headers,
# footers and quoted replies stripped from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# Raw post bodies; the rest of the notebook indexes into this list.
documents = newsgroups.data


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import nltk
# Fetch the NLTK resources used below: tokenizer models for word_tokenize
# and the English stopword list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'. On older releases this call only reports that the package is
# unknown and returns False, so requesting both is safe.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stopwords are held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Text preprocessing function
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is tokenized with ``word_tokenize``. Tokens that are not purely
    alphabetic are discarded, the rest are lowercased, tokens found in
    ``stop_words`` are dropped, and the survivors are stemmed with ``stemmer``.

    Args:
        text: The raw document string.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    stemmed = []
    for token in word_tokenize(text):
        # Skip numbers, punctuation and mixed tokens.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Stopwords are matched on the lowercased, unstemmed form.
        if lowered in stop_words:
            continue
        stemmed.append(stemmer.stem(lowered))
    return ' '.join(stemmed)

# Run the preprocessing pipeline over the whole corpus, keeping document order
# so positions still line up with `documents`.
preprocessed_documents = list(map(preprocess_text, documents))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary from the cleaned corpus and vectorize it in one
# step. Despite the variable name, fit_transform returns a matrix with one row
# per document and one column per term.
vectorizer = TfidfVectorizer()
term_document_matrix = vectorizer.fit_transform(
    raw_documents=preprocessed_documents,
)


In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of latent dimensions ("topics") kept by the truncated SVD.
num_topics = 100

# TruncatedSVD uses a randomized solver by default. Fixing the seed makes the
# LSI space, and every result derived from it, repeatable across runs.
lsa = TruncatedSVD(n_components=num_topics, random_state=0)

# One row per document, one column per latent topic.
lsi_matrix = lsa.fit_transform(term_document_matrix)


In [None]:
# Vocabulary terms, positioned to match the TF-IDF columns.
terms = vectorizer.get_feature_names_out()

# For every LSA component, list its ten highest-weighted terms, strongest first.
for i, topic in enumerate(lsa.components_):
    strongest = topic.argsort()[::-1][:10]
    top_terms = [terms[j] for j in strongest]
    print(f"Topic {i+1}: {', '.join(top_terms)}")


Topic 1: would, use, one, get, like, know, peopl, think, could, time
Topic 2: window, file, drive, thank, card, use, program, driver, run, disk
Topic 3: game, team, year, drive, play, player, get, car, go, win
Topic 4: drive, scsi, disk, ide, control, card, hard, floppi, system, chip
Topic 5: key, chip, encrypt, govern, use, clipper, secur, escrow, phone, system
Topic 6: thank, pleas, anyon, know, post, would, email, mail, advanc, appreci
Topic 7: key, game, god, chip, encrypt, clipper, use, team, system, escrow
Topic 8: card, driver, monitor, video, window, color, mode, vga, car, chip
Topic 9: card, armenian, peopl, game, israel, govern, pleas, jew, arab, muslim
Topic 10: car, sale, new, includ, price, imag, offer, book, list, year
Topic 11: file, would, imag, card, monitor, format, color, think, like, video
Topic 12: file, card, car, driver, key, know, get, god, bike, chip
Topic 13: would, car, god, file, armenian, appreci, price, card, game, like
Topic 14: armenian, anyon, know, use

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Free-text query, pushed through the same preprocessing as the corpus so its
# tokens match the fitted vocabulary.
query = preprocess_text("science and technology")

# Project the query: TF-IDF features first, then into the LSI space.
query_tfidf = vectorizer.transform([query])
query_vector = lsa.transform(query_tfidf)

# Cosine similarity between the query and every document's LSI vector;
# the result has a single row because there is a single query.
similarities = cosine_similarity(query_vector, lsi_matrix)

# Document indices ordered from most to least similar.
top_indices = similarities[0].argsort()[::-1]

# Show the five best matches using the original, unprocessed text.
for i in range(5):
    print(f"Document {i + 1}: {documents[top_indices[i]]}")


Document 1: -- 
PAOLO,MARC ANTHONY
Georgia Institute of Technology, Atlanta Georgia, 30332
uucp:     ...!{allegra,amd,hplabs,ut-ngp}!gatech!prism!gt4661a
Internet: gt4661a@prism.gatech.edu

Document 2: 
Gulp.

[Disclaimer:  This opinion is mine and does not represent the views of
Fermilab, Universities Research Association, the Department of Energy,
or the 49th Ward Regular Science Fiction Organization.]
 
Document 3: The following statement was released
on February 27,1992 by the Science &
Environmental Policy Project

As independent scientists researching atmosphere and climate problems, we are 
concerned by the agenda for UNCED, the United Nations Conference on 
Environment and Development, being developed by environmental and  activists 
groups and certain political leaders. This so called "Earth Summit" is 
scheduled to convene in  Brazil in June 1992 and aims to impose a system of 
global envionmental regulations, including onerous taxes on energy fuels, on 
the population of the

In [None]:
# Evaluation: cluster the LSI document vectors and score the clusters against the newsgroup labels
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score,adjusted_mutual_info_score, normalized_mutual_info_score, completeness_score, homogeneity_score, v_measure_score, silhouette_score


In [None]:
# Perform K-Means clustering on the LSI results.
# 20 clusters matches the number of newsgroup categories; adjust for other datasets.
n_clusters = 20

# n_init is set explicitly because its default differs between scikit-learn
# releases; leaving it implicit makes the clustering depend on the installed
# version. random_state fixes the centroid initialization.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)

# Cluster id assigned to each document, in corpus order.
clusters = kmeans.fit_predict(lsi_matrix)




In [None]:
from sklearn.metrics import accuracy_score  # no longer used in this cell; import kept in case other cells rely on it
from sklearn.metrics.cluster import contingency_matrix


def purity_score(labels_true, labels_pred):
    """Return the purity of a clustering with respect to ground-truth classes.

    Each cluster is credited with the count of its most frequent true class;
    purity is the sum of those counts divided by the number of samples.

    Args:
        labels_true: Ground-truth class label of each sample.
        labels_pred: Cluster id assigned to each sample.

    Returns:
        A float in [0, 1]; 1.0 means every cluster holds a single class.
    """
    # Rows are true classes, columns are predicted clusters.
    contingency = contingency_matrix(labels_true, labels_pred)
    return contingency.max(axis=0).sum() / contingency.sum()


# Ground-truth newsgroup id of every document.
true_labels = newsgroups.target

# Cluster ids are arbitrary, so comparing them to class ids with
# accuracy_score is meaningless. Purity instead matches each cluster to its
# dominant class before counting agreements.
purity = purity_score(true_labels, clusters)
print(f'Purity: {purity}')


Purity: 0.058898439987265204


In [None]:
# Normalized mutual information between the newsgroup labels and the clusters.
nmi = normalized_mutual_info_score(
    labels_true=true_labels,
    labels_pred=clusters,
)
print(f'Normalized Mutual Info Score(NMI): {nmi}')


NMI: 0.3131003278003342


In [None]:
# Mean silhouette coefficient of the clustering, measured in the LSI space
# (needs no ground-truth labels).
silhouette_avg = silhouette_score(X=lsi_matrix, labels=clusters)
print(f'Silhouette Score: {silhouette_avg}')


Silhouette Score: 0.025945077776465663


In [None]:
# Homogeneity: how far each cluster is made up of a single class.
homogeneity = homogeneity_score(labels_true=true_labels, labels_pred=clusters)
# Completeness: how far each class ends up in a single cluster.
completeness = completeness_score(labels_true=true_labels, labels_pred=clusters)
# V-measure combines the two scores above into one number.
v_measure = v_measure_score(labels_true=true_labels, labels_pred=clusters)

print(f'Completeness: {completeness}')
print(f'Homogeneity: {homogeneity}')
print(f'V-Measure: {v_measure}')


Completeness: 0.34756798361468416
Homogeneity: 0.2848520880546633
V-Measure: 0.3131003278003342


In [None]:
# Chance-adjusted agreement scores between the newsgroup labels and the clusters.
ari_score = adjusted_rand_score(labels_true=true_labels, labels_pred=clusters)
ami_score = adjusted_mutual_info_score(labels_true=true_labels, labels_pred=clusters)

print(f"Adjusted Rand Index (ARI): {ari_score}")
print(f"Adjusted Mutual Info Index (AMI): {ami_score}")

Adjusted Rand Index (ARI): 0.07192883408330963
Adjusted Mutual Info Index (AMI): 0.31064911873418827


In [None]:
# Summary of the evaluation metrics computed in the cells above.
# This cell used to hold pasted output text, which is not valid Python and
# stopped "Run All" with a SyntaxError. It now prints the live values.
#
# Values recorded from an earlier run, kept for reference:
#   Normalized Mutual Info Score(NMI): 0.3131003278003342
#   Silhouette Score: 0.025945077776465663
#   Completeness: 0.34756798361468416
#   Homogeneity: 0.2848520880546633
#   V-Measure: 0.3131003278003342
#   Adjusted Rand Index (ARI): 0.07192883408330963
#   Adjusted Mutual Info Index (AMI): 0.31064911873418827
print(f'Normalized Mutual Info Score(NMI): {nmi}')
print(f'Silhouette Score: {silhouette_avg}')
print(f'Completeness: {completeness}')
print(f'Homogeneity: {homogeneity}')
print(f'V-Measure: {v_measure}')
print(f'Adjusted Rand Index (ARI): {ari_score}')
print(f'Adjusted Mutual Info Index (AMI): {ami_score}')