In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups dataset (subset='all'), asking scikit-learn to strip
# headers, footers and quotes from every post via the `remove` argument.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# Post texts; later cells preprocess these and index back into `data` to
# display the documents retrieved for a query.
data = newsgroups.data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

# Make sure the NLTK resources used below are available locally: the punkt
# tokenizer, the stop-word corpus and the WordNet corpus.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers used by the preprocessing function
lemmatizer = WordNetLemmatizer()  # alternative to stemming; currently unused
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set for fast membership tests

# Function for text preprocessing
def preprocess_text(text):
    """Normalise a raw text into a space-separated string of stemmed tokens.

    The text is tokenised with ``word_tokenize``; tokens that are not purely
    alphabetic are dropped, the rest are lower-cased, English stop words are
    removed, and each surviving token is reduced to its Porter stem.

    Args:
        text: The raw text to normalise.

    Returns:
        A single string with the stemmed tokens joined by one space. It is
        empty when no token survives the filtering.
    """
    kept_stems = []
    for token in word_tokenize(text):
        # Discard punctuation, numbers and mixed alphanumeric tokens
        if not token.isalpha():
            continue

        lowered = token.lower()

        # Stop words are matched against the lower-cased form
        if lowered in stop_words:
            continue

        # Stemming is used here; lemmatizer.lemmatize(lowered) would be the
        # lemmatization alternative.
        kept_stems.append(stemmer.stem(lowered))

    return ' '.join(kept_stems)

# Run every post through preprocess_text, keeping the original document order
# so that positions still line up with `data`.
preprocessed_data = list(map(preprocess_text, data))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the preprocessed posts and encode them in one step.
# The fitted vectorizer is reused later to encode queries and to look up the
# vocabulary terms behind each topic.
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_data)

In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

# Perform SVD on the TF-IDF matrix to project documents into a
# lower-dimensional latent (LSI) space.
num_topics = 100  # Number of latent components ("topics") to keep

# TruncatedSVD's default solver is randomized. Pin the seed so the topics,
# query rankings and clusters derived from svd_matrix are identical on every
# Restart & Run All.
svd = TruncatedSVD(n_components=num_topics, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)


In [None]:
import numpy as np

# Each row of svd.components_ is treated as one topic: a weight for every
# vocabulary term.
singular_vectors = svd.components_

# How many of the highest-weighted terms to list per topic
num_top_terms = 10

# Vocabulary terms, indexed the same way as the entries of each topic row
feature_names = tfidf_vectorizer.get_feature_names_out()  # For TF-IDF matrix

for topic_idx, topic in enumerate(singular_vectors):
    # Order term indices by descending weight and keep the leading ones
    top_term_indices = np.argsort(topic)[::-1][:num_top_terms]
    top_terms = feature_names[top_term_indices].tolist()
    print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")

Topic 1: would, use, one, get, like, know, peopl, think, could, time
Topic 2: window, file, drive, thank, card, use, program, driver, run, disk
Topic 3: game, team, year, drive, play, player, get, car, go, win
Topic 4: drive, scsi, disk, ide, control, card, hard, floppi, system, chip
Topic 5: key, chip, encrypt, govern, use, clipper, secur, escrow, phone, system
Topic 6: thank, pleas, anyon, know, post, would, email, mail, advanc, appreci
Topic 7: key, game, god, chip, encrypt, clipper, use, team, system, escrow
Topic 8: card, driver, monitor, video, window, color, mode, vga, car, chip
Topic 9: card, armenian, peopl, game, israel, govern, pleas, jew, arab, muslim
Topic 10: car, sale, new, includ, price, imag, offer, book, list, year
Topic 11: file, would, imag, card, monitor, format, color, think, like, video
Topic 12: file, card, car, driver, key, know, get, god, bike, chip
Topic 13: would, car, god, file, armenian, appreci, price, card, game, like
Topic 14: armenian, anyon, know, use

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

query = "food"
# Preprocess the query exactly like the documents so its tokens match the
# fitted vocabulary.
preprocessed_query = preprocess_text(query)

# Encode the query with the fitted TF-IDF vectorizer
query_tfidf = tfidf_vectorizer.transform([preprocessed_query])
if query_tfidf.nnz == 0:
    # Without any in-vocabulary term the query maps to the zero vector, every
    # cosine similarity is 0 and the "top" documents would be arbitrary.
    raise ValueError(
        f"Query {query!r} contains no terms from the fitted vocabulary "
        "after preprocessing; nothing to rank."
    )

# Transform the encoded query into the LSI space
query_vector = svd.transform(query_tfidf)

# Compute cosine similarity between the query vector and the document vectors
similarities = cosine_similarity(query_vector, svd_matrix)

# Get the indices of the most relevant documents based on cosine similarity
top_n = 5  # Choose the number of top documents to retrieve
# argsort is ascending, so reverse it before slicing: the most similar
# document comes first and is printed as "Relevant Document 1".
top_document_indices = similarities[0].argsort()[::-1][:top_n]

# Retrieve the actual text of the most relevant documents
most_relevant_documents = [data[i] for i in top_document_indices]

# Print the most relevant documents, best match first
for i, document in enumerate(most_relevant_documents):
    print(f"Relevant Document {i + 1}: {document}")

Relevant Document 1: I know that there is MSG sensitivity.  When I eat foods with MSG I get
very thirsty and my hands swell and get a terrible itchy rash. I first
experienced this problem when I worked close to Chinatown and ate Chinese
food almost everyday for lunch.  Now I can't tolerate MSG at all.  I can
notice immediately when I have eaten any.  I try to avoid MSG completely.

Interesting fact though is that all three of my children started experiencing
the exact same rash on their hands.  I couldn't understand why because I
don't MSG in cooking and we ask for no MSG when we do eat Chinese (I still
love it).  After some investigation I knew that Oodles of Noodles where
one of their favorite foods.  One of the main ingredients in the flavor
packets is MSG.  Now I look at all labels.  You would be surprised at
places you find MSG.

Relevant Document 2: MSG is common in many food we eat, including Chinese (though some oriental
restaurants might put a tad too much in them).  I've noti

In [None]:
from sklearn.cluster import KMeans

# Perform K-Means clustering on the LSI-transformed data
n_clusters = 5  # Adjust the number of clusters as needed

# K-Means starts from randomly chosen centroids, so fix the seed to keep the
# cluster assignments (and the silhouette score computed from them) stable
# across runs. n_init is set explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
predicted_labels = kmeans.fit_predict(svd_matrix)




In [None]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette coefficient of the K-Means clustering in LSI space
silhouette_coefficient = silhouette_score(svd_matrix, predicted_labels)

# Report the score as returned. The silhouette coefficient lies in [-1, 1],
# so any rescaling would make the printed number meaningless.
print(f'Silhouette Coefficient: {silhouette_coefficient}')

Silhouette Coefficient: 0.8698518004588316
