In [1]:
#latent semantic indexing

#first we need to import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
#now we need to import the dataset 20 newsgroups from sklearn
from sklearn.datasets import fetch_20newsgroups

#now we need to import the dataset
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [3]:
#now lets do some preprocessing

#first we need to import the stopwords
from nltk.corpus import stopwords

#now we need to import the stemmer
from nltk.stem.porter import PorterStemmer

#now we need to import the lemmatizer
from nltk.stem import WordNetLemmatizer

In [4]:
#removing the 
nltk.download('stopwords')
nltk.download('wordnet')
import re
corpus = []
for i in range(0, len(dataset.data)):
    review = re.sub('[^a-zA-Z]', ' ', dataset.data[i])
    review = review.lower()
    review = review.split()
    
    #now we need to do stemming
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    
    #now we need to do lemmatization
    lm = WordNetLemmatizer()
    review = [lm.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    
    #now we need to join the words
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to C:\Users\Soumil
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Soumil
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
#import the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# #import the tfidf vectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

#create a term document matrix using count vectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()



In [6]:
#apply svd on the term document matrix
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=0)
X = svd.fit_transform(X)

#find the singular vectors corresponding to the top singular values
U = svd.components_

#find the singular values
S = svd.singular_values_

#find the variance explained by the top singular values
explained_variance = svd.explained_variance_ratio_

#now we need to find the top 10 words in each topic
terms = cv.get_feature_names_out()

for i, comp in enumerate(U):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ", end=" ")
    #print in one line
    for t in sorted_terms:
        print(t[0], end=" ")
    print(" ")

Topic 0:  ax max pl ei tm bhj giz di ey wm  
Topic 1:  use file one program edu imag get also system avail  
Topic 2:  cx db hz scx ww ck uw lk sc chz  
Topic 3:  db bh si byte bit al di one push inc  
Topic 4:  one go peopl say know said would think mr stephanopoulo  
Topic 5:  file output entri program char ok stream line build rule  
Topic 6:  di pl tm wm um bxn giz sl ql tq  
Topic 7:  jpeg imag file mr stephanopoulo gif presid color go format  
Topic 8:  jpeg imag wire gif color bit one format use program  
Topic 9:  file wire gun use jpeg ground firearm law control circuit  
Topic 10:  stephanopoulo mr presid wire use drive widget work ground packag  
Topic 11:  drive control disk st hard gun bio support rom file  
Topic 12:  wire edu hockey team leagu file game com ground nhl  
Topic 13:  hockey team imag jpeg game leagu new year nhl season  
Topic 14:  launch space widget use year satellit applic state program new  
Topic 15:  tl uw ww pl ah pu mw air hz dy  
Topic 16:  hockey 

In [7]:
singular_values = svd.singular_values_
singular_vectors = svd.components_

# Assuming you want to analyze the first topic
first_topic = singular_vectors[0]

# Get the indices of the terms with the highest weightings in the first topic
top_indices = first_topic.argsort()[::-1][:10]  # Adjust the number 10 as needed

# Get the feature names (terms) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Get the terms with the highest weightings in the first topic
top_terms = [feature_names[i] for i in top_indices]

print("Top terms in the first topic:")
print(top_terms)

Top terms in the first topic:
['ax', 'max', 'pl', 'ei', 'tm', 'bhj', 'giz', 'di', 'ey', 'wm']


In [9]:
# Query preprocessing
query = "Best Graphics card for gaming in 2023" 
query = re.sub('[^a-zA-Z]', ' ', query)
query = query.lower()
query = query.split()

#now we need to do stemming
query = [ps.stem(word) for word in query if not word in set(stopwords.words('english'))]

#now we need to do lemmatization
query = [lm.lemmatize(word) for word in query if not word in set(stopwords.words('english'))]

#now we need to join the words
query = ' '.join(query)

# Query vectorization
query_vec = cv.transform([query]).toarray()

# Query reduction
query_vec_reduced = svd.transform(query_vec)

# Calculate the cosine similarities
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(query_vec_reduced, X)

# Get the top 5 most similar documents
most_similar_doc_indices = cosine_similarities.argsort()[0][::-1][:5]  # Adjust the number 5 as needed

# Print the most similar documents
print("Most similar documents:")
with open("documents.txt", "w") as f:
    for i in most_similar_doc_indices:
        print("Document "+str(i+1)+": \n")
        f.write("Document "+str(i+1)+": \n")
        print(dataset.data[i])
        f.write(dataset.data[i])
        print("\n")
        f.write("\n")
        print("----------------------------------------------------------------")
        f.write("----------------------------------------------------------------")
        print("\n")
        f.write("\n")
        print("\n")
        f.write("\n")

Most similar documents:
Document 2576: 

I own an 8088 640K clone which does all I want except run 1 game I want
to buy.  The game says it requires a 80286 with 640K.  Game tech. support
says game will run on 8088 but uses a some digitized graphics which would
make it run really *slow* (it's a card game - Hoyles Classic Card Games,
digitized graphics are photos artwork of game fictional card players).

What can I do to speed up how this game would run, short of an 80286
motherboard upgrade.  Co-processor?  Accelerator card mimicking 80286?
My 8088 can run at 10 Mhz.  Any advice would be greatly appreciated.


----------------------------------------------------------------




Document 2219: 

I have the following items for sale.  
Buyer pays the shipping costs.  
Hardware is new and unused unless marked otherwise.  
All software includes original disks and manuals.  
No reasonable offers refused.
Send offers/questions to cpc3@po.cwru.edu

Hardware
---------
Hercules Graphics Card - mo

In [12]:

from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, silhouette_score

# Cluster the documents
n_clusters = 20  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
document_clusters = kmeans.fit_predict(X)

# Calculate clustering evaluation metrics
# You may need the true labels for the 20 newsgroups dataset for this part
true_labels = dataset.target
silhouette = silhouette_score(X, document_clusters)

print(f'\nNumber of Clusters: {n_clusters}')
print(f'Silhouette Score: {silhouette}')



  super()._check_params_vs_input(X, default_n_init=10)



Number of Clusters: 20
Silhouette Score: 0.9537381349565673
