In [64]:
#latent semantic indexing

#first we need to import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [65]:
#now we need to import the dataset 20 newsgroups from sklearn
from sklearn.datasets import fetch_20newsgroups

#now we need to import the dataset
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [18]:
#now lets do some preprocessing

#first we need to import the stopwords
from nltk.corpus import stopwords

#now we need to import the stemmer
from nltk.stem.porter import PorterStemmer

#now we need to import the lemmatizer
from nltk.stem import WordNetLemmatizer

In [103]:
#removing the 
nltk.download('stopwords')
nltk.download('wordnet')
import re
corpus = []
for i in range(0, len(dataset.data)):
    review = re.sub('[^a-zA-Z]', ' ', dataset.data[i])
    review = review.lower()
    review = review.split()
    
    #now we need to do stemming
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    
    #now we need to do lemmatization
    lm = WordNetLemmatizer()
    review = [lm.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    
    #now we need to join the words
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to C:\Users\Soumil
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Soumil
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [104]:
#import the count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# #import the tfidf vectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

#create a term document matrix using count vectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()



In [116]:
#apply svd on the term document matrix
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=0)
X = svd.fit_transform(X)

#find the singular vectors corresponding to the top singular values
U = svd.components_

#find the singular values
S = svd.singular_values_

#find the variance explained by the top singular values
explained_variance = svd.explained_variance_ratio_

#now we need to find the top 10 words in each topic
terms = cv.get_feature_names_out()

for i, comp in enumerate(U):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ", end=" ")
    #print in one line
    for t in sorted_terms:
        print(t[0], end=" ")
    print(" ")

Topic 0:  abil around among associ app armenian assembl angel anyth altern  
Topic 1:  abl absolut atheism area art attempt atheist attitud approach assum  
Topic 2:  absolut art attitud assum assembl approach assist area appli anyway  
Topic 3:  abus area attack atheism atheist approach art assum anyway appreci  
Topic 4:  ac articl assembl armenian arm attempt assist applic appli assert  
Topic 5:  acceler armi assert april assist archiv argu art attack armenian  
Topic 6:  accept articl attempt assembl assert applic armenian assist apr arm  
Topic 7:  access armi april argument armenia archiv assert associ argu atheism  
Topic 8:  accord armenia associ assert april archiv armi argument armenian arab  
Topic 9:  account art attempt assembl attitud armenian area assum armi archiv  
Topic 10:  across area art attitud approach assist assum assembl apr arm  
Topic 11:  act assum arm approach attitud art assembl articl assist area  
Topic 12:  action atheism area around attempt atheist ar

In [117]:
singular_values = svd.singular_values_
singular_vectors = svd.components_

# Assuming you want to analyze the first topic
first_topic = singular_vectors[0]

# Get the indices of the terms with the highest weightings in the first topic
top_indices = first_topic.argsort()[::-1][:10]  # Adjust the number 10 as needed

# Get the feature names (terms) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Get the terms with the highest weightings in the first topic
top_terms = [feature_names[i] for i in top_indices]

print("Top terms in the first topic:")
print(top_terms)

Top terms in the first topic:
['abil', 'around', 'among', 'associ', 'app', 'armenian', 'assembl', 'angel', 'anyth', 'altern']


In [118]:
# Query preprocessing
query = "Best Graphics card for gaming in 2023" 
query = re.sub('[^a-zA-Z]', ' ', query)
query = query.lower()
query = query.split()

#now we need to do stemming
query = [ps.stem(word) for word in query if not word in set(stopwords.words('english'))]

#now we need to do lemmatization
query = [lm.lemmatize(word) for word in query if not word in set(stopwords.words('english'))]

#now we need to join the words
query = ' '.join(query)

# Query vectorization
query_vec = cv.transform([query]).toarray()

# Query reduction
query_vec_reduced = svd.transform(query_vec)

# Calculate the cosine similarities
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(query_vec_reduced, X)

# Get the top 5 most similar documents
most_similar_doc_indices = cosine_similarities.argsort()[0][::-1][:5]  # Adjust the number 5 as needed

# Print the most similar documents
print("Most similar documents:")
with open("documents.txt", "w") as f:
    for i in most_similar_doc_indices:
        f.write("Document "+str(i+1)+": \n")
        f.write(dataset.data[i])
        f.write("\n")
        f.write("----------------------------------------------------------------")
        f.write("\n")
        f.write("\n")

ValueError: X has 1500 features, but TruncatedSVD is expecting 100 features as input.

In [110]:
#find the top n documents similar to the query
n = 5
top_n = np.argsort(similarities, axis=1)[:,-n:]

#write the top n documents to a file
with open("documents.txt", "w") as f:
    for i in range(len(top_n[0])):
        f.write("Document "+str(i+1)+": \n")
        f.write(dataset.data[top_n[0][i]])
        f.write("\n")
        f.write("----------------------------------------------------------------")
        f.write("\n")
        f.write("\n")

In [114]:

from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, silhouette_score

# Cluster the documents
n_clusters = 20  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
document_clusters = kmeans.fit_predict(X)

# Calculate clustering evaluation metrics
# You may need the true labels for the 20 newsgroups dataset for this part
true_labels = dataset.target

purity = purity_score(true_labels, document_clusters)
nmi = normalized_mutual_info_score(true_labels, document_clusters)
silhouette = silhouette_score(X, document_clusters)

print(f'\nNumber of Clusters: {n_clusters}')
print(f'Silhouette Score: {silhouette}')



  super()._check_params_vs_input(X, default_n_init=10)



Number of Clusters: 20
Silhouette Score: 0.9537381349565673
