In [1]:
pip install numpy scikit-learn pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
import time

# Fetch sample data - the complete 20 newsgroups corpus, shuffled with a fixed seed
data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
documents = data.data

# Preprocess text data using TF-IDF Vectorizer
# (English stop words removed, vocabulary limited via max_features=1000)
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

# LSA (Latent Semantic Analysis)
# time.perf_counter() is used instead of time.time(): it is the monotonic,
# high-resolution clock intended for measuring elapsed intervals, so the
# result cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_topics = lsa.fit_transform(tfidf)
lsa_time = time.perf_counter() - start_time

# NMF (Non-Negative Matrix Factorization), timed the same way
start_time = time.perf_counter()
nmf = NMF(n_components=10, random_state=42)
nmf_topics = nmf.fit_transform(tfidf)
nmf_time = time.perf_counter() - start_time

print(f"LSA Running Time: {lsa_time:.4f} seconds")
print(f"NMF Running Time: {nmf_time:.4f} seconds")


C:\Users\prith\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\prith\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\prith\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


LSA Running Time: 0.5970 seconds
NMF Running Time: 1.8824 seconds


In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
import time

# Fetch sample data - the complete 20 newsgroups corpus, shuffled with a fixed seed
data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
documents = data.data

# Preprocess text data using TF-IDF Vectorizer
# (English stop words removed, vocabulary limited via max_features=1000)
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

# NMF (Non-Negative Matrix Factorization) is fitted first in this cell.
# time.perf_counter() is used instead of time.time(): it is the monotonic,
# high-resolution clock intended for measuring elapsed intervals, so the
# result cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
nmf = NMF(n_components=10, random_state=42)
nmf_topics = nmf.fit_transform(tfidf)
nmf_time = time.perf_counter() - start_time

# LSA (Latent Semantic Analysis), timed the same way
start_time = time.perf_counter()
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_topics = lsa.fit_transform(tfidf)
lsa_time = time.perf_counter() - start_time

print(f"NMF Running Time: {nmf_time:.4f} seconds")
print(f"LSA Running Time: {lsa_time:.4f} seconds")


NMF Running Time: 1.9180 seconds
LSA Running Time: 0.5215 seconds


In [6]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
import numpy as np

# Load every split of the 20 newsgroups corpus, with headers, footers and
# quoted replies stripped from each post
data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)
documents = data.data

# Turn the raw posts into a TF-IDF document-term matrix
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(documents)

# Both factorizations share the same number of topics and the same seed
factorization_kwargs = dict(n_components=5, random_state=42)

# LSA (Truncated SVD)
lsa = TruncatedSVD(**factorization_kwargs)
lsa_topics = lsa.fit_transform(tfidf)

# NMF
nmf = NMF(**factorization_kwargs)
nmf_topics = nmf.fit_transform(tfidf)

# Print top words for each topic for both LSA and NMF
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component of ``model``.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <term> <term> ...``, with terms ordered from the
    largest (signed) weight downwards.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D arrays
            that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list for each component.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = " ".join(feature_names[i] for i in ranked)
        print(f"Topic #{topic_idx}: " + terms)

# How many terms to list per topic, and the vocabulary used to label them
n_top_words = 10
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

# Report the LSA topics first, then the NMF topics after a blank line
for heading, fitted_model in (("LSA Topics:", lsa), ("\nNMF Topics:", nmf)):
    print(heading)
    print_top_words(fitted_model, feature_names, n_top_words)


LSA Topics:
Topic #0: don just like know people think does use time good
Topic #1: windows thanks drive card dos file pc scsi software program
Topic #2: god windows jesus does bible thanks christ dos christian faith
Topic #3: drive scsi god ide hard card controller drives disk game
Topic #4: drive key scsi chip government encryption clipper keys law ide

NMF Topics:
Topic #0: don just people think like good know time did right
Topic #1: windows thanks file dos program does know files mail use
Topic #2: god jesus bible believe christ christian faith christians does sin
Topic #3: drive scsi card ide disk hard controller drives bus floppy
Topic #4: key chip encryption clipper government keys escrow use law algorithm


In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
import numpy as np

# Load every split of the 20 newsgroups corpus, with headers, footers and
# quoted replies stripped from each post
data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)
documents = data.data

# Turn the raw posts into a TF-IDF document-term matrix
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(documents)

# Both factorizations share the same number of topics and the same seed
factorization_kwargs = dict(n_components=10, random_state=42)

# LSA (Truncated SVD)
lsa = TruncatedSVD(**factorization_kwargs)
lsa_topics = lsa.fit_transform(tfidf)

# NMF
nmf = NMF(**factorization_kwargs)
nmf_topics = nmf.fit_transform(tfidf)

# Print top words for each topic for both LSA and NMF
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component of ``model``.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <term> <term> ...``, with terms ordered from the
    largest (signed) weight downwards.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D arrays
            that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list for each component.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        terms = " ".join(feature_names[i] for i in ranked)
        print(f"Topic #{topic_idx}: " + terms)

# How many terms to list per topic, and the vocabulary used to label them
n_top_words = 10
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

# Report the LSA topics first, then the NMF topics after a blank line
for heading, fitted_model in (("LSA Topics:", lsa), ("\nNMF Topics:", nmf)):
    print(heading)
    print_top_words(fitted_model, feature_names, n_top_words)


LSA Topics:
Topic #0: don just like know people think does use time good
Topic #1: windows thanks drive card dos file pc software scsi program
Topic #2: god windows jesus does bible thanks christ christian dos faith
Topic #3: drive scsi god ide card controller hard drives game disk
Topic #4: drive key scsi government chip encryption clipper people keys ide
Topic #5: windows dos file think problem drive os window run disk
Topic #6: know thanks don does just like drive car people advance
Topic #7: key game chip does god clipper encryption keys know team
Topic #8: geb edu dsl cadre n3jxp pitt chastity skepticism intellect shameful
Topic #9: car just bike 00 good ve new god engine like

NMF Topics:
Topic #0: don just like think know good ve time really want
Topic #1: windows dos file program files window use using run running
Topic #2: god jesus bible believe christ faith christian christians sin church
Topic #3: drive scsi ide disk card controller hard drives bus floppy
Topic #4: key chip