In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import numpy as np

nltk.download('stopwords')
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def load_dataset(a_set, cats):
    dataset = fetch_20newsgroups(subset=a_set, categories=cats,
                                remove=('headers', 'footers', 'quotes'),
                                shuffle=True)
    return dataset

categories = ["comp.windows.x", "misc.forsale", "rec.autos", "rec.motorcycles",
            "rec.sport.baseball", "rec.sport.hockey", "sci.crypt", "sci.med",
            "sci.space", "talk.politics.mideast"]

In [3]:
newsgroups_all = load_dataset('all', categories)
print(f"Loaded {len(newsgroups_all.data)} docs.")

Loaded 9850 docs.


In [4]:
def preprocess(text):
    # Usuń znaki specjalne i liczby używając wyrażenia regularnego
    text = re.sub(r'[^a-zA-Z\s]', '', text)

     # Podziel tekst na tokeny, usuwając stop words i stosując stemming
    tokens = [stemmer.stem(word) for word in text.lower().split() if word not in stop_words]
    return ' '.join(tokens)

In [5]:
preprocessed_docs = [preprocess(doc) for doc in newsgroups_all.data]

In [6]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

In [8]:
# Wybierz dwa z LSA, LDA i NMF, zaimplementuj te rozwiązania i porównaj wyniki
# wyświetl 10 tematów, porównaj je ze sobą
# wyświetl artykuły (albo jego część np. 100 słów) dla jednego z tematów i zobacz co na jego temat mają do powiedzenia modele


# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
tfidf = vectorizer.fit_transform(preprocessed_docs)

# LDA
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda_topics = lda.fit_transform(tfidf)

# NMF
nmf = NMF(n_components=10, random_state=42)
nmf_topics = nmf.fit_transform(tfidf)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Display topics for LDA and NMF
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
print("LDA Topics:")
display_topics(lda, feature_names, no_top_words)
print("\nNMF Topics:")
display_topics(nmf, feature_names, no_top_words)


LDA Topics:
Topic 1:
dog motorcycl univers new delet april pictur lost th california
Topic 2:
game team hockey basebal fan play year player go think
Topic 3:
game player play year team hit pitch win score run
Topic 4:
key use email pleas sale offer encrypt system chip includ
Topic 5:
car bike would like one get use engin ride dont
Topic 6:
israel armenian peopl jew arab space isra us would muslim
Topic 7:
mr oh quot articl answer know want instead someth your
Topic 8:
window use server file program widget thank run display motif
Topic 9:
david van pit bos la det trust nsa think say
Topic 10:
one would dont know peopl like get effect think use

NMF Topics:
Topic 1:
like dont would get one think go know im thing
Topic 2:
team year player play season last win leagu hockey hit
Topic 3:
thank pleas email anyon know list mail post send address
Topic 4:
key chip encrypt clipper use secur govern phone escrow system
Topic 5:
armenian turkish muslim peopl genocid armenia turk turkey govern russi