In [26]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [2]:
# Newsgroup categories to load (four of the twenty available)
selected_classes = [
    "alt.atheism",
    "comp.graphics",
    "sci.space",
    "talk.politics.mideast",
]

In [40]:
# NLTK's English stop-word list plus a few extra common words, as one set
stop_words = {*stopwords.words("english"), 'one', 'would', 'like', 'could'}

In [3]:
# Fetch the posts of the selected categories (subset='all'), with headers,
# footers and quoted replies removed from every post
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=selected_classes,
    remove=('headers', 'footers', 'quotes'),
)

In [41]:
# Run gensim's simple_preprocess over every post: one token list per document
tokenized_documents = list(map(simple_preprocess, newsgroups.data))

In [42]:
# Build the gensim vocabulary that maps each distinct token to an integer ID
dictionary = corpora.Dictionary(documents=tokenized_documents)

In [43]:
# Collect the IDs of the stop words that actually occur in the dictionary
stop_ids = [
    token_id
    for token, token_id in dictionary.token2id.items()
    if token in stop_words
]

# Remove the stop words from the dictionary
dictionary.filter_tokens(bad_ids=stop_ids)

# Also prune rare and very frequent words: drop tokens that appear in fewer
# than 5 documents or in more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

In [44]:
# Turn every tokenized document into its sparse bag-of-words representation
bow_corpus = list(map(dictionary.doc2bow, tokenized_documents))

In [45]:
# Train the baseline LDA model (20 topics, 15 passes over the corpus).
# random_state is pinned so that re-running the notebook yields the same
# topics; without it every run starts from a different random initialisation.
lda_model = models.LdaModel(
    bow_corpus,
    num_topics=20,
    id2word=dictionary,
    passes=15,
    alpha='symmetric',
    eta='auto',
    random_state=42,
)

In [46]:
def print_topics(model, num_topics, num_words=10):
    """Print topics of a trained topic model, each followed by a blank line.

    Args:
        model: Object exposing ``print_topics(num_topics=..., num_words=...)``,
            e.g. a gensim ``LdaModel``.
        num_topics: Number of topics to request from the model.
        num_words: Number of top words to show per topic (default 10).

    Returns:
        None. The topics are written to standard output.
    """
    # Query the model that was passed in rather than a notebook-level global,
    # and forward num_topics so that the argument actually takes effect.
    for topic in model.print_topics(num_topics=num_topics, num_words=num_words):
        print(topic)
        print()

In [47]:
# Display the topics of the current LDA model
print_topics(model=lda_model, num_topics=10)

(0, '0.013*"israel" + 0.010*"people" + 0.007*"israeli" + 0.006*"jews" + 0.006*"arab" + 0.005*"think" + 0.005*"right" + 0.005*"jewish" + 0.004*"even" + 0.004*"state"')

(1, '0.020*"azerbaijan" + 0.014*"armenian" + 0.012*"said" + 0.010*"azeri" + 0.009*"armenia" + 0.008*"karabakh" + 0.008*"people" + 0.008*"killed" + 0.008*"armenians" + 0.007*"bodies"')

(2, '0.036*"edu" + 0.018*"pub" + 0.016*"mail" + 0.016*"ftp" + 0.014*"send" + 0.012*"graphics" + 0.010*"com" + 0.010*"information" + 0.009*"available" + 0.009*"faq"')

(3, '0.017*"image" + 0.012*"graphics" + 0.010*"data" + 0.009*"software" + 0.007*"available" + 0.006*"computer" + 0.006*"also" + 0.006*"edu" + 0.006*"package" + 0.005*"processing"')

(4, '0.033*"ed" + 0.021*"istanbul" + 0.013*"turkey" + 0.013*"ankara" + 0.010*"new" + 0.009*"ermeni" + 0.009*"york" + 0.009*"kk" + 0.009*"university" + 0.008*"osmanli"')

(5, '0.015*"thanks" + 0.014*"anyone" + 0.013*"know" + 0.013*"bit" + 0.012*"program" + 0.011*"please" + 0.010*"need" + 0.008*"hel

In [48]:
# For every document, keep only the probabilities of its topic distribution
# (the topic IDs returned alongside them are discarded)
document_topic_vectors = [
    [prob for _topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0)]
    for bow in bow_corpus
]

In [49]:
# Number of documents whose topic vectors are printed as a sample
num_documents_to_print: int = 5

In [50]:
# Build one topic-probability vector per document
document_topic_vectors = [
    [prob for _topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0)]
    for bow in bow_corpus
]

# Show the vectors of the first num_documents_to_print documents (1-based labels)
for doc_index, topic_vector in enumerate(document_topic_vectors):
    if doc_index >= num_documents_to_print:
        break
    print(f"Document {doc_index + 1} Topic Vector: {topic_vector}")
    print()

Document 1 Topic Vector: [0.008335287, 0.008335287, 0.21907607, 0.008335287, 0.008335287, 0.008335287, 0.29106158, 0.008335287, 0.008335287, 0.008335287, 0.008335287, 0.008335287, 0.008335287, 0.34816247, 0.008335287, 0.008335287, 0.008335287, 0.008335287, 0.008335287, 0.008335287]

Document 2 Topic Vector: [5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 0.94539326, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 0.03891934, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05, 0.014690209, 5.8658312e-05, 5.8658312e-05, 5.8658312e-05]

Document 3 Topic Vector: [0.00027657443, 0.00027657443, 0.00027657443, 0.3102905, 0.00027657443, 0.26410618, 0.00027657443, 0.00027657443, 0.0952239, 0.00027657443, 0.046542913, 0.00027657443, 0.00027657443, 0.00027657443, 0.021069165, 0.037368536, 0.00027657443, 0.030642651, 0.00027657443, 0.19143727]

Document 4 Topic Vector: [0.000588814, 0.043787237, 0.000588814, 0.000588814, 0.000588814, 0

In [51]:
from gensim.models import LdaModel

In [52]:
# Short aliases matching the LdaModel keyword names used in the cells below
corpus, id2word = bow_corpus, dictionary

In [66]:
# Experiment: let gensim learn both priors (alpha='auto', eta='auto').
# random_state is pinned so that differences between the experiments come from
# the priors and not from a different random initialisation on each run.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    passes=3,
    alpha='auto',
    eta='auto',
    random_state=42,
)
print_topics(lda_model, 10)

(0, '0.028*"adl" + 0.017*"bullock" + 0.012*"gerard" + 0.011*"information" + 0.010*"police" + 0.010*"san" + 0.009*"francisco" + 0.008*"adam" + 0.008*"anti" + 0.007*"fbi"')

(1, '0.050*"university" + 0.037*"professor" + 0.029*"history" + 0.009*"general" + 0.009*"genocide" + 0.007*"studies" + 0.006*"california" + 0.006*"armenian" + 0.005*"turkish" + 0.005*"dr"')

(2, '0.015*"earth" + 0.014*"space" + 0.011*"solar" + 0.010*"spacecraft" + 0.010*"system" + 0.009*"orbit" + 0.009*"planet" + 0.008*"surface" + 0.007*"mission" + 0.007*"mars"')

(3, '0.018*"armenian" + 0.016*"armenians" + 0.013*"turkish" + 0.011*"armenia" + 0.008*"turkey" + 0.007*"russian" + 0.007*"genocide" + 0.007*"azerbaijan" + 0.006*"people" + 0.006*"soviet"')

(4, '0.012*"people" + 0.010*"said" + 0.008*"us" + 0.006*"armenian" + 0.005*"armenians" + 0.005*"know" + 0.005*"went" + 0.004*"go" + 0.004*"children" + 0.004*"time"')

(5, '0.020*"kuwait" + 0.019*"greek" + 0.014*"greece" + 0.010*"turkish" + 0.008*"al" + 0.006*"rights" + 0

In [67]:
# Experiment: fixed scalar priors alpha=2 and eta=2.
# random_state is pinned so that differences between the experiments come from
# the priors and not from a different random initialisation on each run.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    passes=3,
    alpha=2,
    eta=2,
    random_state=42,
)
print_topics(lda_model, 10)

(0, '0.010*"armenian" + 0.008*"turkish" + 0.007*"armenians" + 0.006*"people" + 0.004*"government" + 0.004*"russian" + 0.004*"turks" + 0.004*"armenia" + 0.004*"genocide" + 0.004*"jews"')

(1, '0.001*"space" + 0.001*"launch" + 0.001*"also" + 0.001*"people" + 0.001*"think" + 0.001*"anyone" + 0.001*"get" + 0.001*"could" + 0.001*"edu" + 0.001*"may"')

(2, '0.001*"know" + 0.001*"please" + 0.001*"anyone" + 0.001*"think" + 0.001*"also" + 0.001*"could" + 0.001*"people" + 0.001*"much" + 0.001*"time" + 0.001*"thanks"')

(3, '0.001*"think" + 0.001*"know" + 0.001*"anyone" + 0.001*"space" + 0.001*"could" + 0.001*"well" + 0.001*"get" + 0.001*"time" + 0.001*"also" + 0.001*"find"')

(4, '0.013*"israel" + 0.007*"israeli" + 0.006*"jews" + 0.005*"arab" + 0.005*"jewish" + 0.004*"ed" + 0.003*"people" + 0.003*"palestinian" + 0.003*"state" + 0.003*"arabs"')

(5, '0.002*"universe" + 0.001*"think" + 0.001*"theory" + 0.001*"larson" + 0.001*"time" + 0.001*"people" + 0.001*"could" + 0.001*"physical" + 0.001*"space

In [68]:
# Experiment: fixed scalar priors alpha=0.01 and eta=0.01. No `passes`
# argument is given here, so gensim's default number of passes applies.
# random_state is pinned so that differences between the experiments come from
# the priors and not from a different random initialisation on each run.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=0.01,
    eta=0.01,
    random_state=42,
)
print_topics(lda_model, 10)

(0, '0.007*"people" + 0.004*"think" + 0.004*"armenian" + 0.004*"us" + 0.003*"state" + 0.003*"could" + 0.003*"israel" + 0.003*"right" + 0.003*"even" + 0.003*"many"')

(1, '0.006*"edu" + 0.006*"god" + 0.005*"think" + 0.005*"know" + 0.004*"get" + 0.004*"information" + 0.004*"jpeg" + 0.004*"turkey" + 0.004*"people" + 0.003*"book"')

(2, '0.008*"said" + 0.007*"people" + 0.006*"know" + 0.006*"israel" + 0.006*"us" + 0.005*"armenians" + 0.005*"say" + 0.005*"time" + 0.004*"went" + 0.004*"jews"')

(3, '0.008*"people" + 0.007*"armenian" + 0.005*"armenians" + 0.005*"turkish" + 0.005*"jesus" + 0.003*"say" + 0.003*"many" + 0.003*"well" + 0.003*"see" + 0.003*"war"')

(4, '0.006*"armenian" + 0.005*"armenians" + 0.004*"said" + 0.004*"time" + 0.004*"space" + 0.004*"people" + 0.004*"two" + 0.004*"us" + 0.003*"first" + 0.003*"also"')

(5, '0.006*"graphics" + 0.005*"also" + 0.005*"software" + 0.004*"image" + 0.004*"data" + 0.003*"people" + 0.003*"first" + 0.003*"use" + 0.003*"get" + 0.003*"edu"')

(6, '0.0