In [1]:
import sys
# Report which interpreter and platform this kernel is running on.
print('Python %s on %s' % (sys.version, sys.platform))

# Put the project directory on the import path (presumably so that
# `from utils import load_data` resolves -- confirm). Entries are appended only
# when missing, so re-running this cell does not pile up duplicates in sys.path.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from the notebook's location instead.
for _project_root in ('E:\\LDA_Abstract_README', 'E:/LDA_Abstract_README'):
    if _project_root not in sys.path:
        sys.path.append(_project_root)


from utils import load_data
import gensim
from gensim.utils import simple_preprocess

# --- Notebook configuration -------------------------------------------------

# Input corpus file and output directory prefix for the trained LDA models.
textPre_FilePath = "../data/readme_corpus.txt"
lda_ModelPath = "./readme_model/"

# One LDA model is trained for every topic count listed here.
n_topics = [10, 20, 27, 30]

# Only the first `n_docs` loaded documents are used.
n_docs = 10_000

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, so non-string entries are
    accepted. ``deacc=True`` asks gensim to strip accent marks from the tokens.

    Args:
        sentences: Iterable of raw documents.

    Yields:
        The token list produced by ``simple_preprocess`` for each document,
        in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

# Read the corpus and keep only the first `n_docs` entries.
data = load_data(textPre_FilePath)
data = data[:n_docs]

# Tokenize every retained document up front (sent_to_words is a generator).
data_words = [tokens for tokens in sent_to_words(data)]

Python 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)] on win32


In [2]:
# Phrase detection happens in two stages; a higher `threshold` yields fewer phrases.

# Stage 1: learn two-token phrases from the tokenized documents, then freeze
# the model into a Phraser, which is the faster way to apply it to a document.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Stage 2: run phrase detection again over the bigram-transformed documents
# so that three-token phrases can be picked up, and freeze that model as well.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [3]:
# Stop-word list: NLTK's English stop words plus a few extra tokens to filter.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()`` and
            ``simple_preprocess`` before filtering.

    Returns:
        A list containing, for every input document, the list of tokens that
        are not in the module-level ``stop_words`` collection.
    """
    # Build the lookup once per call: membership tests against a set are
    # constant time, whereas the module-level list would be scanned linearly
    # for every single token of every document.
    blocked = frozenset(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document through the module-level ``bigram_mod`` model.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list with ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list with ``trigram_mod[bigram_mod[doc]]`` for each document, in
        input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, notAllowed_postags=None):
    """Lemmatize documents, skipping tokens whose POS tag is excluded.

    Each document's tokens are joined with single spaces, processed by the
    module-level ``nlp`` callable, and every resulting token whose ``pos_`` is
    not in ``notAllowed_postags`` contributes its ``lemma_``.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of tokenized documents (each an iterable of strings).
        notAllowed_postags: POS tags to drop. ``None`` means ``['ADJ', 'ADV']``.

    Returns:
        A list with one list of lemma strings per input document.
    """
    excluded_tags = ['ADJ', 'ADV'] if notAllowed_postags is None else notAllowed_postags
    return [
        [tok.lemma_ for tok in nlp(" ".join(tokens)) if tok.pos_ not in excluded_tags]
        for tokens in texts
    ]

[nltk_data] Downloading package stopwords to E:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import spacy

# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency); lemmatization() only reads lemma_ and pos_.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Step 1: filter stop words out of the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Step 2: apply the bigram phrase model to the filtered documents.
data_words_bigrams = make_bigrams(data_words_nostops)

# Step 3: lemmatize, dropping adjectives and adverbs (all other POS tags are kept).
data_lemmatized = lemmatization(
    data_words_bigrams,
    notAllowed_postags=['ADJ', 'ADV'],
)

In [5]:
import gensim.corpora as corpora
from gensim.models import TfidfModel

# Create Dictionary (token <-> integer id mapping) from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: bag-of-words vector for every document.
bow_corpus = [id2word.doc2bow(text) for text in texts]

# Fit a TF-IDF model on the bag-of-words corpus and apply it to the whole corpus.
tfidf_model = TfidfModel(bow_corpus)

# `tfidf_model[...]` only wraps the corpus lazily: the weighting is redone for
# every document on every iteration. The corpus is iterated once per training
# pass for each model (and again by pyLDAvis), so materialize the weighted
# vectors a single time instead of recomputing them on every sweep.
corpus = list(tfidf_model[bow_corpus])

In [None]:
# Build LDA model: train one model per entry in `n_topics`, persist it, and
# write an interactive pyLDAvis report next to the notebook.
import os

import pyLDAvis
try:
    # Newer pyLDAvis releases expose the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases ship it as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

epoch = 500  # number of passes over the corpus per model
models = []

# Make sure the output directory exists up front; otherwise the first
# `save` call fails only after a full (and slow) training run.
os.makedirs(lda_ModelPath, exist_ok=True)

for num_topics in n_topics:
    print("Training LDA model for", num_topics, "topics.")
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           chunksize=100,
                                           passes=epoch,
                                           alpha=0.042,
                                           per_word_topics=True,
                                           workers=13)
    models.append(lda_model)

    # Persist the trained model before rendering the visualization, so that a
    # failure in the pyLDAvis step cannot discard the training result.
    lda_model.save(os.path.join(lda_ModelPath, 'lda_readme' + str(num_topics)))
    print("Save the model.")

    pic = gensimvis.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(pic, 'lda_readme_pass' + str(num_topics) + '.html')

Training LDA model for 10 topics.
Save the model.
Training LDA model for 20 topics.
Save the model.
Training LDA model for 27 topics.
