In [92]:
import sys
from gensim.utils import simple_preprocess
from utils import load_data
import gensim
from pprint import pprint
import spacy
import gensim.corpora as corpora
from gensim.models import TfidfModel
from gensim.models import LdaModel

print('Python %s on %s' % (sys.version, sys.platform))
sys.path.extend(['E:\\LDA_Abstract_README', 'E:/LDA_Abstract_README'])

n_topics = [10, 20, 27, 30]


textPre_FilePath = "../data/abstract_corpus.txt"
lda_ModelPath = "./abstract_model/"


def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuations

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, notAllowed_postags=None):
    """https://spacy.io/api/annotation"""
    if notAllowed_postags is None:
        notAllowed_postags = ['ADJ', 'ADV']
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in notAllowed_postags])
    return texts_out

data = load_data(textPre_FilePath)[351:361]
data_words = list(sent_to_words(data))
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, notAllowed_postags=['ADJ', 'ADV'])


# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# tfidf_model = TfidfModel(corpus)
# corpus = tfidf_model[corpus]

models = []
for n_t in n_topics:
    lda = LdaModel.load(lda_ModelPath+'lda_abstract'+str(n_t))
    models.append(lda)

Python 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)] on win32


[nltk_data] Downloading package stopwords to E:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
import re
# Print the Keyword in the 10 topics
import pandas as pd
model_outputs = []
for model in models:
    model_outputs.append(model.print_topics(num_topics=30, num_words=10))

models_topics = []
pattern = r'"(.*?)"'
for output in model_outputs:
    models_topics.append([])
    for topics in output:
        words = re.findall(pattern, topics[1])
        models_topics[-1].append(words)
pd.DataFrame(models_topics)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,"[student, teacher, ner, tweet, session, gestur...","[kg, composition, parti, kb, beta, gene, extra...","[equivari, waveform, price, dictionari, star, ...","[equilibrium, dr, theorem, ct, ridg, newton, m...","[protein, gait, pi, genom, contact, cd, sde, b...","[registr, simplif, mot, reaction, induct, driv...","[caption, summar, review, anomali, road, summa...","[hash, fingerprint, epsilon, hsi, homolog, mal...","[model, imag, network, task, method, languag, ...","[skeleton, reid, independ, man, cgan, ib, quan...",...,,,,,,,,,,
1,"[ner, composition, comprehen, price, genom, il...","[session, citat, parti, continu, mobilenet, ga...","[instrument, surrog, hate_speech, dictionari, ...","[bit, entropi, layout, foundat, gpt, tabl, rem...","[salienc, keypoint, edit, corpora, player, tra...","[model, imag, network, task, method, languag, ...","[signatur, star, shortcut, sod, repair, reddit...","[emot, sgd, gestur, oracl, variant, burden, gr...","[beam, conflict, bo, greedi, dst, csi, smartph...","[ood, vit, mot, occup, temperatur, band, heatm...",...,,,,,,,,,,
2,"[norm, imput, cam, curvatur, subword, cp, ci, ...","[waveform, composition, tabl, column, fingerpr...","[formula, conform, cgan, heatmap, logic, algeb...","[rl, molecul, suit, option, parameter, script,...","[toolkit, ode, disc, cortex, inherit, sac, tur...","[wikipedia, star, hoi, read, corefer, sde, asc...","[explan, style, trajectori, energi, textur, ca...","[tag, pi, smartphon, lie, atom, glass, termino...","[rnn, recurr, redund, dp, absolut, obtain, del...","[independ, dg, temperatur, probe, homographi, ...",...,"[memor, gestur, compen, scikit, kd, abus, math...","[person, ct, reid, hop, i, ad, d, person_ident...","[viewpoint, kg, paint, mlp, eye, harmon, drl, ...","[cd, sport, plastic, doctor, ga, ace, fair, co...","[sr, lesion, denois, po, bound, shadow, lr, hr...","[emot, inspect, hash, request, homolog, analyz...","[model, imag, network, task, method, languag, ...",,,
3,"[view, fusion, rank, tensor, rotat, descriptor...","[tracker, lesion, anim, keyword, tabl, mot, ma...","[brain_tumor, tumor, site, da, brat, emb, era,...","[reid, induct, crop, person_identif, sde, mult...","[skeleton, transcript, waveform, patholog, asr...","[ssl, beta, subword, sport, har, satellit, now...","[music, correspond, registr, kg, mri, ray, pla...","[blur, logic, log_likelihood, galleri, isomorp...","[ood, forecast, mention, gp, imbal, ode, likel...","[realiz, ace, mrf, manufactur, multilay_percep...",...,"[quantum, mix, quantum_circuit, pronunci, mpc,...","[sketch, job, atom, draw, glass, disagr, grasp...","[summar, summari, ner, tweet, worker, custom, ...","[equilibrium, extrapol, driver, hypernetwork, ...","[model, imag, network, task, method, languag, ...","[sgd, gcn, phone, momentum, curvatur, encourag...","[mt, moder, minima, homolog, quadratur, pca, c...","[actor, emerg, discour, return, fingerprint, r...","[keypoint, learner, dialog, conver, complet, f...","[lm, drug, squar, analog, center, dictionari, ..."


In [109]:
doc_topics = []
for i, model in enumerate(models):
    topics_dis = []
    for doc in corpus:
        model_words = models_topics[i]
        topic_idx = max(model.get_document_topics(doc), key=lambda item: item[1])[0]
        model_words = model_words[topic_idx]
        topics = []
        doc_words = [id2word[word_id] for word_id, _ in doc]
        for doc_word in doc_words:
            for model_word in model_words:
                if model_word == doc_word:
                    topics.append(model_word)
        topics_dis.append(topics)
    doc_topics.append(topics_dis)

model_doc_topic = pd.DataFrame(doc_topics)
model_doc_topic.index = ["10 Topics", "20 Topics", "27 Topics", "30 Topics"]
columns = []
for i in range(10):
    columns.append("Abstract " + str(i+1))
model_doc_topic.columns = columns
model_doc_topic

Unnamed: 0,Abstract 1,Abstract 2,Abstract 3,Abstract 4,Abstract 5,Abstract 6,Abstract 7,Abstract 8,Abstract 9,Abstract 10
10 Topics,"[approach, method, network]","[imag, languag, task]","[method, model]","[method, network, imag, task]","[approach, method, model]","[method, network, task]","[approach, datum, featur, represent]","[network, languag, task]","[method, network, task, represent]","[method, imag]"
20 Topics,"[approach, method, network]","[imag, languag, task]","[method, model]","[method, network, imag, task]","[approach, method, model]","[method, network, task]","[approach, datum, featur, represent]","[network, languag, task]","[method, network, task, represent]","[method, imag]"
27 Topics,"[approach, method, network]","[imag, languag, task]","[method, model]","[method, network, imag, task]","[approach, method, model]","[method, network, task]","[approach, datum, featur, represent]","[network, languag, task]","[method, network, task, represent]","[method, imag]"
30 Topics,"[approach, method, network]","[imag, languag, task]","[method, model]","[method, network, imag, task]","[approach, method, model]","[method, network, task]","[approach, datum, featur, represent]","[network, languag, task]","[method, network, task, represent]","[method, imag]"
