In [2]:
import sys
from gensim.utils import simple_preprocess
from utils import load_data
import gensim
from pprint import pprint
import spacy
import gensim.corpora as corpora
from gensim.models import TfidfModel
from gensim.models import LdaModel

print('Python %s on %s' % (sys.version, sys.platform))
sys.path.extend(['E:\\LDA_Abstract_README', 'E:/LDA_Abstract_README'])

n_topics = [10, 20, 27, 30]


textPre_FilePath = "../data/readme_corpus.txt"
lda_ModelPath = "./readme_model/"


def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuations

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, notAllowed_postags=None):
    """https://spacy.io/api/annotation"""
    if notAllowed_postags is None:
        notAllowed_postags = ['ADJ', 'ADV']
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in notAllowed_postags])
    return texts_out

data = load_data(textPre_FilePath)[351:361]
data_words = list(sent_to_words(data))
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, notAllowed_postags=['ADJ', 'ADV'])


# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# tfidf_model = TfidfModel(corpus)
# corpus = tfidf_model[corpus]

models = []
for n_t in n_topics:
    lda = LdaModel.load(lda_ModelPath+'lda_readme'+str(n_t))
    models.append(lda)

Python 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)] on win32


[nltk_data] Downloading package stopwords to E:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import re
# Print the Keyword in the 10 topics
import pandas as pd
model_outputs = []
for model in models:
    model_outputs.append(model.print_topics(num_topics=30, num_words=10))

models_topics = []
pattern = r'"(.*?)"'
for output in model_outputs:
    models_topics.append([])
    for topics in output:
        words = re.findall(pattern, topics[1])
        models_topics[-1].append(words)
pd.DataFrame(models_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,"[phyre, flownet, tenni, censu, plate, breast, ...","[mot, lightgbm, mathematica, sde, kalman, micr...","[googl, research, git, clone, trunk_subdir, fo...","[leakag, banana_banana, gqa, coreset, superres...","[readm, huggingfac, co, md, transform, timm, m...","[darknet, yolov, cfg, yolo, exe_detector, drop...","[mmlab_mmselfsup, heroku, nuscen, mmselfsup, j...","[garden, badg_furi, amr, rashwan_le, powershel...","[model, imag, train, py, pdf, network, dataset...","[probreg, superpixel, udac_drlnd, sage, univer...",...,,,,,,,,,,
1,"[starcraft_ii, pinn, eeg, orbit, pymarl, mcmc,...","[ssd, cascad, capsul, retinanet, fco, nuscen, ...","[check_mark, drug, catalyst, imgur, poetri, gr...","[imag, model, train, pdf, network, py, dataset...","[celeba, triplet, gcn, jindongwang, safeti, op...","[salient, rss, hallucin, superpixel, barycent,...","[garden, tf, tensorflow, model, badg_furi, mml...","[diffus, pointnet, modelnet, gene, lesion, cod...","[bnn, plant, clevr, snli, ape, espnet, gestur,...","[cutout, lightgbm, grad_cam, dart, ode, csi, m...",...,,,,,,,,,,
2,"[imit, forecast, extractor, starcraft_ii, cs_s...","[pointnet, hair, charlesq_pointnet, multifit, ...","[client, trade, pneumonia, voc, disea, wandb, ...","[imag, model, train, pdf, network, py, dataset...","[huggingfac, co, transform, doc, transport, fa...","[physionet, fco, notion, exercis, emnist, scal...","[tweet, defens, cascad, scannet, fl, plant, dy...","[microscopi, srd, ivrl, fluoresc_microscopi, v...","[torchvis, check_mark, forc, capsul, likelihoo...","[mmlab_mmselfsup, crowd, mmselfsup, merlin, pa...",...,"[sar, eng, superresolut, udac_drlnd, tednet, m...","[readm, md, diffus, distil, market, xformer, k...","[gh_reagent, circleci, meme, codecov, signatur...","[cgan, stereo, imput, voxceleb, vizdoom, ode, ...","[mtl, bandit, grad_cam, poetri, inject, greedi...","[breast_cancer, shufflenet, advertis, gestur, ...","[moco, fpn, storag_googleapi, maskrcnn_benchma...",,,
3,"[pointnet, modelnet, scannet, voxel, shapenet,...","[yelp, poetri, fsrcnn, elasticsearch, muhammad...","[drug, dynet, tt, mathematica, xai, gpflow_gpf...","[dblp, kornia, pneumonia, alt, disea, manipul,...","[wavenet, signatur, kd, ssl, phyre, coreset, m...","[particl, mtl, occlus, mcd, fiber, davi, appl_...","[dna, homographi, piano, extractor, imbal, wri...","[mesh, heroku, squeez_excit, hallucin, aw, imp...","[check_mark, codecog_latex, chess, linemod, es...","[carla, selector, cs_vmnih, foreground, tagger...",...,"[client, wasserstein, expans, hrnet, salient, ...","[sentiment, gin, dgl, forecast, hrnet_hrnet, s...","[brat, tumor, brain_tumor, dice, slice, ship, ...","[imag, model, train, pdf, network, py, dataset...","[grad_cam, gat, breast_cancer, patent, probreg...","[gym, replay, mmlab_mmselfsup, dqn, trial, snn...","[ode, shufflenet, bertmodel, cocotalk, soccer,...","[tg_salt, smile, salt, ga, film, corenlp, frag...","[mxnet, jindongwang, gene, uniti, ffhq, cpn, s...","[garden, tf, model, tensorflow, badg_furi, pac..."


In [4]:
doc_topics = []
for i, model in enumerate(models):
    topics_dis = []
    for doc in corpus:
        model_words = models_topics[i]
        topic_idx = max(model.get_document_topics(doc), key=lambda item: item[1])[0]
        model_words = model_words[topic_idx]
        topics = []
        doc_words = [id2word[word_id] for word_id, _ in doc]
        for doc_word in doc_words:
            for model_word in model_words:
                if model_word == doc_word:
                    topics.append(model_word)
        topics_dis.append(topics)
    doc_topics.append(topics_dis)

model_doc_topic = pd.DataFrame(doc_topics)
model_doc_topic.index = ["10 Topics", "20 Topics", "27 Topics", "30 Topics"]
columns = []
for i in range(10):
    columns.append("README " + str(i+1))
model_doc_topic.columns = columns
model_doc_topic

Unnamed: 0,README 1,README 2,README 3,README 4,README 5,README 6,README 7,README 8,README 9,README 10
10 Topics,[],[],[],[doc],[],[],[],[],[doc],"[doc, md, transform]"
20 Topics,[],[],[],[datum],[],"[model, pdf]",[datum],"[model, pdf]","[model, network, train, pdf, py]","[dataset, datum, model, network, test, train, ..."
27 Topics,[],[],[],[datum],[],"[model, pdf]",[datum],"[model, pdf]","[model, network, train, pdf, py]","[dataset, datum, model, network, test, train, ..."
30 Topics,"[dataset, datum, model, network, test, train]",[],"[dataset, datum, model, network, test, train]",[datum],"[dataset, datum, model, network, test, train]","[model, pdf]",[datum],"[model, pdf]","[model, network, train, pdf, py]","[dataset, datum, model, network, test, train, ..."
