In [None]:
import logging
import pickle
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy

# Seed NumPy's global random state so repeated runs see the same numbers.
np.random.seed(2020)

# Fetch the WordNet corpus (WordNetLemmatizer is used by lemmatize_stemming)
# and build the shared English Snowball stemmer.
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Project locations; the model and dataset directories keep a trailing slash.
base_path = "D:/work/stackoverflow"
base_model = f"{base_path}/models/"
base_dataset = f"{base_path}/dataset/"

In [None]:
# Load the pre-tokenised document shards and stack them into one Series.
# Series.append was removed in pandas 2.0, so the shards are collected in a
# list and concatenated once (this also avoids re-copying on every append).
# range(1, 2) currently selects only shard 1; widen it to load more shards.
# NOTE: read_pickle executes arbitrary code on load -- only use trusted files.
shard_ids = range(1, 2)
processed_docs = pd.concat(
    # base_dataset already ends with "/", so no extra separator is added.
    [pd.read_pickle(f"{base_dataset}proc_docs{i}.ser") for i in shard_ids],
    ignore_index=True,
)

In [None]:
# Quick look at what was loaded: number of documents, then the first five.
print(processed_docs.index.size)
print(processed_docs.head(5))

In [5]:
# Vocabulary over all documents, pruned of very common tokens
# (no_above=0.4) and capped at keep_n=300000 entries.
# Alternative pruning tried earlier: no_below=30, no_above=0.6, keep_n=300000.
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_above=0.4, keep_n=300000)

# One bag-of-words vector per document.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# TF-IDF weights are fitted on the BoW corpus; indexing the model with the
# corpus yields the re-weighted view.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show the dictionary summary followed by the type of each derived object.
print(dictionary, type(bow_corpus), type(tfidf), type(corpus_tfidf), sep="\n")

Dictionary(64490 unique tokens: ['add', 'check', 'constraint', 'creat', 'creation']...)
<class 'list'>
<class 'gensim.models.tfidfmodel.TfidfModel'>
<class 'gensim.interfaces.TransformedCorpus'>


In [6]:
# Load the previously trained LDA model (presumably 100 topics, judging by
# the file name -- confirm via lda_model_tfidf.num_topics).
# The path is passed directly: gensim.test.utils.datapath() resolves names
# inside gensim's bundled test-data directory and only appeared to work
# because base_model is an absolute path on Windows.
lda_model_tfidf = gensim.models.LdaModel.load(f"{base_model}lda/model_100n")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one topic distribution and every row
    of a corpus of topic distributions.

    Parameters
    ----------
    query : array-like, shape (num_topics,)
        Topic distribution of the query document.
    matrix : array-like, shape (num_docs, num_topics)
        One topic distribution per corpus document (a single 1-D
        distribution is treated as a one-document corpus).

    Returns
    -------
    numpy.ndarray, shape (num_docs,)
        Distance from ``query`` to each document; smaller means more similar.

    Raises
    ------
    ValueError
        If ``query`` and the rows of ``matrix`` differ in length.
    """
    query = np.asarray(query, dtype=float)
    matrix = np.atleast_2d(np.asarray(matrix, dtype=float))

    # Work column-wise: each column of q is one document's distribution.
    q = matrix.T
    # Repeat the query in EVERY column. Filling only the first column leaves
    # the remaining columns all-zero, which normalise to NaN inside entropy().
    p = np.broadcast_to(query[:, None], q.shape)

    m = 0.5 * (p + q)
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding can push a zero divergence slightly negative; clamp before sqrt.
    return np.sqrt(np.clip(divergence, 0.0, None))

def get_most_similar_documents(query,matrix,k=10):
    """
    Rank the corpus by Jensen-Shannon distance to ``query`` and return the
    positional indices (rows of ``matrix``) of the ``k`` closest documents,
    nearest first.
    """
    distances = jensen_shannon(query, matrix)  # one distance per corpus row
    nearest_first = np.argsort(distances)      # ascending: smallest distance first
    return nearest_first[:k]

def clear_text(text):
    """
    Strip markup from an HTML fragment.

    First removes every ``<code>...</code>`` element together with its
    contents (which may span several lines), then removes all remaining
    tags while keeping the text between them.

    Returns the cleaned string.
    """
    # Raw strings avoid the invalid "\/" escape of the old pattern, and DOTALL
    # lets "." cross newlines instead of the slow "(.|\n)" alternation.
    text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
    # Any remaining tag: "<", one or more non-">" characters, ">".
    text = re.sub(r'<[^>]+>', '', text)
    return text

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma with the shared ``stemmer``."""
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """
    Turn raw (possibly HTML) text into a list of normalised tokens.

    The text is stripped of markup with ``clear_text``, tokenised with
    gensim's ``simple_preprocess``, filtered to drop gensim stopwords and
    tokens of three characters or fewer, and each surviving token is passed
    through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(clear_text(text))
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stopwords
    ]

In [0]:
% time
doc_topic_dist = np.zeros((len(lda_model_tfidf[corpus_tfidf]), lda_model_tfidf.num_topics))
for row, doc in enumerate(lda_model_tfidf[corpus_tfidf]):
  for word in doc:
    doc_topic_dist[row][word[0]] = word[1]
to_save = doc_topic_dist.tolist()
pickle.dump(f"{base_model}doc_topic_dist")
del to_save
# doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda_model_tfidf[corpus_tfidf]])
doc_topic_dist.shape

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 6.91 µs


In [0]:
% time
unseen_document = """I have an external (to the component), observable object that I want to listen for changes on. When the object is updated it emits change events, and then I want to rerender the component when any change is detected.

With a top-level React.render this has been possible, but within a component it doesn't work (which makes some sense since the render method just returns an object).

Here's a code example:"""
new_bow = dictionary.doc2bow(preprocess(unseen_document))
new_tfidf = tfidf[new_bow]
new_doc_distribution = np.zeros(lda_model_tfidf.num_topics)
for tup in lda_model_tfidf.get_document_topics(bow=new_tfidf):
  new_doc_distribution[tup[0]] = tup[1]
# new_doc_distribution = np.array([tup[1] for tup in lda_model_tfidf.get_document_topics(bow=new_tfidf)])

In [0]:
# Sanity check: expect a 1-D vector with one slot per LDA topic.
new_doc_distribution.shape

In [0]:
# Positions (rows of doc_topic_dist) of the corpus documents nearest to the
# query under Jensen-Shannon distance; k is left at its default of 10.
most_sim_ids = get_most_similar_documents(new_doc_distribution, doc_topic_dist)

TypeError: ignored

In [0]:
# Score a short question against the model and list its topics, best first.
unseen_document = """Can you force a React component to rerender without calling setState?"""
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
# NOTE(review): the raw bag-of-words vector is scored here, while the
# similarity cell applies `tfidf` before the model -- confirm which is intended.
topic_scores = sorted(lda_model_tfidf[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in topic_scores:
    print("Score: {}\t Topic: {}\t index {}".format(score, lda_model_tfidf.print_topic(index, 5), index))

Score: 0.27649393677711487	 Topic: 0.313*"react" + 0.148*"nativ" + 0.048*"context" + 0.043*"constructor" + 0.034*"compon"	 index 54
Score: 0.21279887855052948	 Topic: 0.143*"templat" + 0.100*"extern" + 0.064*"config" + 0.062*"direct" + 0.056*"overrid"	 index 87
Score: 0.20730245113372803	 Topic: 0.176*"extract" + 0.065*"featur" + 0.060*"scale" + 0.049*"extend" + 0.041*"backend"	 index 47
Score: 0.16626188158988953	 Topic: 0.147*"hide" + 0.117*"block" + 0.097*"kotlin" + 0.065*"schedul" + 0.064*"upgrad"	 index 15


In [0]:
# Score a longer question body against the model and list its topics, best first.
unseen_document = """I have an external (to the component), observable object that I want to listen for changes on. When the object is updated it emits change events, and then I want to rerender the component when any change is detected.

With a top-level React.render this has been possible, but within a component it doesn't work (which makes some sense since the render method just returns an object).

Here's a code example:"""
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
# NOTE(review): the raw bag-of-words vector is scored here, while the
# similarity cell applies `tfidf` before the model -- confirm which is intended.
topic_scores = sorted(lda_model_tfidf[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in topic_scores:
    print("Score: {}\t Topic: {}\t index {}".format(score, lda_model_tfidf.print_topic(index, 5), index))

Score: 0.4310450553894043	 Topic: 0.102*"render" + 0.091*"state" + 0.082*"ionic" + 0.071*"xamarin" + 0.061*"compon"	 index 72
Score: 0.107362762093544	 Topic: 0.201*"firebas" + 0.089*"retriev" + 0.067*"socket" + 0.067*"drop" + 0.053*"databas"	 index 11
Score: 0.09919276833534241	 Topic: 0.143*"templat" + 0.100*"extern" + 0.064*"config" + 0.062*"direct" + 0.056*"overrid"	 index 87
Score: 0.07991326600313187	 Topic: 0.114*"join" + 0.093*"rail" + 0.091*"selenium" + 0.063*"syntax" + 0.059*"sqlite"	 index 67
Score: 0.07872338593006134	 Topic: 0.133*"event" + 0.108*"handl" + 0.080*"stream" + 0.079*"stop" + 0.068*"materi"	 index 2
Score: 0.059320658445358276	 Topic: 0.131*"download" + 0.117*"detect" + 0.074*"widget" + 0.062*"opencv" + 0.051*"privat"	 index 71
Score: 0.04913027584552765	 Topic: 0.145*"argument" + 0.087*"miss" + 0.078*"specifi" + 0.069*"level" + 0.044*"foreach"	 index 85
Score: 0.04315628856420517	 Topic: 0.165*"jqueri" + 0.123*"charact" + 0.081*"invalid" + 0.067*"uniqu" + 0.06