In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import logging
import pickle
import spacy
import math

import gensim.corpora as corpora

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.test.utils import datapath

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy
from tempfile import TemporaryFile

from scipy.special import (entr, rel_entr)
from numpy import (arange, putmask, ravel, ones, shape, ndarray, zeros, floor,
                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
                   nan, inf, isinf, NINF, empty)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

np.random.seed(2020)

nltk.download('wordnet')
stemmer = SnowballStemmer('english')

DOC_COUNT = 10000

[nltk_data] Downloading package wordnet to /home/p4l/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
my_stop_words = STOPWORDS.union(set(['use', 'be', 'work', 'user', 'try', 'cell',
                                     'row', 'want', 'item', 'go', 'get', 'add', 'went', 'tried',
                                    'return', 'sort', 'test', 'run', 'check', 'click']))

In [3]:
base_path = "/home/p4l/work/stackoverflow/"
base_model = base_path + "models_data/"
base_dataset = base_path + "dataset/"
base_model_lda = base_model + "lda/"

In [4]:
def m_entropy(pk, qk=None, base=None, axis=0):
    pk = asarray(pk)
    pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
    if qk is None:
        vec = entr(pk)
    else:
        qk = asarray(qk)
        #if qk.shape != pk.shape:
            #raise ValueError("qk and pk must have same shape.")
        qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
        vec = rel_entr(pk, qk)
    S = np.sum(vec, axis=axis)
    if base is not None:
        S /= log(base)
    return S

def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    # lets keep with the p,q notation above
    p = query[None,:].T # take transpose
    q = matrix.T # transpose matrix

    #new = np.zeros((q.shape[0], q.shape[1]))
    #new[:q.shape[0], :1] = p
    #p = new
    
    m = 0.5*(p + q)
    return np.sqrt(0.5*(m_entropy(p,m) + m_entropy(q,m)))

def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims = jensen_shannon(query,matrix) # list of jensen shannon distances
    out = [dist for dist in sims if not math.isnan(dist)]
    return sims.argsort()[:k], sorted(sims, reverse=True) # the top k positional index of the smallest Jensen Shannon distances

def clear_text(text):
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    text = re.sub(r'(\<(/?[^>]+)>)', '', text)
    text = re.sub("[\'\"\\/\@\%\(\)\~\`\{\}]", '', text)
    text = re.sub('\s+', ' ', text)
    
    return text

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    text = clear_text(text)
    result = []
    #result = [token in gensim.utils.simple_preprocess(text, deacc=True) if ((token not in gensim.parsing.preprocessing.STOPWORDS) and len(token) > 1) == True]
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if (token not in my_stop_words) and len(token) > 1:
            #result.append(lemmatize_stemming(token))
            result.append(token)
    return result

In [5]:
def split_tags(text):
    if text == '' or text == ' ':
        return text
    else:
        return text.replace('|', ' ')

def add_string(text, tags, n=3):
    tags = split_tags(tags)
    tags = ' ' + tags
    for i in range(n):
        text += tags
    return text

In [6]:
data = pd.read_csv(base_dataset + 'large_data.csv')
data = data.head(DOC_COUNT)

titles = data['post_title'].values
tags = data['post_tags'].map(split_tags).values
data = [add_string(row['post_body'], row['post_tags'], 6) for _, row in data.iterrows()]
#data = data['post_body'].values

data = np.append(np.array(data), titles)
data = np.append(data, tags)
data = np.append(data, tags)
data = np.append(data, tags)
data = np.append(data, tags)

print(len(data))
del titles
del tags

60000


In [7]:
doc_sample = data[3]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\ntokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['<p>After', 'mounting', 'the', 'database', 'I', 'tried', 'executing', 'these', 'command-', 'alter', 'database', 'open;', 'emanating', 'from', 'a', 'shutdown', 'initialization', 'error', 'but', 'my', 'oracle', 'kept', 'returning', 'an', 'ORA-00600:', 'internal', 'error', 'code,', 'arguments', '[dbkif_find_next_record_1],', '[],', '[],', '[].\nPlease,', 'how', 'do', 'I', 'proceed', 'from', 'here?</p>', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms']


tokenized and lemmatized document: 
['mounting', 'database', 'executing', 'command', 'alter', 'database', 'open', 'emanating', 'shutdown', 'initialization', 'error', 'oracle', 'kept', 'returning', 'ora', 'internal', 'error', 'code', 'arguments', 'proceed', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms', 'datab

In [8]:
processed_docs = np.array([preprocess(x) for x in data])
del data

In [9]:
bigram = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags and not token.lemma_ in my_stop_words])
    return texts_out

In [11]:
processed_docs = make_bigrams(processed_docs)
processed_docs = make_trigrams(processed_docs)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

processed_docs = lemmatization(processed_docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=200, 
                                           random_state=100,
                                           update_every=2,
                                           chunksize=50000,
                                           passes=5)

print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))

coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.388947489007123

Coherence Score:  0.47916541378219657


In [15]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.038*"hour" + 0.027*"minute" + 0.026*"difference" + 0.026*"unity" + 0.017*"issue" + 0.015*"code" + 0.013*"long" + 0.012*"second" + 0.011*"cloudwatch" + 0.011*"problem"
Topic: 1 Word: 0.066*"angular" + 0.035*"promise" + 0.021*"follow" + 0.013*"form" + 0.013*"error" + 0.012*"proxy" + 0.011*"import" + 0.010*"html" + 0.010*"file" + 0.009*"observer"
Topic: 2 Word: 0.046*"activity" + 0.037*"mode" + 0.019*"product" + 0.017*"woocommerce" + 0.017*"async" + 0.016*"issue" + 0.015*"page" + 0.013*"custom" + 0.012*"device" + 0.012*"code"
Topic: 3 Word: 0.037*"shape" + 0.025*"table" + 0.025*"css" + 0.023*"operator" + 0.023*"target" + 0.017*"width" + 0.016*"core" + 0.014*"size" + 0.012*"mock" + 0.011*"triangle"
Topic: 4 Word: 0.332*"database" + 0.019*"server" + 0.015*"firebase_firebase_realtime" + 0.012*"file" + 0.011*"connect" + 0.010*"button" + 0.010*"connection" + 0.009*"testing" + 0.009*"folder" + 0.008*"design"
Topic: 5 Word: 0.108*"animation" + 0.022*"code" + 0.014*"change" + 0.0

In [16]:
lda_model.save(datapath(base_model_lda + "model_test200n"))