In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import logging
import pickle
import spacy
import math

import gensim.corpora as corpora

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.test.utils import datapath

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy
from tempfile import TemporaryFile

from scipy.special import (entr, rel_entr)
from numpy import (arange, putmask, ravel, ones, shape, ndarray, zeros, floor,
                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
                   nan, inf, isinf, NINF, empty)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

np.random.seed(2020)

nltk.download('wordnet')
stemmer = SnowballStemmer('english')

DOC_COUNT = 100

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\P4L\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
my_stop_words = STOPWORDS.union(set(['use', 'be', 'work', 'user', 'try', 'cell',
                                     'row', 'want', 'item', 'go', 'get', 'add', 'went', 'tried',
                                    'return', 'sort', 'test', 'run', 'check', 'click', 'hour', 'minute', 'second',
                                    'version', 'app', 'paragraph', 'error', 'log', 'press',
                                    'need', 'feed', 'thank', 'way', 'like', 'kill', 'help']))

In [7]:
base_path = "D:/work/stackoverflow"
base_model = base_path + "models_data/"
base_dataset = base_path + "dataset/"
base_model_lda = base_model + "lda/"

In [4]:
def m_entropy(pk, qk=None, base=None, axis=0):
    pk = asarray(pk)
    pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
    if qk is None:
        vec = entr(pk)
    else:
        qk = asarray(qk)
        #if qk.shape != pk.shape:
            #raise ValueError("qk and pk must have same shape.")
        qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
        vec = rel_entr(pk, qk)
    S = np.sum(vec, axis=axis)
    if base is not None:
        S /= log(base)
    return S

def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    # lets keep with the p,q notation above
    p = query[None,:].T # take transpose
    q = matrix.T # transpose matrix

    #new = np.zeros((q.shape[0], q.shape[1]))
    #new[:q.shape[0], :1] = p
    #p = new
    
    m = 0.5*(p + q)
    return np.sqrt(0.5*(m_entropy(p,m) + m_entropy(q,m)))

def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims = jensen_shannon(query,matrix) # list of jensen shannon distances
    out = [dist for dist in sims if not math.isnan(dist)]
    return sims.argsort()[:k], sorted(sims, reverse=True) # the top k positional index of the smallest Jensen Shannon distances

def clear_text(text):
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    text = re.sub(r'(\<(/?[^>]+)>)', '', text)
    text = re.sub("[\'\"\\/\@\%\(\)\~\`\{\}]", '', text)
    text = re.sub('\s+', ' ', text)
    
    return text

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    text = clear_text(text)
    result = []
    #result = [token in gensim.utils.simple_preprocess(text, deacc=True) if ((token not in gensim.parsing.preprocessing.STOPWORDS) and len(token) > 1) == True]
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if (token not in my_stop_words) and len(token) > 1:
            #result.append(lemmatize_stemming(token))
            result.append(token)
    return result

In [5]:
def split_tags(text):
    if not isinstance(text, str) and math.isnan(text):
        return ''
    if text == '' or text == ' ':
        return text
    else:
        return text.replace('|', ' ')

def add_string(text, tags, n=3):
    tags = split_tags(tags)
    tags = ' ' + tags
    i = 0
    for i in range(n):
        if i % 2 == 0:
            text += tags
        else:
            text = tags + text
    return text

In [8]:
data = pd.read_csv(base_dataset + 'large_data.csv')
if DOC_COUNT != -1:
    data = data.head(DOC_COUNT)

titles = data['post_title'].values
tags = data['post_tags'].map(split_tags).values
data = [add_string(row['post_body'], row['post_tags'], 3) for _, row in data.iterrows()]
#data = data['post_body'].values

[data.append(z) for z in titles]
[data.append(z) for z in tags]
[data.append(z) for z in tags]
[data.append(z) for z in tags]
[data.append(z) for z in tags]

#data = np.array(data)
#data = np.append(data, titles)
#data = np.append(data, tags)
#data = np.append(data, tags)
#data = np.append(data, tags)
#data = np.append(data, tags)

print(len(data))
del titles
del tags

FileNotFoundError: [Errno 2] File D:/work/stackoverflowdataset/large_data.csv does not exist: 'D:/work/stackoverflowdataset/large_data.csv'

In [7]:
doc_sample = data[3]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\ntokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', 'database', 'rdbms', 'ordbms<p>After', 'mounting', 'the', 'database', 'I', 'tried', 'executing', 'these', 'command-', 'alter', 'database', 'open;', 'emanating', 'from', 'a', 'shutdown', 'initialization', 'error', 'but', 'my', 'oracle', 'kept', 'returning', 'an', 'ORA-00600:', 'internal', 'error', 'code,', 'arguments', '[dbkif_find_next_record_1],', '[],', '[],', '[].\nPlease,', 'how', 'do', 'I', 'proceed', 'from', 'here?</p>', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms']


tokenized and lemmatized document: 
['database', 'rdbms', 'ordbmsafter', 'mounting', 'database', 'executing', 'command', 'alter', 'database', 'open', 'emanating', 'shutdown', 'initialization', 'oracle', 'kept', 'returning', 'ora', 'internal', 'code', 'arguments', 'proceed', 'database', 'rdbms', 'ordbms', 'database', 'rdbms', 'ordbms']


In [8]:
processed_docs = np.array([preprocess(x) for x in data])
del data

In [9]:
bigram = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [60]:
bigram_mod.save(datapath(base_model + "ngrams/bigram_mod"))
trigram_mod.save(datapath(base_model + "ngrams/trigram_mod"))

In [10]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags and not token.lemma_ in my_stop_words])
    return texts_out

In [11]:
processed_docs = make_bigrams(processed_docs)
processed_docs = make_trigrams(processed_docs)

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

processed_docs = lemmatization(processed_docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=300,
                                           random_state=100,
                                           update_every=2,
                                           chunksize=50000,
                                           passes=15)

print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))

coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -18.07849637275526

Coherence Score:  0.47145036332862


In [15]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.265*"range" + 0.194*"certain" + 0.147*"dictionary" + 0.098*"criterion" + 0.080*"spinner" + 0.021*"paper" + 0.020*"toad" + 0.012*"unnecessary" + 0.011*"patient" + 0.011*"bellow"
Topic: 1 Word: 0.194*"drupal" + 0.061*"mule" + 0.036*"resttemplate" + 0.034*"objectid" + 0.033*"silverstripe" + 0.030*"microsoft_graph_sdks" + 0.023*"kendo" + 0.023*"livedata" + 0.021*"turtle" + 0.018*"egg"
Topic: 2 Word: 0.203*"publish" + 0.146*"bluetooth" + 0.141*"synchronization" + 0.065*"synchronize" + 0.041*"mathjax" + 0.031*"fish" + 0.026*"publishing" + 0.025*"deny" + 0.022*"putty" + 0.017*"portfolio"
Topic: 3 Word: 0.414*"mysql" + 0.389*"query" + 0.049*"description" + 0.016*"queryset" + 0.010*"valueerror" + 0.008*"thanks_advance" + 0.006*"lag" + 0.004*"follow" + 0.004*"primary_key" + 0.004*"malforme"
Topic: 4 Word: 0.531*"mongodb" + 0.185*"mongoose" + 0.155*"collection" + 0.018*"mix" + 0.015*"boto" + 0.011*"username" + 0.008*"emit" + 0.006*"aws_lambda" + 0.004*"enforce" + 0.004*"profiler"

In [16]:
lda_model.save(datapath(base_model_lda + "model_semi_final"))

In [17]:
def get_text_bow(text):
    text = preprocess(text)
    text = make_trigrams([text])[0]
    text = lemmatization([text])[0]
    bow_vector = dictionary.doc2bow(text)
    return bow_vector

def test_texts(text1, text2):
    bow1 = get_text_bow(text1)
    bow2 = get_text_bow(text2)
    for index, score in sorted(lda_model[bow1], key=lambda tup: -1*tup[1]):
        print(f"index: {index}, score {score}")
        #print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    print("_________________________________________")
    for index, score in sorted(lda_model[bow2], key=lambda tup: -1*tup[1]):
        print(f"index: {index}, score {score}")

In [18]:
data = pd.read_csv(base_dataset + 'large_data.csv')
data = data.head(DOC_COUNT)

In [56]:
row_n = 12
test_texts(data.loc[row_n, 'post_title'], data.loc[row_n, 'post_body'])
text1 = """VBA importing udf module from Add In to Workbook"""
text2 = """(TypeError: Cannot set property 'next' of null) When trying to make a linkedlist?"""
#test_texts(text1, text2)
print("______________________________")
print(data.loc[row_n, 'post_title'])
print("______________________________")
print(data.loc[row_n, 'post_body'])

index: 208, score 0.3270482122898102
index: 171, score 0.27251410484313965
index: 297, score 0.2024327516555786
_________________________________________
index: 171, score 0.48688679933547974
index: 154, score 0.07441804558038712
index: 204, score 0.07163151353597641
index: 149, score 0.07157592475414276
index: 91, score 0.039860039949417114
index: 240, score 0.03905803710222244
index: 41, score 0.03845934197306633
index: 73, score 0.036128051578998566
index: 136, score 0.03595936670899391
index: 98, score 0.035835426300764084
index: 250, score 0.03577928990125656
______________________________
Overlap multiple images at some pixel level using Imagemagick
______________________________
<p>For example, I have 4 images with a size of <strong>1000x800</strong>. I want to merge all these images into one image. I know there is a command of </p>

<pre><code>convert +append image[1-4].jpg output.jpg
</code></pre>

<p>But I want to merge the second image into the first image by <strong>overlap

In [57]:
def jensen_shannon_v(p, q):
    p = p[None,:].T
    q = q[None,:].T
    m = 0.5*(p + q)
    return np.sqrt(0.5*(entropy(p,m) + entropy(q,m)))

def title_body_sim(text1, text2, n_topics):
    bow1 = get_text_bow(text1)
    bow2 = get_text_bow(text2)
    p = np.zeros(n_topics)
    q = np.zeros(n_topics)
    for index, score in sorted(lda_model[bow1], key=lambda tup: -1*tup[1]):
        p[index] = score
    for index, score in sorted(lda_model[bow2], key=lambda tup: -1*tup[1]):
        q[index] = score
    return jensen_shannon_v(p, q)

In [59]:
title_body_sim(data.loc[row_n, 'post_title'], data.loc[row_n, 'post_body'], lda_model.num_topics)
#title_body_sim(text1, text2, lda_model.num_topics)

array([0.64129337])