In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import logging
import pickle
import spacy
import math

import gensim.corpora as corpora

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.test.utils import datapath

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy
from tempfile import TemporaryFile

from scipy.special import (entr, rel_entr)
from numpy import (arange, putmask, ravel, ones, shape, ndarray, zeros, floor,
                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
                   nan, inf, isinf, NINF, empty)

import logging
import warnings

# Root logger only emits ERROR records; DeprecationWarnings are hidden.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Seed NumPy's global random generator.
np.random.seed(2020)

# Fetch the WordNet corpus used by WordNetLemmatizer, then build the stemmer.
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

# gensim's built-in stop words plus extra words that preprocess() and
# lemmatization() filter out.
my_stop_words = STOPWORDS.union({
    'use', 'be', 'work', 'user', 'try', 'cell',
    'row', 'want', 'item', 'go', 'get', 'add', 'went', 'tried',
    'return', 'sort', 'test', 'run', 'check', 'click', 'hour', 'minute', 'second',
    'version', 'app', 'paragraph', 'error', 'log', 'press',
    'need', 'feed', 'thank', 'way', 'like', 'kill', 'help',
})

[nltk_data] Downloading package wordnet to /home/p4l/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Directory layout (plain strings with a trailing slash; later cells build
# file paths from them with "+").
base_path = "/home/p4l/work/stackoverflow/"
base_model = f"{base_path}models_data/"
base_dataset = f"{base_path}dataset/"
base_model_lda = f"{base_model}lda/"

In [61]:
def clear_text(text):
    """Strip markup and noise characters from an HTML (or plain-text) string.

    Steps, applied in order:

    1. remove every ``<code>...</code>`` element together with its contents
       (only the bare ``<code>`` opening tag, without attributes, is matched);
    2. remove all remaining tags, i.e. anything between ``<`` and ``>``;
    3. remove the characters ``'  "  /  @  %  (  )  ~  `  {  }``
       (a backslash is NOT removed);
    4. collapse every run of whitespace into a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # Local import: the notebook otherwise only gets ``re`` as a side effect
    # of ``from nltk.stem.porter import *``.
    import re

    # Raw strings avoid the invalid-escape warnings the non-raw patterns
    # triggered; DOTALL lets "." span newlines inside multi-line code blocks.
    text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r"""['"/@%()~`{}]""", '', text)
    text = re.sub(r'\s+', ' ', text)

    return text

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Uses a fresh ``WordNetLemmatizer`` and the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Clean ``text`` and tokenize it, dropping stop words.

    The text is passed through ``clear_text``, tokenized with gensim's
    ``simple_preprocess`` (``deacc=True``), and every token that is longer
    than one character and not in ``my_stop_words`` is kept, in order.

    Returns:
        list of token strings.
    """
    cleaned = clear_text(text)
    tokens = gensim.utils.simple_preprocess(cleaned, deacc=True)
    return [tok for tok in tokens if len(tok) > 1 and tok not in my_stop_words]

def split_tags(text):
    """Turn a ``|``-separated tag string into a space-separated one.

    Args:
        text: tag string such as ``"python|list"``; ``None`` or a float NaN
            (e.g. a missing value) is also accepted.

    Returns:
        ``text`` with every ``|`` replaced by a space, or ``''`` when
        ``text`` is ``None`` or NaN.
    """
    # Missing values: None used to crash inside math.isnan().
    if text is None:
        return ''
    if not isinstance(text, str) and math.isnan(text):
        return ''
    # '' and ' ' contain no '|', so they come back unchanged without a
    # dedicated branch.
    return text.replace('|', ' ')

def add_string(text, tags, n=3):
    """Attach the tags to ``text`` ``n`` times, alternating append/prepend.

    The tags (converted with ``split_tags``) are appended on even iterations
    and prepended on odd ones, so with the default ``n=3`` the result is
    ``"<tags> <text> <tags> <tags>"``.

    Args:
        text: the base text.
        tags: ``|``-separated tag string (or a missing value accepted by
            ``split_tags``).
        n: how many copies of the tags to attach.

    Returns:
        The augmented text; ``text`` unchanged when there are no tags.
    """
    tag_text = split_tags(tags).strip()
    if not tag_text:
        return text
    for i in range(n):
        if i % 2 == 0:
            text = text + ' ' + tag_text
        else:
            # A separator is required here as well; without it the last tag
            # was glued to the first word of the text.
            text = tag_text + ' ' + text
    return text

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns:
        list with one transformed document per input document.
    """
    with_bigrams = []
    for tokens in texts:
        with_bigrams.append(bigram_mod[tokens])
    return with_bigrams

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns:
        list with one transformed document per input document.
    """
    with_trigrams = []
    for tokens in texts:
        bigrammed = bigram_mod[tokens]
        with_trigrams.append(trigram_mod[bigrammed])
    return with_trigrams

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents with the module-level spaCy ``nlp``.

    Each document is joined with spaces and parsed; a token's lemma is kept
    when its part-of-speech tag is in ``allowed_postags`` and the lemma is
    not in ``my_stop_words``.

    Args:
        texts: iterable of token lists.
        allowed_postags: part-of-speech tags to keep (the default list is
            only read, never modified).

    Returns:
        list of lemma lists, one per input document.
    """
    def _keep(token):
        return token.pos_ in allowed_postags and token.lemma_ not in my_stop_words

    texts_out = []
    for sent in texts:
        parsed = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in parsed if _keep(token)])
    return texts_out

def get_text_bow(text):
    """Convert raw text into a bag-of-words vector for the LDA dictionary.

    Pipeline: ``preprocess`` -> ``make_trigrams`` -> ``lemmatization`` ->
    ``dictionary.doc2bow``.

    Returns:
        The value produced by ``dictionary.doc2bow`` for the processed tokens.
    """
    tokens = preprocess(text)
    # The helpers work on batches, so wrap the single document in a list and
    # unwrap the single result.
    (with_ngrams,) = make_trigrams([tokens])
    (lemmas,) = lemmatization([with_ngrams])
    return dictionary.doc2bow(lemmas)

def test_texts(text1, text2):
    """Print the LDA topic scores of two texts and return their score sums.

    For each text the topics returned by ``lda_model`` are printed in
    descending score order; a separator line is printed between the two
    listings.

    Returns:
        tuple ``(sum of text1 scores, sum of text2 scores)``.
    """
    def _print_and_sum(bow):
        total = 0.0
        ranked = sorted(lda_model[bow], key=lambda pair: -1 * pair[1])
        for topic_id, weight in ranked:
            print(f"index: {topic_id}, score {weight}")
            total += weight
        return total

    first_bow = get_text_bow(text1)
    second_bow = get_text_bow(text2)

    first_total = _print_and_sum(first_bow)
    print("_________________________________________")
    second_total = _print_and_sum(second_bow)
    return (first_total, second_total)
        
def jensen_shannon_v(p, q):
    """Jensen-Shannon distance between two 1-D weight vectors.

    Both vectors are reshaped into columns, their midpoint ``m`` is formed,
    and the square root of the mean of ``entropy(p, m)`` and
    ``entropy(q, m)`` is returned.

    Args:
        p, q: NumPy arrays of equal length.

    Returns:
        NumPy array with a single element (shape ``(1,)`` for 1-D inputs).
    """
    p_col = p[np.newaxis, :].T
    q_col = q[np.newaxis, :].T
    midpoint = 0.5 * (p_col + q_col)
    divergence = 0.5 * (entropy(p_col, midpoint) + entropy(q_col, midpoint))
    return np.sqrt(divergence)

def title_body_sim(text1, text2, n_topics):
    """Compare two texts by the distance between their LDA topic vectors.

    Each text is converted to a bag of words, run through ``lda_model`` and
    expanded into a dense vector of length ``n_topics``; topics the model
    does not report keep weight 0. The two vectors are then compared with
    ``jensen_shannon_v``.

    Args:
        text1: first text (e.g. a question title).
        text2: second text (e.g. a question body).
        n_topics: length of the dense topic vectors; must exceed every topic
            index the model returns.

    Returns:
        Whatever ``jensen_shannon_v`` returns for the two topic vectors.
    """
    def _topic_vector(bow):
        # Scatter (topic index, score) pairs into a dense vector. The order
        # of the pairs is irrelevant, so no sorting is needed.
        dense = np.zeros(n_topics)
        for topic_id, weight in lda_model[bow]:
            dense[topic_id] = weight
        return dense

    bow1 = get_text_bow(text1)
    bow2 = get_text_bow(text2)
    p = _topic_vector(bow1)
    q = _topic_vector(bow2)

    return jensen_shannon_v(p, q)

def distr(arr):
    """Build and print a linear system derived from ``arr``.

    For ``n = len(arr)`` the ``n x n`` matrix has a first row of ones and,
    for every ``i`` in ``0..n-2``, a row with ``arr[i + 1]`` in column ``i``
    and ``-arr[i]`` in column ``i + 1``. The right-hand side is the vector
    ``[1, 0, ..., 0]``. Both are printed; nothing is returned (solving the
    system is currently disabled).
    """
    size = len(arr)
    mat = np.zeros((size, size))
    mat[0, :] = 1.0
    for row in range(1, size):
        mat[row, row - 1] = arr[row]
        mat[row, row] = -arr[row - 1]
    vec = np.zeros(size)
    vec[0] = 1.0
    print(mat)
    print(vec)
    #return np.linalg.solve(mat, vec)

In [39]:
# Load the trained artifacts used by the helper functions: the LDA model, the
# bigram/trigram phrase models, the token dictionary and the spaCy pipeline.
# The paths are passed directly: gensim.test.utils.datapath() resolves names
# relative to gensim's bundled test data and was a no-op here only because
# the paths are absolute.
# NOTE: gensim's load() unpickles the files, so only load trusted artifacts.
lda_model = gensim.models.LdaModel.load(base_model_lda + "model_semi_final")
bigram_mod = gensim.models.phrases.Phraser.load(base_model + "ngrams/bigram_mod")
trigram_mod = gensim.models.phrases.Phraser.load(base_model + "ngrams/trigram_mod")
dictionary = gensim.corpora.Dictionary.load(base_model_lda + "model_semi_final.id2word")
# The parser and NER components are disabled; lemmatization() reads only
# token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [40]:
# Sample question title; text2 (defined next) holds the matching question
# body as raw HTML.
text1 = """Using index from a list to get another value/element in another list"""

text2 = """                   <p>I have project wherein I have to get the index of certain element in a list, then use that index to get another value in another list. </p>

<p>For example,</p>

<pre class="lang-py prettyprint prettyprinted" style=""><code><span class="pln">j_set </span><span class="pun">=</span><span class="pln"> </span><span class="pun">[</span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="lit">2</span><span class="pun">,</span><span class="pln"> </span><span class="lit">3</span><span class="pun">,</span><span class="pln"> </span><span class="lit">4</span><span class="pun">,</span><span class="pln"> </span><span class="lit">5</span><span class="pun">,</span><span class="pln"> </span><span class="lit">6</span><span class="pun">,</span><span class="pln"> </span><span class="lit">7</span><span class="pun">,</span><span class="pln"> </span><span class="lit">8</span><span class="pun">,</span><span class="pln"> </span><span class="lit">9</span><span class="pun">,</span><span class="pln"> </span><span class="lit">10</span><span class="pun">]</span><span class="pln">
on_going </span><span class="pun">=</span><span class="pln"> </span><span class="pun">[</span><span class="lit">1</span><span class="pun">]</span><span class="pln">
e_list </span><span class="pun">=</span><span class="pln"> </span><span class="pun">[[],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">1</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">1</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">2</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">3</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">3</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">5</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">4</span><span class="pun">,</span><span class="pln"> </span><span class="lit">7</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">6</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">8</span><span class="pun">,</span><span class="pln"> </span><span class="lit">9</span><span class="pun">],</span><span class="pln"> </span><span class="pun">[</span><span class="lit">10</span><span class="pun">]]</span></code></pre>

<p>So far, the code looks like this:</p>

<pre class="lang-py prettyprint prettyprinted" style=""><code><span class="kwd">if</span><span class="pln"> isinstance</span><span class="pun">(</span><span class="pln">on_going</span><span class="pun">,</span><span class="pln"> int</span><span class="pun">):</span><span class="pln">
    on_going </span><span class="pun">=</span><span class="pln"> </span><span class="pun">[</span><span class="pln">on_going</span><span class="pun">]</span><span class="pln">
idx </span><span class="pun">=</span><span class="pln"> </span><span class="pun">[</span><span class="pln">y </span><span class="kwd">for</span><span class="pln"> y</span><span class="pun">,</span><span class="pln"> x </span><span class="kwd">in</span><span class="pln"> enumerate</span><span class="pun">(</span><span class="pln">e_list</span><span class="pun">)</span><span class="pln"> </span><span class="kwd">if</span><span class="pln"> x </span><span class="kwd">in</span><span class="pln"> on_going</span><span class="pun">]</span><span class="pln"> </span><span class="com"># code to get index in e_list</span><span class="pln">
</span><span class="kwd">print</span><span class="pun">(</span><span class="pln">idx</span><span class="pun">)</span><span class="pln">

</span><span class="kwd">for</span><span class="pln"> i </span><span class="kwd">in</span><span class="pln"> idx</span><span class="pun">:</span><span class="pln">
    q_active </span><span class="pun">=</span><span class="pln"> j_set</span><span class="pun">.</span><span class="pln">append</span><span class="pun">(</span><span class="pln">i</span><span class="pun">)</span><span class="pln">
    </span><span class="kwd">print</span><span class="pun">(</span><span class="pln">q_active</span><span class="pun">)</span></code></pre>

<p>The objective is to get the corresponding <code>index</code> of value/element in <code>on_going</code> from <code>e_list</code>. Then, use that index to get corresponding activity from <code>j_set</code> and store in <code>q_active</code>.</p>

<p>Expected output is: <code>q_active = [2, 3]</code> from the example shown above.</p>

<p>The problem is, with the code above, I am getting an output for storing values in q_active as:</p>

<pre class="lang-py prettyprint prettyprinted" style=""><code><span class="pun">[</span><span class="lit">1</span><span class="pun">,</span><span class="pln"> </span><span class="lit">2</span><span class="pun">]</span><span class="pln">
</span><span class="kwd">None</span><span class="pln">
</span><span class="kwd">None</span></code></pre>

<p>Any help would be appreciated! Thanks!</p>
    </div>"""

In [62]:
# Print the per-topic scores of the title and of the body, followed by the
# pair of summed scores returned by test_texts.
print(test_texts(text1, text2))

index: 123, score 0.5008334517478943
index: 120, score 0.25083300471305847
_________________________________________
index: 33, score 0.16809387505054474
index: 186, score 0.11650429666042328
index: 120, score 0.11124513298273087
index: 30, score 0.08400005102157593
index: 240, score 0.075936459004879
index: 123, score 0.07469457387924194
index: 130, score 0.04107626900076866
index: 276, score 0.040155280381441116
index: 187, score 0.03757336735725403
index: 272, score 0.03751949965953827
index: 125, score 0.03716368228197098
index: 62, score 0.03716365620493889
index: 0, score 0.03716309741139412
index: 53, score 0.037084124982357025
index: 194, score 0.02943793497979641
(0.7516664564609528, 0.9648113008588552)


In [63]:
# Distance between the title's and the body's topic vectors, as computed by
# jensen_shannon_v over lda_model.num_topics topics.
title_body_sim(text1, text2, lda_model.num_topics)

array([0.66803593])