In [35]:
import pandas as pd

In [36]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim import corpora, models
np.random.seed(2020)

In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\P4L\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
stemmer = SnowballStemmer('english')

In [39]:
def clear_text(text):
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    text = re.sub(r'(\<(/?[^>]+)>)', '', text)
    return text

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    text = clear_text(text)
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [40]:
data = pd.read_csv('../../dataset/optimal_data.csv')
data = data.head(10000)

In [41]:
tags_set = set()
def topic_count(text):
    ww = text.split('|')
    zz = list()
    for i in ww:
        zz.append(lemmatize_stemming(i))
    for i in zz:
        tags_set.add(i)
    return text
data['post_tags'].map(topic_count)

0                                         assembly|msp430
1                                                 unity3d
2        java|algorithm|sorting|data-structures|quicksort
3                                             python|kivy
4                        r|latex|r-markdown|knitr|tinytex
                              ...                        
9995    java|spring-mvc|reactive-programming|spring-we...
9996                 c#|asp.net|blazor|blazor-server-side
9997    javascript|android|cordova|ionic-framework|ionic4
9998                         css|angular|angular-material
9999    amazon-web-services|aws-glue|aws-glue-data-cat...
Name: post_tags, Length: 10000, dtype: object

In [42]:
tags_set

{'aem',
 'boost-process',
 'google-material-icon',
 'cordova',
 'normal',
 'type-paramet',
 'twilio-funct',
 'user32',
 'lora',
 'verif',
 'codepag',
 'webdriv',
 'authent',
 'file-read',
 'opencv-stitch',
 'variadic-funct',
 'log4js-nod',
 'macos-catalina',
 'offline-cach',
 'regex-group',
 'google-apis-explor',
 'rfe',
 'color',
 'samsung-s8',
 'nopcommerc',
 'androidx',
 'pusher',
 'ajaxcontroltoolkit',
 'errorbar',
 'ml.net',
 'union-typ',
 'api',
 'memori',
 'sql-loader',
 'fade',
 'ansibl',
 'ckeditor',
 'spring',
 'databrick',
 'tableview',
 'mapper',
 'filenam',
 'grafana',
 'project-reactor',
 'maco',
 'msix',
 'mine',
 'similar',
 'acumatica-kb',
 'netti',
 'memsql',
 'android-json',
 'python-multithread',
 'content-typ',
 'spss',
 'voiceov',
 'linear-regress',
 'stackdriv',
 'cygwin',
 'changelisten',
 'hot-reload',
 'angular-univers',
 'iana',
 'bayesian-network',
 'node-fetch',
 'uitabbar',
 'django-filt',
 'dfs',
 'react-script',
 'pymysql',
 'gridsom',
 'testflight',
 'x

In [43]:
len(tags_set)

6249

# Preprocessing test

In [44]:
doc_sample = data.loc[3, 'post_body']

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['<p>I', 'want', 'to', 'install', 'kivy', 'on', 'a', 'pc', 'that', "doesn't", 'have', 'an', 'internet', 'connection,', 'so', 'I', "can't", 'install', 'it', 'using', 'pip.', 'Can', 'someone', 'tell', 'me', 'how', 'to', 'install', 'kivy', 'offline?</p>']


 tokenized and lemmatized document: 
['want', 'instal', 'kivi', 'internet', 'connect', 'instal', 'tell', 'instal', 'kivi', 'offlin']


In [45]:
processed_docs = data['post_body'].map(preprocess)

In [46]:
processed_docs[12]

['function',
 'data',
 'firebas',
 'want',
 'data',
 'state',
 'get',
 'declar',
 'array',
 'push',
 'data',
 'setstat',
 'array',
 'state',
 'state',
 'render',
 'issu',
 'work',
 'perfect',
 'know',
 'state',
 'overr',
 'data',
 'item',
 'logger',
 'print',
 'ten',
 'render',
 'data',
 'state',
 'tell',
 'json',
 'valu',
 'type',
 'nsnumber',
 'convert',
 'nsstring',
 'avoid',
 'code',
 'snippet']

In [47]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [48]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [49]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [50]:
dictionary[12]

'click'

In [51]:
tfidf = models.TfidfModel(bow_corpus)

In [52]:
corpus_tfidf = tfidf[bow_corpus]

In [72]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=100, id2word=dictionary, passes=2, workers=4)

In [77]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.119*"airflow" + 0.087*"pod" + 0.080*"blank" + 0.055*"ingress" + 0.036*"escap" + 0.035*"cluster" + 0.027*"data" + 0.015*"html" + 0.008*"method" + 0.008*"password"
Topic: 1 Word: 0.038*"file" + 0.030*"build" + 0.028*"open" + 0.028*"upload" + 0.027*"result" + 0.024*"need" + 0.019*"read" + 0.018*"abl" + 0.018*"know" + 0.016*"perform"
Topic: 2 Word: 0.124*"output" + 0.054*"command" + 0.041*"understand" + 0.038*"look" + 0.036*"tell" + 0.030*"respons" + 0.027*"boolean" + 0.027*"drive" + 0.024*"clear" + 0.021*"error"
Topic: 3 Word: 0.117*"archiv" + 0.050*"disk" + 0.050*"temp" + 0.014*"store" + 0.009*"imag" + 0.005*"data" + 0.005*"command" + 0.005*"script" + 0.005*"file" + 0.005*"program"
Topic: 4 Word: 0.172*"statement" + 0.075*"locat" + 0.066*"center" + 0.051*"bearer" + 0.039*"definit" + 0.026*"evalu" + 0.021*"driver" + 0.019*"hardwar" + 0.019*"window" + 0.017*"optim"
Topic: 5 Word: 0.140*"static" + 0.139*"databas" + 0.081*"tabl" + 0.076*"curl" + 0.027*"destin" + 0.018*"measu

In [75]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=100, id2word=dictionary, passes=2, workers=2)

In [76]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.021*"work" + 0.017*"error" + 0.014*"command" + 0.014*"project" + 0.014*"tri" + 0.013*"follow" + 0.012*"instal" + 0.012*"code" + 0.011*"https" + 0.011*"like"
Topic: 1 
Words: 0.029*"tri" + 0.028*"error" + 0.021*"imag" + 0.019*"work" + 0.017*"file" + 0.015*"note" + 0.015*"select" + 0.012*"build" + 0.012*"version" + 0.012*"creat"
Topic: 2 
Words: 0.034*"grid" + 0.019*"file" + 0.015*"work" + 0.014*"tri" + 0.013*"set" + 0.012*"code" + 0.011*"data" + 0.010*"link" + 0.010*"cell" + 0.010*"follow"
Topic: 3 
Words: 0.020*"code" + 0.019*"tri" + 0.018*"request" + 0.017*"error" + 0.016*"work" + 0.015*"https" + 0.011*"follow" + 0.010*"azur" + 0.010*"need" + 0.009*"token"
Topic: 4 
Words: 0.058*"file" + 0.029*"info" + 0.021*"folder" + 0.021*"exampl" + 0.020*"download" + 0.017*"error" + 0.016*"directori" + 0.015*"path" + 0.014*"work" + 0.014*"tri"
Topic: 5 
Words: 0.052*"work" + 0.021*"error" + 0.019*"code" + 0.011*"request" + 0.011*"file" + 0.009*"tri" + 0.009*"follow" + 0.009*"pos

In [78]:
for index, score in sorted(lda_model[bow_corpus[123]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.3369101285934448	 
Topic: 0.021*"work" + 0.020*"tri" + 0.016*"code" + 0.014*"like" + 0.014*"function" + 0.012*"want" + 0.009*"script" + 0.009*"multipl" + 0.008*"help" + 0.008*"queri"

Score: 0.22533059120178223	 
Topic: 0.084*"python" + 0.042*"file" + 0.038*"line" + 0.022*"code" + 0.015*"user" + 0.015*"problem" + 0.014*"sheet" + 0.013*"packag" + 0.013*"tri" + 0.012*"tensorflow"

Score: 0.22460071742534637	 
Topic: 0.040*"valu" + 0.017*"want" + 0.017*"like" + 0.015*"load" + 0.014*"file" + 0.012*"document" + 0.011*"json" + 0.011*"config" + 0.011*"tri" + 0.010*"look"

Score: 0.13080081343650818	 
Topic: 0.033*"code" + 0.021*"work" + 0.020*"rout" + 0.017*"tri" + 0.017*"messag" + 0.015*"port" + 0.015*"send" + 0.015*"error" + 0.012*"follow" + 0.011*"data"

Score: 0.06751255691051483	 
Topic: 0.036*"notif" + 0.020*"work" + 0.019*"send" + 0.016*"push" + 0.015*"page" + 0.012*"type" + 0.011*"firebas" + 0.011*"build" + 0.011*"user" + 0.011*"time"


In [79]:
for index, score in sorted(lda_model_tfidf[bow_corpus[123]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.28146126866340637	 
Topic: 0.074*"script" + 0.053*"execut" + 0.050*"want" + 0.043*"generat" + 0.041*"command" + 0.036*"advanc" + 0.034*"attempt" + 0.030*"databas" + 0.027*"thank" + 0.027*"follow"

Score: 0.13415464758872986	 
Topic: 0.079*"select" + 0.074*"base" + 0.069*"exampl" + 0.047*"multipl" + 0.045*"instanc" + 0.044*"singl" + 0.043*"switch" + 0.043*"simpl" + 0.042*"bootstrap" + 0.038*"dynam"

Score: 0.10986842215061188	 
Topic: 0.043*"give" + 0.037*"cell" + 0.033*"answer" + 0.026*"come" + 0.026*"swift" + 0.025*"caus" + 0.024*"memori" + 0.024*"help" + 0.022*"push" + 0.022*"ask"

Score: 0.0893721729516983	 
Topic: 0.044*"class" + 0.032*"differ" + 0.030*"number" + 0.029*"return" + 0.029*"object" + 0.029*"project" + 0.027*"method" + 0.027*"depend" + 0.021*"config" + 0.021*"line"

Score: 0.07101408392190933	 
Topic: 0.048*"string" + 0.045*"element" + 0.029*"loop" + 0.028*"field" + 0.024*"content" + 0.021*"delet" + 0.020*"exampl" + 0.018*"look" + 0.016*"creat" + 0.016*"kind"
