In [None]:
import logging
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Fix NumPy's global random state so repeated runs start from the same seed.
np.random.seed(2020)

# Fetch the WordNet corpus used by WordNetLemmatizer, then build the
# English Snowball stemmer shared by the preprocessing helpers.
nltk.download('wordnet')
stemmer = SnowballStemmer(language='english')

# Emit INFO-level log records with a timestamp and severity prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# the cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
def clear_text(text):
    """Strip code snippets and HTML tags from a post.

    Everything between ``<code>`` and ``</code>`` (newlines included) is
    removed first; after that every remaining ``<...>`` tag is deleted while
    the text between tags is kept.

    Args:
        text: Raw HTML of a post. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned text as a ``str``.
    """
    # Guard against missing cells when this is mapped over a DataFrame column.
    if not isinstance(text, str):
        return ''
    # Raw string avoids the invalid "\/" escape; DOTALL lets "." span newlines
    # so multi-line code snippets are removed in one non-greedy match.
    text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
    # Remove any remaining opening or closing tag.
    return re.sub(r'<[^>]+>', '', text)

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is lemmatized as a verb (``pos='v'``) with a
    ``WordNetLemmatizer`` and the lemma is then passed to the module-level
    ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn a raw post into a list of normalised tokens.

    The text is cleaned with ``clear_text`` and tokenised with gensim's
    ``simple_preprocess``. Tokens that are gensim stop words, or that have
    three or fewer characters, are dropped; each remaining token is passed
    through ``lemmatize_stemming``.

    Returns:
        A list of processed tokens in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(clear_text(text))
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in stop_words
    ]

In [0]:
# Read the posts dataset from the mounted Drive folder.
data = pd.read_csv(
    '/content/drive/My Drive/Дипломная работа/dataset/large_data.csv',
    low_memory=True,
)
# Keep only the first half of the rows.
data = data.iloc[:len(data) // 2]
data.head(3)

Unnamed: 0,id_post,post_tags,post_title,post_body
0,48153978,neo4j|cypher|liquigraph,Neo4j Cypher LiquiGraph - make migration scrip...,<p>I have created the following LiquiGraph mig...
1,48182021,android|flashlight,flash light wont turn on / camera error,<p>hi i was looking around flashlight app sour...
2,48183972,javascript|jquery|jquery-ui-autocomplete,Autocomplete function fires before minLength v...,<p>I'm writing autocomplete function that shou...


In [0]:
# Sanity check: show one post body before and after preprocessing.
doc_sample = data.loc[3, 'post_body']

print('original document: ')
# str.split already returns a list, so no element-by-element copy is needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['<p>After', 'mounting', 'the', 'database', 'I', 'tried', 'executing', 'these', 'command-', 'alter', 'database', 'open;', 'emanating', 'from', 'a', 'shutdown', 'initialization', 'error', 'but', 'my', 'oracle', 'kept', 'returning', 'an', 'ORA-00600:', 'internal', 'error', 'code,', 'arguments', '[dbkif_find_next_record_1],', '[],', '[],', '[].\nPlease,', 'how', 'do', 'I', 'proceed', 'from', 'here?</p>']


 tokenized and lemmatized document: 
['mount', 'databas', 'tri', 'execut', 'command', 'alter', 'databas', 'open', 'eman', 'shutdown', 'initi', 'error', 'oracl', 'keep', 'return', 'intern', 'error', 'code', 'argument', 'proceed']


In [0]:
# Replace the raw text of both columns with their token lists.
# NOTE(review): this overwrites the columns in place, so the cell is meant to
# be run once per freshly loaded `data`.
for text_column in ('post_body', 'post_title'):
    data[text_column] = data[text_column].map(preprocess)

In [0]:
# Write the preprocessed frame to Drive, leaving out the index column.
data.to_csv(
    path_or_buf='/content/drive/My Drive/Дипломная работа/dd.csv',
    index=False,
)

In [0]:
# Stack the tokenised bodies and titles into one Series of documents with a
# fresh 0..n-1 index. pd.concat is used because Series.append was deprecated
# in pandas 1.4 and removed in pandas 2.0.
processed_docs = pd.concat(
    [data['post_body'], data['post_title']],
    ignore_index=True,
)
len(processed_docs)

In [0]:
# Build the vocabulary from every tokenised document, then prune it with the
# thresholds below (see gensim's Dictionary.filter_extremes for their exact
# meaning).
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=50, no_above=0.6, keep_n=300000)

# Convert each document to its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [0]:
# Train a 200-topic LDA model on the TF-IDF corpus with two passes.
# LdaModel does not accept a `workers` argument (passing it raises TypeError);
# LdaMulticore is the parallel implementation that takes `workers`.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=200,
    id2word=dictionary,
    passes=2,
    workers=2,
)

In [0]:
# Print five of the model's topics, each with its id and term summary.
topic_line = 'Topic: {} Word: {}'
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=5):
    print(topic_line.format(topic_id, topic_terms))

In [0]:
# Persist the trained model to Drive.
# The path is passed directly: gensim.test.utils.datapath is a helper for
# locating gensim's bundled test data and only returned this path unchanged
# because it is absolute.
temp_file = "/content/drive/My Drive/Дипломная работа/models/large_data/model"
lda_model_tfidf.save(temp_file)

In [0]:
# Test accuracy