In [1]:
import nltk
import re
import numpy as np
import pandas as pd
from pprint import pprint
import json
import codecs
from bs4 import BeautifulSoup
import os

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', '&quot', 'datum', 'work', 'connect', 'code', 'iot', 'make', 'add', 'message', 'follow', 'solution', 'day'])

datafile = json.load(codecs.open('data.json', 'r', 'utf-8-sig'))

data = []
for item in datafile["items"]:
    data.append(item["body"])
    
#Remove Emoji
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)
    
data_notags = []
#Remove tags and break lines
for item in data:
    data_bl = item.replace('\n', ' ').replace('\r', '')
    data_nc = re.sub(r'<pre>.+?</pre>', '', data_bl)
    data_ne = deEmojify(data_nc)
    data_nt = BeautifulSoup(data_ne, "lxml").text

    data_notags.append(data_nt)

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data_notags))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_trigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below=2, no_above=1.0)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

mallet_path = 'C:/Users/ThIaG/PycharmProjects/lda3/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=12, id2word=id2word)


# Show Topics
pprint(ldamallet.show_topics(num_topics = 12, formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\n\n\nGrau de Coerência: ', coherence_ldamallet, '\n\n')

def create_file():
    path = "files/topico"
    ext = ".json"
    for i in range(0,12):
        os.remove(path + str(i) + ext)
        open(path + str(i) + ext, "x")
        

def save_data(index, topic_num, data):
    json_object = json.dumps(data["items"][index])
    with open("files/topico"+ str(topic_num) + ".json", "a") as outfile:
        outfile.write(json_object)


def format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data, alldata=datafile):
    # Init output
    sent_topics_df = pd.DataFrame()

    create_file()
    
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                save_data(i, topic_num, alldata)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data, alldata=datafile)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
topic_contribution = topic_contribution.rename_axis('Dominant_Topic').reset_index(name='percentage')

topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates()
topic_num_keywords.index = range(len(topic_num_keywords))

df_dominant_topics = pd.merge(topic_contribution, topic_num_keywords, how='inner', on='Dominant_Topic')
df_dominant_topics.head(12)

[(0,
  [('error', 0.07867998263135041),
   ('run', 0.06191923577941815),
   ('file', 0.056534954407294835),
   ('window', 0.04793747286148502),
   ('raspberry_pi', 0.028050369083803733),
   ('core', 0.026747720364741642),
   ('build', 0.024402952670429873),
   ('follow', 0.021623968736430743),
   ('version', 0.020495006513243597),
   ('script', 0.02040816326530612)]),
 (1,
  [('create', 0.055798288731112325),
   ('request', 0.051064991807755325),
   ('aw', 0.04687784452940105),
   ('thing', 0.044693245949390135),
   ('update', 0.04432914618605498),
   ('follow', 0.025122883670125613),
   ('http', 0.021481886036774075),
   ('post', 0.02075368651010377),
   ('button', 0.018478062989259057),
   ('status', 0.016111414527580557)]),
 (2,
  [('send', 0.14107731769879076),
   ('message', 0.059728838402345184),
   ('receive', 0.05954562110663247),
   ('topic', 0.05542323195309637),
   ('publish', 0.04534628068889703),
   ('event', 0.043697325027482595),
   ('broker', 0.03865884939538292),
   ('

FileNotFoundError: [WinError 2] O sistema não pode encontrar o arquivo especificado: 'files/topico12.json'