## Bibliotecas


In [71]:
from pymongo import MongoClient
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_documents
from sklearn.pipeline import Pipeline
import pyLDAvis.gensim
import time
import os
import random

from preprocessor import *

## Conectando com o MongoDB

In [3]:
client = MongoClient('localhost', 27017)
db = client["discursoDB"]
discursos = db["discursos"]

## Preprocessamento

In [88]:
start_time = time.time()

# all_discursos = discursos.find()
# discursos_list = random.sample(list(all_discursos), 5000)

discursos_list = discursos.find({"SiglaPartidoParlamentarNaData": {"$eq": "PT"}})

text = []

for disc in discursos_list:
    discurso_text = disc["Conteudo"]
    
    if discurso_text:
        text.append(discurso_text)

pipe = Pipeline( [
    ('cleaning', Cleaner()), 
    ('stopwords', StopWords(lang='portuguese', tokenize=True)),
    ('stemming', Stemmer(lang='portuguese', fit_reuse=True))])
pipe.fit(text)
res = pipe.transform(text)

# preproc = Preprocessor(max_word_lenght=2)
# preproc.fit(text)
# text = preproc.transform(text)        

elapsed_time = time.time() - start_time
print(time.strftime("Discursos preprocessados, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))


Discursos preprocessados, demorou 00:06:01:01


## Criação do modelo LDA com Mallet

In [89]:
start_time = time.time()
# Create a corpus from a list of texts
data = [a.split() for a in res]

dictionary = Dictionary(data)

corpus = [dictionary.doc2bow(t) for t in data]

# os.environ['MALLET_HOME'] = 'X:\\Programs\\mallet\\mallet-2.0.8\\'
mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'

# Train the model on the corpus.
lda = LdaMallet(mallet_path, corpus, id2word=dictionary, num_topics=10)
# lda = LdaModel(corpus, id2word=dictionary, num_topics=10, alpha='auto', eval_every=5, chunksize=10, passes=10, decay=0.9)

elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))

Lda model criado, demorou 00:15:37:01


## Predictions nos stemmed

In [97]:
for index, topic in lda.show_topics(formatted=False, num_words= 15):
    print('Topic: {} \nWords: {}'.format(index, [pipe.predict([w[0]])[0][0] for w in topic]))

Topic: 0 
Words: ['senador', 'nao', 'exa', 'presidente', 'blocopt', 'todos', 'parte', 'porque', 'eduardo', 'ser', 'fazer', 'governo', 'quero', 'dizer', 'pode']
Topic: 1 
Words: ['todos', 'brasil', 'brasileiro', 'povo', 'presidente', 'vida', 'historia', 'pais', 'grande', 'anos', 'humanos', 'nao', 'mundo', 'dia', 'ser']
Topic: 2 
Words: ['economia', 'governo', 'nao', 'pais', 'brasil', 'brasileiro', 'empresas', 'bilhoes', 'banco', 'aumento', 'politica', 'investimentos', 'ano', 'emprego', 'medida']
Topic: 3 
Words: ['publica', 'federal', 'presidente', 'lei', 'pode', 'nao', 'casa', 'ser', 'nacional', 'estado', 'senado', 'justica', 'comissao', 'projeto', 'constituicao']
Topic: 4 
Words: ['mulheres', 'educacao', 'todos', 'direitos', 'politica', 'trabalho', 'pais', 'pessoas', 'escola', 'sociedade', 'nacional', 'universidade', 'brasileiro', 'violencia', 'tambem']
Topic: 5 
Words: ['trabalho', 'nao', 'senador', 'paim', 'presidente', 'aqui', 'todos', 'paulo', 'dizer', 'tambem', 'quero', 'grande',

In [83]:
for index, topic in lda.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))

Topic: 0 
Words: ['estad', 'govern', 'municipi', 'federal', 'tod', 'regia', 'president', 'tamb', 'rio', 'cidad', 'amazon', 'sao', 'senador', 'grand', 'recurs', 'mat', 'projet', 'dess', 'nort', 'faz']
Topic: 1 
Words: ['nao', 'president', 'public', 'pod', 'govern', 'ministr', 'federal', 'polic', 'fat', 'ser', 'contr', 'crim', 'justic', 'sao', 'cpi', 'dev', 'diz', 'dess', 'ministeri', 'denunc']
Topic: 2 
Words: ['nao', 'polit', 'part', 'pod', 'ser', 'pais', 'govern', 'brasileir', 'tod', 'reform', 'president', 'dev', 'moment', 'sociedad', 'republ', 'estad', 'nacional', 'outr', 'debat', 'process']
Topic: 3 
Words: ['nao', 'aqu', 'senador', 'quer', 'porqu', 'faz', 'president', 'diz', 'fal', 'bloc', 'vai', 'enta', 'tod', 'brasil', 'tamb', 'trabalh', 'hoj', 'pod', 'gent', 'agor']
Topic: 4 
Words: ['govern', 'nao', 'econom', 'banc', 'ano', 'bilho', 'brasileir', 'public', 'aument', 'recurs', 'president', 'trabalh', 'empreg', 'pag', 'brasil', 'med', 'pod', 'salari', 'pais', 'minim']
Topic: 5 
Wo

## Listar tópicos

In [98]:
pyLDAvis.enable_notebook()
# vis

In [99]:
model = malletmodel2ldamodel(lda)
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)


  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Visualizar tópicos

In [101]:
vis
pyLDAvis.save_html(vis, 'lda-PT.html')

In [104]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
for t in range(lda.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(lda.show_topic(t, 200)))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

AttributeError: 'list' object has no attribute 'items'

<Figure size 432x288 with 0 Axes>