In [1]:
import sys
import os
sys.path.append('../scripts')
import pickle
import pandas as pd
import numpy as np
from gensim.models.wrappers import LdaMallet, ldamallet
from gensim.corpora.dictionary import Dictionary

# Raise pandas' display limits so long tables and long text cells are shown untruncated.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 1000),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option_name, _option_value)

In [7]:
def clean_party(party):
    """Map a party label onto the canonical acronym used in this analysis.

    A label that belongs to one of the alias groups below is replaced by that
    group's canonical acronym (e.g. both 'DEM' and 'DEMOCRATAS' become 'PFL').
    Any other value is returned unchanged. Matching is exact and case-sensitive.
    """
    # (aliases, canonical acronym) pairs. No canonical acronym is itself an
    # alias of a later group, so the first match is final.
    alias_groups = (
        (('DEM', 'DEMOCRATAS'), 'PFL'),
        (('PL', 'PRONA'), 'PR'),
        (('PDC', 'PST', 'PTR', 'PRB', 'PPR', 'PPB', 'PROGRESSISTAS'), 'PP'),
        (('PC DO B',), 'PCdoB'),
        (('PODEMOS',), 'PODE'),
        (('S/PARTIDO', 'S/Partido'), 'S-PARTIDO'),
    )

    for aliases, canonical in alias_groups:
        if party in aliases:
            return canonical

    return party

In [8]:
ids = []         # not populated anywhere in this cell
empty_ids = []   # positions (in all_discs) of speeches without a party field
unique_pts = {}  # normalized party acronym -> number of speeches
disc_by_pt = {}  # normalized party acronym -> positions (in all_discs) of its speeches

# The context manager guarantees the file handle is closed once the data is loaded.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/discursos_raw_all.pickle", "rb") as discs_file:
    all_discs = pickle.load(discs_file)

for i, disc in enumerate(all_discs):

    d = disc["IdentificacaoPronunciamento"]

    if 'SiglaPartidoParlamentarNaData' not in d:
        # No party recorded for this speech: remember its position and move on.
        empty_ids.append(i)
        continue

    # Strip surrounding whitespace before normalizing, so padded labels still match.
    party = clean_party(d["SiglaPartidoParlamentarNaData"].strip())

    unique_pts[party] = unique_pts.get(party, 0) + 1
    disc_by_pt.setdefault(party, []).append(i)
            

In [9]:
# Number of speeches indexed under the normalized party label "PT".
len(disc_by_pt["PT"])

14556

In [10]:
# Rank the (party, speech count) pairs from most to fewest speeches, then show the ranking as a table.
parties = sorted(
    unique_pts.items(),
    key=lambda party_count: party_count[1],
    reverse=True,
)
pd.DataFrame(parties, columns=["Partido", "nº discursos"])

Unnamed: 0,Partido,nº discursos
0,PSDB,15227
1,PMDB,15094
2,PT,14556
3,PFL,10424
4,PDT,3690
5,PP,3229
6,PSB,3026
7,PTB,2653
8,PR,1587
9,PCdoB,1106


In [13]:
# Preview the 12 highest-ranked (party, speech count) pairs; the same slice drives the per-party report loop.
parties[0:12]

[('PSDB', 15227),
 ('PMDB', 15094),
 ('PT', 14556),
 ('PFL', 10424),
 ('PDT', 3690),
 ('PP', 3229),
 ('PSB', 3026),
 ('PTB', 2653),
 ('PR', 1587),
 ('PCdoB', 1106),
 ('PPS', 855),
 ('PSOL', 755)]

# Tópicos predominantes nos discursos dos 12 maiores partidos

In [14]:
def get_corpus_dict():
    """Load the pickled corpus and dictionary from ``../../data``.

    Returns:
        tuple: ``(corpus, dictionary)`` exactly as unpickled from
        ``corpus.pickle`` and ``dictionary.pickle``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # Context managers close both file handles as soon as each object is loaded.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open('../../data/corpus.pickle', 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    with open('../../data/dictionary.pickle', 'rb') as dictionary_file:
        dictionary = pickle.load(dictionary_file)

    return corpus, dictionary

def get_topics(lda, num_topics, num_words=10):
    """Return the leading terms of each topic reported by ``lda.show_topics``.

    Args:
        lda: Object exposing ``show_topics(formatted=..., num_words=..., num_topics=...)``
            that yields ``(topic_id, entries)`` pairs.
        num_topics: Number of topics to request from the model.
        num_words: Number of entries to request per topic (defaults to 10).

    Returns:
        list[list]: For each topic, the first element of every entry
        (the term), in the order ``show_topics`` yields the topics.
    """
    shown_topics = lda.show_topics(
        formatted=False, num_words=num_words, num_topics=num_topics
    )
    # The topic id is discarded, so a caller that indexes the result by topic id
    # relies on show_topics yielding topics in id order.
    return [[entry[0] for entry in entries] for _topic_id, entries in shown_topics]

In [15]:
# Load the pickled corpus and dictionary once; later cells read `corpus` from here.
corpus, dictionary = get_corpus_dict()

In [16]:
# Number of topics requested when listing each topic's words.
# NOTE(review): presumably matches the 400-topic model file loaded later — confirm.
num_topics = 400

In [28]:
# The model, its topic words and the per-speech inference do not depend on the
# party, so they are computed once instead of once per party.
model_path = "../../models/lda_model_400_stem_it_1000.mdl"

lda_mallet = LdaMallet.load(model_path)
lda = ldamallet.malletmodel2ldamodel(lda_mallet)

# topics[k] is the word list at position k of show_topics' output.
# NOTE(review): indexing it by topic id below assumes position == topic id — confirm.
topics = get_topics(lda, num_topics)

# Sets make the per-speech membership test O(1) instead of a scan of each party's list.
doc_ids_by_party = {party: set(disc_by_pt[party]) for party, _ in parties[0:12]}

# party -> {topic id -> weight summed over that party's speeches}
weights_by_party = {party: {} for party in doc_ids_by_party}

# NOTE(review): assumes position i in `corpus` is the same speech as position i
# in `all_discs` (the positions stored in disc_by_pt) — confirm.
for i, bow in enumerate(corpus):
    owners = [party for party, doc_ids in doc_ids_by_party.items() if i in doc_ids]
    if not owners:
        continue  # speech is not from a selected party: skip the inference entirely

    # Only the 10 highest-weighted topics of each speech contribute to the totals.
    sorted_doc = sorted(lda[bow], key=lambda x: x[1], reverse=True)[0:10]
    for party in owners:
        party_weights = weights_by_party[party]
        for topic, weight in sorted_doc:
            party_weights[topic] = party_weights.get(topic, 0) + weight

for selected_party, _ in parties[0:12]:

    topics_map = weights_by_party[selected_party]

    # Rows of [topic id, total weight, top words], heaviest topic first.
    sorted_topics_map = [
        [topic, weight, ", ".join(topics[topic])]
        for topic, weight in sorted(topics_map.items(), key=lambda x: x[1], reverse=True)
    ]

    # Mode 'w' so re-running the cell replaces the report rather than appending
    # a second header and a duplicate set of rows to the existing file.
    with open('party_topics_{}.txt'.format(selected_party), 'w') as f:
        f.write('\t'.join(['i', 'topic', 'value', 'words']) + '\n')
        for i, row in enumerate(sorted_topics_map):
            f.write('{}\t{}\t{}\t{}\n'.format(
                i,
                row[0],
                round(row[1], 2),
                row[2]
            ))

In [29]:
# sorted_topics_map is reassigned for every party in the report loop, so this
# table shows only the last party that loop processed.
pd.DataFrame(sorted_topics_map, columns=["Tópico", "Peso total", "Palavras"])

Unnamed: 0,Tópico,Peso total,Palavras
0,333,0.106001,"maranhao, lobao, edison, senador, presidente, tambem, luis, outro, discurso, haver"
1,350,0.087214,"igreja, deus, catolico, papa, padre, religioso, cristao, bispo, jesus, pastor"
2,151,0.068951,"senador, casa, senado, amigo, trabalho, agradecer, mandato, obrigar, companheiro, querer"
3,93,0.064518,"querer, tambem, agradecer, presidente, registro, cumprimentar, importante, registrar, trabalho, hoje"
4,270,0.056704,"senador, helio, nobre, jose, presidente, querer, importante, entao, questao, casa"
5,132,0.048521,"voce, gente, falar, querer, entao, olhar, pessoa, achar, ficar, mundo"
6,153,0.044389,"senhor, senador, presidente, aspa, senado, abrir, figueiro, palavra, discurso, falar"
7,102,0.036593,"joao, senador, presidente, alberto, batista, motta, souza, obrigar, pronunciar, tenorio"
8,286,0.024801,"filho, familia, vida, casa, irmao, amigo, neto, esposa, trabalhar, marisa"
9,51,0.023119,"senado, casa, federal, senador, mesa, plenario, diretora, gabinete, trabalho, presidencia"


In [70]:
# NOTE(review): looks like a leftover scratch check of str.join on the report's
# header fields (newline-joined here, whereas the report joins them with tabs);
# the execution count is out of sequence with the other cells — consider removing.
'\n'.join(['i', 'topic', 'value', 'words'])

'i\ntopic\nvalue\nwords'