In [2]:
import re
import pandas as pd
from os import listdir
from os.path import isfile, join

In [35]:
df = pd.DataFrame([["Aposentadorias", "'CONCEDER', 'APOSENTAR'", "Não"],
                  ["Reversões", "REVERTER A ATIVIDADE", "Não"],
                  ["Atos tonados sem efeitos (Aposentadoria)", "TORNAR SEM EFEITO", "Sim"]],
                  index = [4, 5, 6],
                  columns = ["Atos", "Regex", "Multiplos"])

style_df = pd.DataFrame(df).style.set_properties(**{'text-align': 'left'}).set_table_styles([ dict(selector='th', props=[('text-align','left')] ) ])
style_df

Unnamed: 0,Atos,Regex,Multiplos
4,Aposentadorias,"'CONCEDER', 'APOSENTAR'",Não
5,Reversões,REVERTER A ATIVIDADE,Não
6,Atos tonados sem efeitos (Aposentadoria),TORNAR SEM EFEITO,Sim


In [4]:
def print_dataframe(df):
    style_df = (df.style.set_properties(**{'text-align': 'left'})
                                        .set_table_styles([ dict(selector='th',
                                                                 props=[('text-align','left')])])
                   )
    return style_df

def get_txts(path):
    years = [join(path, x) for x in listdir(path) if not isfile(join(path, x))]
    txts = []
    for year in years:
        months = [join(year, x) for x in listdir(year) if not isfile(join(year, x))]
        for month in months:
            txts += [join(month, x) for x in listdir(month) if isfile(join(month, x))]
    return txts

In [5]:
dodfs_space_dir = "./dodfs_txt_espaco"
dodfs_space_files = get_txts(dodfs_space_dir)

dodfs_n_dir = "./dodfs_txt_barra_n"
dodfs_n_files = get_txts(dodfs_n_dir)

output = "./results"

In [48]:
class Regex:
    
    def __init__(self, text):
        self._text = text
        self._raw_acts = {}
        self._acts = []
        self._columns = []
        self.data_frame = pd.DataFrame()
    
    def find_all(self, rule, flag=0):
        return re.findall(rule, self._text, flags=flag)
    
    def find_in_act(self, rule, act):
        match = re.search(rule, act) 
        if match:
            return match.groups()
        return "nan"
    
    def _build_dataframe(self):
        if len(self._acts) > 0:
            df = pd.DataFrame(self._acts, columns=self._columns)
            #df.columns = self._columns
            return df
        return pd.DataFrame()

# Nomeação

In [66]:
# Classe Nomeação para cargos comissionados
class NomeacaoComissionados(Regex):
    
    def __init__(self,text):
        super().__init__(text)
        self._columns = ['nome','cargo_efetivo','matricula','siape','simbolo','cargo_comissao','lotacao','orgao']
        self.rules = {
            "nome": r"(^[A-ZÀ-Ž\s]+[A-ZÀ-Ž])",
            "cargo_efetivo": r"",
            "matricula": r"matr[í|i]cula\s?n?o?\s([\s\S]*?)[,|\s]",
            "siape": r"[S|s][I|i][A|a][P|p][E|e]\s[N|n]?[o|O]?\s([\s\S]*?)[,| | .]",
            "simbolo": r"[S|s][í|i]mbolo\s?n?o?\s([\s\S]*?)[,|\s]",
            "cargo_comissao": "",
            "lotacao": "",
            "orgao": ""
        }
        self._raw_acts = self._extract_instances()
        self._acts = self._acts_props()
        self.data_frame = self._build_dataframe()
        
        
    
    def _act_props(self, act_raw):
        act = {}
        for key in self.rules:
            try:
                act[key], = self.find_in_act(self.rules[key], act_raw)
            except:
                act[key] = "nan"
        return act
    
    def _acts_props(self):
        acts = []
        for raw in self._raw_acts:
            act = self._act_props(raw)
            acts.append(act)
        return acts        
        
    def _extract_instances(self):
        start = r"(NOMEAR)"
        body = r"([\s\S]*?)"
        end = "\."
        rule = start + body + end
        found = self.find_all(rule)
        results = []
        for instance in found:
            start, body = instance
            results.append(body)
            
        return results

In [67]:
res_dfs = []
l_ret = []
for txt in dodfs_n_files:
    txt_str = open(txt, "r").read()
    ret = NomeacaoComissionados(txt_str)
    l_ret.append(ret)
    if not ret.data_frame.empty:
        res_dfs.append(ret.data_frame)

rets_final = pd.concat([pd.DataFrame(df) for df in res_dfs],
                        ignore_index=True)
#print_dataframe(rets_final)


In [73]:
l_ret[0]._raw_acts[13]

' MARINA RODRIGUES DE FONTES para exercer o Cargo em Comissao, Simbolo D FA - 1 0 ,\nde Assessor Tecnico, da Coordenacao Executiva, da Administracao Regional do Park Way do Distrito\nFederal'

In [72]:
l_ret[0]._acts[13]

{'nome': ' MARINA RODRIGUES DE FONTES',
 'cargo_efetivo': 'nan',
 'matricula': 'nan',
 'siape': 'nan',
 'simbolo': 'D',
 'cargo_comissao': 'nan',
 'lotacao': 'nan',
 'orgao': 'nan'}

In [70]:
def _build_dataframe(_acts, _columns):
    if len(_acts) > 0:
        df = pd.DataFrame(_acts)
        df.columns = _columns
        return df
    return pd.DataFrame()
_build_dataframe(ret._acts, ret._columns)

Unnamed: 0,nome,cargo_efetivo,matricula,siape,simbolo,cargo_comissao,lotacao,orgao
0,,,,,RAMON SANTANA LOPES AZEVEDO,,,CNE-03


In [74]:
rets_final.head()

Unnamed: 0,nome,cargo_efetivo,matricula,siape,simbolo,cargo_comissao,lotacao,orgao
0,LUCENIR RODRIGUES,,,,CNE-07,,,
1,MATHEUS DE ARAUJO SIQUEIRA,,,,DFG-14,,,
2,ERIKA KARINE TELES LOPES DE SOUSA,,,,DFG-14,,,
3,MARCIO HUMBERTO DA SILVA ROCHA,,,,DFG-14,,,
4,FERNANDO NEPOMUCENO COELHO,,,,DFA-,,,
