In [1]:
import re
import pandas as pd
from os import listdir
from os.path import isfile, join

In [2]:
def print_dataframe(df):
    """Return a styled view of ``df`` with left-aligned cells and headers.

    Despite the name nothing is printed: the object produced by ``df.style``
    is returned so that the notebook can render it as the cell output.

    Parameters
    ----------
    df : object exposing a ``style`` accessor (a pandas DataFrame in this notebook)

    Returns
    -------
    The styler after ``set_properties`` and ``set_table_styles`` were applied.
    """
    # Rule applied to the header cells (<th>) of the rendered table.
    header_rule = {'selector': 'th', 'props': [('text-align', 'left')]}

    styler = df.style.set_properties(**{'text-align': 'left'})
    styler = styler.set_table_styles([header_rule])
    return styler

def get_txts(path):
    """Collect the files stored exactly two directory levels below ``path``.

    The layout walked is ``path/<dir>/<dir>/<file>`` (the two levels are
    presumably year and month folders -- confirm against the data on disk).
    Files found directly under ``path`` or under a first-level directory are
    ignored, as are directories found at the third level.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Returns
    -------
    list of str
        Paths built with ``os.path.join``, ordered by first-level directory,
        then second-level directory, then file name.
    """
    def _entries(parent, want_file):
        # os.listdir returns names in arbitrary order; sorting makes the
        # result reproducible across runs and machines, which matters because
        # other cells index the per-file results by position.
        return [join(parent, name)
                for name in sorted(listdir(parent))
                if isfile(join(parent, name)) == want_file]

    txts = []
    for year in _entries(path, False):
        for month in _entries(year, False):
            txts.extend(_entries(month, True))
    return txts

In [3]:
# Roots of the two text corpora. Judging by the folder names they hold the
# same documents with different line-break handling (space vs. "\n") --
# TODO confirm.
dodfs_space_dir, dodfs_n_dir = "./dodfs_txt_espaco", "./dodfs_txt_barra_n"

# File lists gathered from each corpus root.
dodfs_space_files, dodfs_n_files = get_txts(dodfs_space_dir), get_txts(dodfs_n_dir)

# Results folder; not referenced by the cells visible in this notebook section.
output = "./results"

In [4]:
class Regex:
    """Base helper that runs regular expressions over one text.

    Subclasses are expected to fill ``_raw_acts``, ``_acts`` and ``_columns``
    and then call ``_build_dataframe`` to populate ``data_frame``.
    """

    def __init__(self, text):
        """Store ``text`` and initialise the (empty) extraction state."""
        self._text = text
        self._raw_acts = {}
        self._acts = []
        self._columns = []
        self.data_frame = pd.DataFrame()

    def find_all(self, rule, flag=0):
        """Return ``re.findall`` results for ``rule`` over the stored text.

        ``flag`` is forwarded as the regex flags (``0`` means no flags).
        """
        pattern = re.compile(rule, flag)
        return pattern.findall(self._text)

    def find_in_act(self, rule, act):
        """Search ``act`` for ``rule``.

        Returns the tuple of captured groups of the first match, or the
        string ``"nan"`` when the pattern does not match.
        """
        found = re.search(rule, act)
        return found.groups() if found else "nan"

    def _build_dataframe(self):
        """Build a DataFrame from ``_acts`` using ``_columns`` as its columns.

        Returns an empty DataFrame when no acts were collected.
        """
        if not self._acts:
            return pd.DataFrame()
        return pd.DataFrame(self._acts, columns=self._columns)

# Exoneração

In [30]:
# Exoneracao class: extracts exoneration (dismissal) acts for commissioned positions
class Exoneracao(Regex):
    """Extract exoneration ("EXONERAR") acts from a text and tabulate their fields.

    On construction the text is scanned for every span that starts with the
    literal word ``EXONERAR`` and runs up to the next period. Each span is
    then searched with the per-field patterns in ``self.rules`` and the
    results are gathered in ``self.data_frame`` (one row per act, columns as
    listed in ``self._columns``). A field that cannot be extracted holds the
    string ``"nan"``.

    Because an act ends at the first period, a period inside the act text
    (for example in an abbreviation) truncates it.
    """

    def __init__(self, text):
        """Run the whole extraction pipeline over ``text``."""
        super().__init__(text)
        self._columns = ['nome','matricula','simbolo','cargo_comissao','lotacao','orgao','vigencia','pedido',
                         'cargo_efetivo','siape','motivo']
        # One pattern per column; each non-empty pattern has exactly one
        # capturing group holding the field value. Empty patterns are fields
        # with no extraction rule yet: they always yield "nan".
        # Inside a character class "|" is a literal pipe, not alternation,
        # so the classes below list only the characters actually intended.
        self.rules = {
            "nome": r"([A-ZÀ-Ž\s]+[A-ZÀ-Ž])",
            "matricula": r"matr[íi]cula\s?n?o?\s([\s\S]*?)[,\s]",
            "simbolo": r"[Ss][íi]mbolo\s?n?o?\s([\s\S]*?)[,\s]",
            "cargo_comissao": "",
            "lotacao": "",
            "orgao": "",
            "vigencia": "",
            "pedido": r"(a pedido)",
            "cargo_efetivo": "",
            "siape": r"[Ss][Ii][Aa][Pp][Ee]\s[Nn]*[oO]*\s?([\s\S]*?)[, .]",
            "motivo": ""
        }
        self._raw_acts = self._extract_instances()
        self._acts = self._acts_props()
        self.data_frame = self._build_dataframe()

    def _act_props(self, act_raw):
        """Return a ``{field: value}`` dict for one raw act.

        A field is ``"nan"`` when its rule is empty, when the rule does not
        match, or when the match does not yield exactly one captured group.
        """
        act = {}
        for key, rule in self.rules.items():
            value = "nan"
            if rule:
                groups = self.find_in_act(rule, act_raw)
                # find_in_act returns the string "nan" (not a tuple) when
                # nothing matched, so check the type before unpacking.
                if isinstance(groups, tuple) and len(groups) == 1:
                    value = groups[0]
            act[key] = value
        return act

    def _acts_props(self):
        """Return the field dicts for every raw act, in extraction order."""
        return [self._act_props(raw) for raw in self._raw_acts]

    def _extract_instances(self):
        """Return the text following each ``EXONERAR`` up to the next period.

        The keyword and the terminating period are not part of the returned
        strings.
        """
        # Raw string: "\." in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        rule = r"(EXONERAR)([\s\S]*?)\."
        return [body for _keyword, body in self.find_all(rule)]

In [31]:
# Run the Exoneracao extraction over every file of the "\n" corpus.
res_dfs = []  # DataFrames of the files in which at least one act was found
l_ret = []    # one Exoneracao instance per file, in the order of dodfs_n_files
for txt in dodfs_n_files:
    # The context manager closes the handle right away instead of leaving
    # one open file per document until garbage collection.
    with open(txt, "r") as txt_file:
        txt_str = txt_file.read()
    ret = Exoneracao(txt_str)
    l_ret.append(ret)
    if not ret.data_frame.empty:
        res_dfs.append(ret.data_frame)

# pd.concat raises ValueError when given an empty list, so fall back to an
# empty DataFrame when no file produced any act.
rets_final = (pd.concat(res_dfs, ignore_index=True)
              if res_dfs else pd.DataFrame())


In [32]:
# Spot check: show the raw text of the act at index 10 extracted from the
# file at index 13 of l_ret. Both indices are hard-coded, so this raises
# IndexError when fewer files or acts are available.
l_ret[13]._raw_acts[10]

' EDIRANI SANTOS ARAUJO do Cargo em Comissao, Simbolo DFA-10, de Assessor\nTecnico, da Agencia de Atendimento ao Trabalhador do Guara, da Diretoria de Acoes para o Trabalhador,\nda Coordenacao de Acoes para o Trabalhador e o Empregador, da Subsecretaria de Atendimento ao\nTrabalhador e Empregador, da Secretaria Adjunta do Trabalho, da Secretaria de Estado de Trabalho do\nDistrito Federal'

In [33]:
# Spot check: show the extracted field dict of the act at index 13 from the
# first file in l_ret. The indices are hard-coded, so this raises IndexError
# when that file yielded fewer acts.
l_ret[0]._acts[13]

{'nome': ' ALVARO GOMES DA SILVA JUNIOR',
 'matricula': 'nan',
 'simbolo': 'CNE-',
 'cargo_comissao': 'nan',
 'lotacao': 'nan',
 'orgao': 'nan',
 'vigencia': 'nan',
 'pedido': 'a pedido',
 'cargo_efetivo': 'nan',
 'siape': 'nan',
 'motivo': 'nan'}

In [34]:
def _build_dataframe(_acts, _columns):
    if len(_acts) > 0:
        df = pd.DataFrame(_acts)
        df.columns = _columns
        return df
    return pd.DataFrame()
# Display the table rebuilt from `ret`, i.e. the last Exoneracao instance
# created by the processing loop.
_build_dataframe(ret._acts, ret._columns)

In [35]:
# Display the combined table of acts gathered from all processed files.
rets_final

Unnamed: 0,nome,matricula,simbolo,cargo_comissao,lotacao,orgao,vigencia,pedido,cargo_efetivo,siape,motivo
0,MATHEUS DE ARAUJO SIQUEIRA,,DFG-14,,,,,,,,
1,MARCIO HUMBERTO DA SILVA ROCHA,,DFA-10,,,,,,,,
2,ADAILTON LINO DOS SANTOS,,DFG-14,,,,,,,,
3,BRUNA ESTRELA NUNES,,DFG-14,,,,,,,,
4,IVANUZA ARAUJO FERREIRA DE SOUZA,,DFG-14,,,,,,,,
5,RODRIGO NUNES SANTANA,,DFG-14,,,,,,,,
6,KATIA REGINA DA SILVA CABRAL,,DFA-10,,,,,,,,
7,PEDRO HENRIQUE MONTEIRO DE JESUS,,DFA-10,,,,,,,,
8,ROBSON ELIAS SOUSA FERRAZ,,DFA-08,,,,,,,,
9,GLAUCILENE ROSA SASSI,,DFG-14,,,,,a pedido,,,
