![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)
# Classification of documents in official daily juridical processes of state of São Paulo: 
### Example of a official daily of 2 jun. 2020

<p align="center">
    <img src="assets/officialDailySP.png" width=900 height=700> 

    
    In this project we want to classify the type of a content of a document present in a official daily
</p>

<p>
    In simple words we want to classify the jucicial processes extract from official daily of period in 2015 to 2017, in sentences (processes which the judge finish the process) and documents which are not sentences.
</p>

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
### 1. Reading the data, library import, and seeying our data:
<!-- <blockquote>
csv, xlsx and json data formats
</blockquote> -->

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# sp_data is a pandas dataframe of our data
sp_data = pd.read_json('data/Amostra100KFiltrada.json', lines=True)

In [4]:
# seeying our data
count_separator = [] # this is a vector which contains the frequency of ' - ' in each Document

count_Counteudo_length = [] # this is a vector which contains the lenght of "Conteúdo" in each Document

# for each Document in data, count the frequency of ' - ' and the len of "Conteúdo"
for i, Document in enumerate(sp_data['Conteúdo']):
    count_separator.append(Document.count(' - '))

data = np.array(count_separator) # vector which contains the amount of ' - ' in each document

# ------------------------------------------------------------------------------------------
# FILTER THE CONTEUDO OF NOT SENTENCES DOCUMENTS WHICH HAS 4 OR MORE ' - '
# ------------------------------------------------------------------------------------------

# regex for search of a subcontent of sentences in our data
# regex = r"( julgo | homologo.*o acordo)" # deu 97%??? deve ta errado
regex = r"(?i)(.JULGO PROCEDENTE.[^EM PARTE]|.JULGO TOTALMENTE PROCEDENTE.|.JULGO PROCEDENTE EM PARTE.|.JULGO PARCIALMENTE PROCEDENTE.|.JULGO IMPROCEDENTE.|.JULGO TOTALMENTE IMPROCEDENTE.|.JULGO EXTINTO.*sem.(julgamento|resolução).de.mérito.|.hom(o|ó)logo o acordo.|.(art\.?|artigo) 284.)"
Doc_series = pd.Series(sp_data['Conteúdo']) # Transform the pandas Data.frame to a series for initial interting spot

# Filter the documents which has the regex in "conteúdo"
sentencas = Doc_series.str.contains(regex, case=False)
sentencas = sentencas.to_numpy()

ind_range = (data > 4) & (sentencas == False)

data_4_more = sp_data['Conteúdo'][ind_range]

positions = []

processos = []

classes = []

assuntos = []

partes = []

conteudo = []

advs = []

for document in data_4_more:

    # ----------------------------------------------------------
    # Filtering the processes
    # ----------------------------------------------------------

    position = re.search(r' - ', document)
    processos.append(document[0:position.start()])

    doc_no_processos = document[position.end():]

    # ----------------------------------------------------------
    # Filtering the classes
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_processos)
    classes.append(doc_no_processos[0:position.start()])

    doc_no_classes_to = doc_no_processos[position.end():]

    # ----------------------------------------------------------
    # Filtering the assuntos
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_classes_to)
    assuntos.append(doc_no_classes_to[0:position.start()])

    doc_no_assuntos_to = doc_no_classes_to[position.end():]

    # ----------------------------------------------------------
    # Filtering the parts
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_assuntos_to)
    partes.append(doc_no_assuntos_to[0:position.start()])

    doc_no_parts_to = doc_no_assuntos_to[position.end():]

    # ----------------------------------------------------------
    # Filtering the content
    # ----------------------------------------------------------

    # ----------------------------------------------------------
    # Filtering the ADV
    # ----------------------------------------------------------
    position = re.search(r"(?i)- ADV", doc_no_parts_to)
    if position:
        advs.append(doc_no_parts_to[position.start():])

        conteudo.append(doc_no_parts_to[0: position.start()])
    else:
        conteudo.append(doc_no_parts_to)

    # ----------------------------------------------------------
len(data_4_more)

# ------------------------------------------------------------------------------------------
# FILTER THE CONTEUDO OF SENTENCES DOCUMENTS WHICH HAS 4 OR MORE ' - '
# ------------------------------------------------------------------------------------------

C_sentencas = sp_data[sentencas]['Conteúdo']
cont_sentencas = data[sentencas]

more = cont_sentencas > 4

teste = C_sentencas[more]

processos_sentencas = []

classes_sentencas = []

assuntos_sentencas = []

partes_sentencas = []

conteudo_sentencas = []

advs_sentencas = []

for i, document in enumerate(teste):

    # ----------------------------------------------------------
    # Filtering the processes
    # ----------------------------------------------------------

    position = re.search(r' - ', document)
    if position:
        processos_sentencas.insert(i, document[0:position.start()])

        doc_no_processos = document[position.end():]

    # ----------------------------------------------------------
    # Filtering the classes
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_processos)

    if position:
        classes_sentencas.insert(i, doc_no_processos[0:position.start()])

        doc_no_classes_to = doc_no_processos[position.end():]
    else:
        conteudo_sentencas.insert(i, doc_no_processos)
    # ----------------------------------------------------------
    # Filtering the assuntos
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_classes_to)
    if position:
        assuntos_sentencas.insert(i, doc_no_classes_to[0:position.start()])

        doc_no_assuntos_to = doc_no_classes_to[position.end():]

    # ----------------------------------------------------------
    # Filtering the parts
    # ----------------------------------------------------------
    position = re.search(r' - ', doc_no_assuntos_to)
    if position:
        partes_sentencas.insert(i, doc_no_assuntos_to[0:position.start()])

        doc_no_parts_to = doc_no_assuntos_to[position.end():]

    # ----------------------------------------------------------
    # Filtering the content
    # ----------------------------------------------------------

    # ----------------------------------------------------------
    # Filtering the ADV
    # ----------------------------------------------------------
    position = re.search(r"(?i)- ADV", doc_no_parts_to)
    if position:
        advs_sentencas.insert(i, doc_no_parts_to[position.start():])

        conteudo_sentencas.insert(i, doc_no_parts_to[0: position.start()])
    else:
        conteudo_sentencas.insert(i, doc_no_parts_to)

len(conteudo_sentencas)

# ------------------------------------------------------------------------------------------
# TOKENIZER THE "CONTEUDO" OF SENTENCES 
# ------------------------------------------------------------------------------------------

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

# create the tokenizer
t = Tokenizer(num_words=100)
# fit the tokenizer on the documents
t.fit_on_texts(conteudo_sentencas)
# ------------------------------------------------------------------------------------------
# GETTING THE TRAIN AND TEST DATASETS 
# ------------------------------------------------------------------------------------------

series = pd.Series(conteudo_sentencas)
not_sentence = pd.Series(conteudo)

X_train = pd.Series(conteudo_sentencas)
not_sentence_train = not_sentence.sample(len(X_train), random_state=123)

Y_train = [1] * len(X_train)
Y_sentence_train = [0] * len(X_train)

X_train = pd.concat([X_train, not_sentence_train])
Y_train = Y_train + Y_sentence_train

from sklearn.model_selection import train_test_split
sentences_train, sentences_test, y_train, y_test = train_test_split(
   X_train, Y_train, test_size=0.2, random_state=100)

# X_train = t.texts_to_matrix(sentences_train, mode='count')
# X_teste = t.texts_to_matrix(sentences_test, mode='count')

# X_train = t.texts_to_matrix(sentences_train, mode='freq')
# X_teste = t.texts_to_matrix(sentences_test, mode='freq')

  return func(self, *args, **kwargs)


In [22]:
ind = [i for i in range(len(Y_train))]
df = pd.DataFrame({'Content': X_train, 'Class': Y_train})
df.index = ind
df.to_csv('SentencesAndNotSentences.csv',index=False)

In [23]:
pd.read_csv('SentencesAndNotSentences.csv')

Unnamed: 0,Content,Class
0,Banco Itauleasing S/A - Diante do exposto e de...,1
1,Telefônica Brasil S/A - Em execução individual...,1
2,CONCLUSÃO Em 22 de outubro de 2012 faço estes ...,1
3,Uma vez que o autor quedou-se inerte quanto à ...,1
4,"Bradesco - Banco Brasileiro de Descontos S/a, ...",1
...,...,...
9987,Seguradora Líder dos Consórcios DPVAT - Fls. 2...,0
9988,Justiça Pública - Anderson Reginaldo Rizzo e o...,0
9989,Antonio Silva dos Santos - Vistos.Fl. 71. Inde...,0
9990,UNIMED DE GUARULHOS COOPERATIVA DE TRABALHO ME...,0
