# Configuração inicial

## Imports

In [None]:
!pip install codetiming --quiet
!pip install spacy --quiet
!pip install rouge-score --quiet
!spacy download pt_core_news_sm --quiet

  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
2022-10-22 00:00:31.832469: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 13.0 MB 59.6 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [None]:
import json
import pandas as pd
import matplotlib as plt
import numpy as np
np.set_printoptions(threshold=7)
import networkx as nx
from math import ceil, floor
# from numpy.core.memmap import dtype

# NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords as NLTK_stopwords
from nltk.tokenize import sent_tokenize as NLTK_sent_tokenize, word_tokenize as NLTK_word_tokenize, RegexpTokenizer as NLTK_RegexpTokenizer
from nltk.cluster.util import cosine_distance as NLTK_cosine_distance
from nltk.stem import WordNetLemmatizer

# Spacy
import spacy
nlp = spacy.load('pt_core_news_sm')

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Widgets iterativos
from google.colab import output
output.enable_custom_widget_manager()
from ipywidgets import interact
from ipywidgets import IntSlider, FloatSlider, SelectionSlider, Dropdown
from ipywidgets import Checkbox, SelectMultiple, Text
import ipywidgets as w

# Métricas de validação
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)

# Permite type hint de funções e list no colab
from typing import Callable, List, Dict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Download e leitura do corpus

In [None]:
!git clone https://github.com/diego-feijo/rulingbr/ # Download dos arquivos
!tar xJvf rulingbr/rulingbr-v1.2.tar.xz # Extrai pasta compactada com o corpus rulingbr-v1.2

Cloning into 'rulingbr'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
^C
tar (child): rulingbr/rulingbr-v1.2.tar.xz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [None]:
# Load docs
docs = []
with open('rulingbr-v1.2.jsonl') as f:
    docs = list(map(
        lambda line: json.loads(line), 
        f
    ))

FileNotFoundError: ignored

# Exploração do Corpus

Lista de seções por ordem de relevância
 - Ementa: Gold Standart
 ---
 - Relatório: General description about the case.
 - Voto: Judge vote(s). It contains one or more votes from the judges. This document presents the critical aspects considered for the decision.
 - Acordão: This is the final decision. It is the compilation of the votes.
 - Area: Broad topic about the decision. [Lista de possiblidade aqui](https://github.com/diego-feijo/rulingbr#area)
 ---
 - Recursos: [Lista de possiblidade aqui](https://github.com/diego-feijo/rulingbr#a%C3%A7%C3%B5es-aut%C3%B4nomas)
 - Classe: It is the specific juridic instrument. [It can be one of the following](https://github.com/diego-feijo/rulingbr#classe). They may also be combined.
 ---
 - Extrato: Additional information. This section is missing in near 50% of the cases.
 - Relator: The name of the judge responsible for composing the report and the first vote of the decision.

## Visualização inicial


In [None]:
# Print do primeiro documento (ementa, acordão, relatório, voto, extrato, relator, classe, area)
doc = docs[0]
print(f"{doc.keys()}", end="\n\n")
for key in doc.keys():
  print(f"{key} -> {doc[key]}")

print("=---------=")
print(f"{doc['relator']}: {doc['ementa'][:52]}")

dict_keys(['ementa', 'acordao', 'relatorio', 'voto', 'extrato', 'relator', 'classe', 'area'])

ementa -> EMBARGOS DE DECLARAÇÃO - INOCORRÊNCIA DE CONTRADIÇÃO, OBSCURIDADE OU OMISSÃO - PRETENDIDO REEXAME DA CAUSA CARÁTER INFRINGENTE - INADMISSIBILIDADE – EMBARGOS DE DECLARAÇÃO REJEITADOS. - Não se revelam admissíveis os embargos de declaração, quando a parte recorrente - a pretexto de esclarecer uma inexistente situação de obscuridade, omissão ou contradição - vem a utilizá-los com o objetivo de infringir o julgado e de viabilizar, assim, um indevido reexame da causa. Precedentes. - A nulidade radical que afeta os diversos atos de interrogatório judicial, quando provocada pela conduta do magistrado que arbitrariamente nega, ao réu, o direito – por este titularizado – de formular reperguntas aos demais litisconsortes penais passivos, contamina, por efeito causal, todos os atos subseqüentes do processo, notadamente aqueles de índole probatória, como a inquirição de testemunhas arroladas p

## Visualização iterativa + Correção de sentença + Abreviações

#### Abreviações

In [None]:
# https://www.stf.jus.br/arquivo/cms/publicacaoLegislacaoAnotada/anexo/siglas_cf.pdf
ABREVIATION_DICT = {
    "fls.": "folhas"          ,
    "fl." : "folha"           ,
    "art.": "artigo"          ,
    "ART.": "ARTIGO"          ,
    "Min.": "Ministro"        ,"min.": "Ministro",
    "Rel.": "relator"         ,"rel.": "relator",
    "Des.": "Desembargador"   ,"des.": "Desembargador",
    "dec.": "decisão"         ,
    "j."  : "julgado em"      ,
}

is_abreviation = lambda word: True if word.lower() in ABREVIATION_DICT.keys() else False

def put_abreviations_in_lower_case(sentences):
    for i in range(len(sentences)):
        sentence = sentences[i]         
        sentences[i] = "".join(list(map(
            lambda word: word.lower() if is_abreviation(word) else word, 
            sentence
        )))
    return sentences

#### Correção de sentença

In [None]:
def merge_wrong_splitted_sentence(sentences):
    first_sentence_index = None
    indexes_to_update = []
    indexes_to_eliminate = []
    for index, sentence in enumerate(sentences):

        if first_sentence_index != index and first_sentence_index != None:
            merged_sentence = sentences[first_sentence_index] + " " + sentence
            indexes_to_update.append((merged_sentence, index))
            indexes_to_eliminate.append(first_sentence_index)
            first_sentence_index = None

        if any(abreviation in sentence for abreviation in ABREVIATION_DICT.values()):
            first_sentence_index = index
        

    for merged_sentence, index in indexes_to_update:
        sentences[index] = merged_sentence
        # print("=------=")
        # print(merged_sentence)
        # print("=------=")

    for index in indexes_to_eliminate[::-1]: 
        sentences.pop(index)
    # map(lambda i: sentences.pop(index), indexes_to_eliminate)


    return sentences

In [None]:
def correct_sentences(sentences):
    sentences = put_abreviations_in_lower_case(sentences)

    # Substitui as abreviações por palavras completas
    for abreviation, word in ABREVIATION_DICT.items():
        sentences = list(map(lambda s: s.replace(abreviation, word), sentences))

    sentences = merge_wrong_splitted_sentence(sentences) # junta as sentenças separadas erroneamente
    
    return sentences

In [None]:
def funcao(numero):
    return numero * 2

In [None]:
funcao(16)

32

#### Visualização iterativa

In [None]:
def explore_text(
                doc_index: int, 
                section_name: int,
                put_emoji: bool,
                RegEx: str
                ):
  
  current_doc = docs[doc_index]
  text = current_doc[section_name]

  if( put_emoji ):
    emoji = " 👈"
  else:
    emoji = ""

  # Divide em sentenças
  if (RegEx):
    tokenizer = NLTK_RegexpTokenizer(RegEx)
    sentences = tokenizer.tokenize(text) # Com regex
  else:
    sentences = NLTK_sent_tokenize(text) # Sem regex

  sentences = correct_sentences(sentences)

  for sentence in sentences:
    print(f"{sentence}{emoji}")

In [None]:
interact(
    explore_text,

    doc_index    = IntSlider(continuous_update=False, min=0, max=len(docs)-1, step=1, value=0),
    section_name = Dropdown(options=[("Ementa", "ementa"), ("Relatório", "relatorio"), ("Area", "area"), ("Relator", "relator"), ("Voto", "voto"), ("Acordão", "acordao"), ("Classe", "classe"), ("Extrato", "extrato") ], value="ementa"),
    put_emoji    = Checkbox(description="Marcar fim de sentença com 👈", value=True),
    RegEx        = w.Text(value="", placeholder="Digite RegEx aqui")
)
print("---------------") # doc_index para testar regex: 1278, 1195

interactive(children=(IntSlider(value=0, continuous_update=False, description='doc_index', max=10573), Dropdow…

---------------


In [None]:
interact(
    explore_text,

    doc_index    = IntSlider(continuous_update=False, min=0, max=len(docs)-1, step=1, value=0),
    section_name = Dropdown(options=[("Ementa", "ementa"), ("Relatório", "relatorio"), ("Area", "area"), ("Relator", "relator"), ("Voto", "voto"), ("Acordão", "acordao"), ("Classe", "classe"), ("Extrato", "extrato") ], value="ementa"),
    put_emoji    = Checkbox(description="Marcar fim de sentença com 👈", value=True),
    RegEx        = w.Text(value="", placeholder="Digite RegEx aqui")
)
print("---------------") # doc_index para testar regex: 1278, 1195

interactive(children=(IntSlider(value=0, continuous_update=False, description='doc_index', max=10573), Dropdow…

---------------


## Criação da lista de stop words

In [None]:
my_stop_list = ["p", "p/"]
NLTK_stop_words = NLTK_stopwords.words("portuguese")
Spacy_stop_words = list(nlp.Defaults.stop_words)

STOP_WORDS = list(set(
    my_stop_list + 
    NLTK_stop_words + 
    Spacy_stop_words
))

print(STOP_WORDS)

['houve', 'eu', 'novo', 'pontos', 'nuns', 'geral', 'vós', 'puderam', 'local', 'algo', 'daquela', 'atrás', 'seria', 'logo', 'talvez', 'tiverem', 'tenham', 'tua', 'tanta', 'dar', 'estávamos', 'mesmo', 'fazeis', 'entre', 'aos', 'debaixo', 'ser', 'ligado', 'te', 'do', 'cinco', 'será', 'dez', 'disso', 'certamente', 'foste', 'fazes', 'por', 'pois', 'mais', 'para', 'ali', 'algumas', 'delas', 'nível', 'pegar', 'estamos', 'longe', 'poderá', 'tive', 'nesta', 'conhecida', 'estivermos', 'tão', 'segunda', 'menos', 'querem', 'aquela', 'quieto', 'tipo', 'meses', 'forem', 'pelos', 'ir', 'final', 'aquelas', 'até', 'estavam', 'cento', 'enquanto', 'porquanto', 'terceira', 'número', 'tiveste', 'inclusive', 'vinte', 'bastante', 'tivemos', 'vosso', 'tivesse', 'formos', 'esse', 'breve', 'tente', 'haja', 'sem', 'nova', 'estivesse', 'os', 'tentei', 'irá', 'fomos', 'último', 'bem', 'sejamos', 'vezes', 'num', 'suas', 'obrigado', 'ter', 'cedo', 'dois', 'quinze', 'vem', 'estivessem', 'à', 'estiveste', 'houverem', 

# Extração de features beaseada em heurísticas para **pontuação de sentenças**

Observações:
A pontuação pode ser positiva ou negativa e mais de uma pontuação pode ser usada. As heurísticas são escolhidas de forma iterativa no final deste colab.

 - TF
 - IDF
 - TF_IDF
 - Quantidade de palavras iniciando com letra maiúscula
 - Existência de número
 - Quantidade de nomes próprios
 - Posição
 - Tamanho
 - Quantidade de conjunções
 <!-- - Coocorrência/Relaçãos Sintagmática (Frequência de n-grams) -->
 <!-- - Similaridade Léxica (Word Wise) -->
  <!-- - TODO: Encontrar heurísticas para similaridade entre palavras -->
  <!-- - https://pt.frwiki.wiki/wiki/Similarit%C3%A9_lexicale -->
  <!-- - Exemplo: Início, Meio, Fim, Ementa, Acordão, Relatório, Voto, Extrato, Relator, Classe, Area -->
  <!-- - TODO: Encontrar lendo os documentos do corpus uma boa posição -->
 <!-- - Centralidade (repetição de informação entre sentenças) -->
 <!-- - Comparação com título  -->

### Funções auxiliares

Criando dataframe

In [None]:
df = pd.DataFrame(docs,columns=docs[0].keys())
train, test = train_test_split(df)

In [None]:
def split_sentence(sentence: str):
    words = nlp(sentence)
    lemma = []
    pos = []
    for word in words:
        lemma.append(word.lemma_)
        pos.append(word.pos_)
    return lemma, pos

In [None]:
def create_bag_of_word(words: List[str]):
    bag = {}
    unique_words = list(set(words))
    for word in unique_words:
        bag[word] = 0.0
    return bag

In [None]:
def bag_count(bag_zero, words):
    bag = bag_zero.copy()
    for word in words:
        bag[word] += 1

    return bag

In [None]:
def sentences_split(sentences: List[str]):
    corpus = {
        'sentences': [],
        'pos': [],
        'bags': [],
        'features': {},
        'b0': {}
    }
    for i in range(len(sentences)):
        lemma, pos = split_sentence(sentences[i])
        corpus["sentences"].append(lemma)
        corpus["pos"].append(pos)
    return corpus

### Extração de features baseadas em palavras

#### TF/IDF/TF-IDF

In [None]:
def term_frequency(bag, size):
    frequency = bag.copy()
    for word in bag:
        frequency[word] = bag[word] / size
    return np.fromiter(frequency.values(), dtype=float) # list(frequency.values())

In [None]:
def inverse_document_frequency(corpus_bags, bag, bag_zero):
    count = bag_zero.copy()
    for corpus_bag in corpus_bags:
        for word in bag:
            if corpus_bag[word] > 0 and bag[word] > 0:
                count[word] += 1
    size = len(corpus_bags)
    for word in count:
        if not count[word] == 0:
            count[word] = np.log10(size/count[word])

    return np.fromiter(count.values(), dtype=float) # list(count.values())

In [None]:
def feature_TFIDF(corpus: List[str], query: List[str]):
    for doc in [corpus, query]:
        doc['features']['tf']    = [] # lista com frequência de cada palavra
        doc['features']['idf']   = [] # lista com frequência inversa de cada documento
        doc['features']['tfidf'] = []

    all_sentences: List[str] = [] # Lista de palavras de todas as sentenças
    for doc in [corpus, query]:
        for sentence_ in doc["sentences"]:
            all_sentences += sentence_

    bag_zero = create_bag_of_word(all_sentences)
    corpus['b0'] = bag_zero
    for doc in [corpus, query]:
        for i in range(len(doc['sentences'])):
            doc['bags'].append(bag_count(bag_zero, doc['sentences'][i]))
    
    for doc in [corpus, query]:
        for i in range(len(doc['bags'])):
            tf = term_frequency(doc['bags'][i], len(doc['sentences'][i]))
            idf = inverse_document_frequency(corpus['bags'],doc['bags'][i], bag_zero)
            doc['features']['tf'].append(tf)
            doc['features']['idf'].append(idf)
            doc['features']['tfidf'].append(tf * idf)

####Letra maíscula e nome proprio

In [None]:
def feature_UPPER_PROP(corpus: List[str], query: List[str]):
    for doc in [corpus, query]:
        doc['features']['upper'] = []
        doc['features']['prop'] = []
        for i in range(len(doc['pos'])):
            bag = doc['bags'][i]
            count_upper = 0
            count_prop = 0
            for ii in range(len(doc['pos'][i])):
                word = doc['sentences'][i][ii]
                if word[0].isupper(): 
                    count_upper += bag[word]
                if doc['pos'][i][ii] == 'PROPN':
                    count_prop += bag[word]

            doc['features']['upper'].append(count_upper)
            doc['features']['prop'].append(count_prop)


###Extração de features baseadas em Sentenças

####Conter númerais


In [None]:
def feature_CONTAIN_NUMBER(corpus: List[str], query: List[str]):
    for doc in [corpus, query]:
        doc['features']['number'] = []
        
        for pos_list in doc['pos']:
            contains = 0
            for pos in pos_list:
                if pos == 'NUM':
                    contains = 1
                    break

            doc['features']['number'].append(contains)

####Tamanho da sentença


In [None]:
def feature_LENGTH(corpus: List[str], query: List[str]):
    for doc in [corpus, query]:
        doc['features']['length'] = []
    
        for sentence in doc['sentences']:
            doc['features']['length'].append(len(sentence))

####Posição da sentença


In [None]:
def feature_POSITION(corpus: List[str], query: List[str], metric='INV'):
    for doc in [corpus, query]:
        l = len(doc['sentences'])
        position = []
        if metric == 'ORD': position = np.array(range(1,l+1), dtype=float)
        elif metric == 'INV': position = np.array(range(l+1,1,-1), dtype=float)
        elif metric == 'MID':
            start = np.array(range(1,int(l/2)), dtype=float)
            end = np.array(range(int(l/2)-1, 0,-1), dtype=float)
            position = np.concatenate(start,end)
        elif metric == 'EXT': 
            start = np.array(range(1,int(l/2)), dtype=float)
            end = np.array(range(int(l/2)-1, 0,-1), dtype=float)
            position = np.concatenate(end,start)
            
        doc['features']['position'] = position

####Quantidade de conjunções


In [None]:
def feature_CUE_PHRASES(corpus: List[str], query: List[str]):
    for doc in [corpus, query]:
        doc['features']['cue'] = []
    
        for bag in doc['bags']:
            count_cue = 0
            for word in bag:
                if doc['pos'][0] == 'CONJ':
                    count_cue += bag[word]

            doc['features']['cue'].append(count_cue)


In [None]:
def stract_info(sentences: List[str], analysed_sentence: List[str]):
    sentences = sentences_split(sentences)
    analysed_sentence = sentences_split(analysed_sentence)
    
    # Feature baseados em palavras
    feature_TFIDF(sentences, analysed_sentence)
    feature_UPPER_PROP(sentences, analysed_sentence) 

    # Feature baseados em sentenças
    feature_CUE_PHRASES(sentences, analysed_sentence)
    feature_CONTAIN_NUMBER(sentences, analysed_sentence)
    feature_LENGTH(sentences, analysed_sentence)
    feature_POSITION(sentences, analysed_sentence)

    return analysed_sentence['features']

In [None]:
stract_info(
    [
        'O Rodemarck restaurante italiano tem uma ÓTIMA pizza',
        'Restaurante americano tem o melhor hamburguer',
        'Restaurante coreano tem o melhor bibimbap',
        'O MELHOR dos melhores restaurantes é americano'
    ],
    [
        'O MELHOR dos Melhores restaurantes é americano',
    ]
)

{'tf': [array([0.14285714, 0.        , 0.14285714, ..., 0.14285714, 0.        ,
         0.14285714])],
 'idf': [array([0.60205999, 0.        , 0.60205999, ..., 0.        , 0.        ,
         0.        ])],
 'tfidf': [array([0.08600857, 0.        , 0.08600857, ..., 0.        , 0.        ,
         0.        ])],
 'upper': [2.0],
 'prop': [1.0],
 'cue': [0],
 'number': [0],
 'length': [7],
 'position': array([2.])}

### Aplica heuristicas no score de cada frase

In [None]:
def apply_heuristic_score(raw_score: Dict,      # Pontuação de cada sentença com algoritmo de grafo 
                          sentences_info: Dict, # Informações extraidas das sentenças
                          heuristics: List[str], # Lista de quais heurísticas aplicar
                          print_scores = False,
                          ):
  # positives = ['tf+', 'idf+', 'tfidf+', 'upper+', 'prop+', 'cue+', 'number+', 'length+', 'position+']
  # negatives = ['tf-', 'idf-', 'tfidf-', 'upper-', 'prop-', 'cue-', 'number-', 'length-', 'position-']

  scores = list(raw_score.values())

  # Soma os tf/idf/tfidf das palavras para gerar o valor da frase
  sentences_info['tf'] = list(map(sum, sentences_info['tf']))
  sentences_info['idf'] = list(map(sum, sentences_info['idf']))
  sentences_info['tfidf'] = list(map(sum, sentences_info['tfidf']))

  if (print_scores): print(f"Pontuação antes -> {scores}")

  for i in range(len(raw_score)):
    for heuristic in heuristics:
      positive_negative = heuristic[-1] # Pega a parte + ou -
      positive_negative = +1 if positive_negative == "+" else -1

      name = heuristic[:-1] # Pega o nome da heuristica

      # print(sentences_info[name][i])
      scores[i] += sentences_info[name][i] * positive_negative # Atualiza o score

  if (print_scores): print(f"Pontuação depois -> {scores}")

  return scores

# score[i] += sentences_info['tf'][i] * positive_negative
# score[i] += sentences_info['idf'][i] * positive_negative
# score[i] += sentences_info['tfidf'][i] * positive_negative
# score[i] += sentences_info['upper'][i] * positive_negative
# score[i] += sentences_info['prop'][i] * positive_negative
# score[i] += sentences_info['cue'][i] * positive_negative
# score[i] += sentences_info['number'][i] * positive_negative
# score[i] += sentences_info['length'][i] * positive_negative
# score[i] += sentences_info['position'][i] * positive_negative

# if 'tf+' in heuristics:

# if 'tf-' in heuristics:
#   score[i] -= sentences_info['tf'][i]

# if '' in heuristics:
#   score[i] -= sentences_info['tf'][i]
# if 'tfidf+' in heuristics:
# for key in sentences_info:
#     print(len(sentences_info[key]))
#   print(len(df["processed_sentences"]))
# # teste
# all_words = map(lambda word: word.lower(), NLTK_word_tokenize(' '.join(sentences)))
# all_words = list(set(all_words))

# Geração da matrix de similaridade entre cada par de sentenças usando BOW / TF / TF-IDF

BOW &emsp; 👉 Apenas frequênzia <br>
TF  &emsp; 👉 Frequência dividida pelo tamanho da frase <br>
TF-IDF     👉 TF * IDF

https://pt.wikipedia.org/wiki/Tf%E2%80%93idf

https://drive.google.com/file/d/1fCAJo2s6fu7edE0jiP3VUW9Bxo3aSGge/view


In [None]:
def build_similarity_matrix(sentences: List[str], similarity_function: Callable):

  len_sentences = len(sentences)

  # Create an empty similarity matrix
  similarity_matrix = np.zeros( shape=(len_sentences, len_sentences) )

  for idx1 in range(len_sentences):
    for idx2 in range(len_sentences):
      if idx1 == idx2: 
        continue # ignore if both are same sentences

      similarity_matrix[idx1][idx2] = similarity_function(sentences[idx1], sentences[idx2])

  return similarity_matrix

################################################################################

def sentence_similarity_by_BOW(sentence_1, sentence_2):

    splited_sentence_1 = NLTK_word_tokenize(sentence_1)
    splited_sentence_2 = NLTK_word_tokenize(sentence_2)
 
    all_words = splited_sentence_1 + splited_sentence_2

    bag_of_word_1 = create_bag_of_word(all_words)
    bag_of_word_2 = create_bag_of_word(all_words)
 
    for word in splited_sentence_1:
        bag_of_word_1[word] += 1
        
    for word in splited_sentence_2:
        bag_of_word_2[word] += 1

    vector_1 = list(bag_of_word_1.values())
    vector_2 = list(bag_of_word_2.values())

    return 1 - NLTK_cosine_distance(vector_1, vector_2)

################################################################################

def sentence_similarity_by_TF(sentence_1, sentence_2):

    splited_sentence_1 = NLTK_word_tokenize(sentence_1)
    splited_sentence_2 = NLTK_word_tokenize(sentence_2)
 
    all_words = splited_sentence_1 + splited_sentence_2

    bag_of_word_1 = create_bag_of_word(all_words)
    bag_of_word_2 = create_bag_of_word(all_words)
 
    for word in splited_sentence_1:
        bag_of_word_1[word] += 1
        
    for word in splited_sentence_2:
        bag_of_word_2[word] += 1

    for word in splited_sentence_1:
        bag_of_word_1[word] = bag_of_word_1[word] / len(splited_sentence_1)
        
    for word in splited_sentence_2:
        bag_of_word_2[word] = bag_of_word_2[word] / len(splited_sentence_2)
    
    vector_1 = list(bag_of_word_1.values())
    vector_2 = list(bag_of_word_2.values())

    return 1 - NLTK_cosine_distance(vector_1, vector_2)

################################################################################

def sentence_similarity_by_TFIDF(sentence_1, sentence_2):

    splited_sentence_1 = NLTK_word_tokenize(sentence_1)
    splited_sentence_2 = NLTK_word_tokenize(sentence_2)
 
    all_words = splited_sentence_1 + splited_sentence_2

    bag_of_word_1_idf = create_bag_of_word(all_words)
    bag_of_word_2_idf = create_bag_of_word(all_words)

    bag_of_word_1_tf = create_bag_of_word(all_words)
    bag_of_word_2_tf = create_bag_of_word(all_words)
 
    for word in splited_sentence_1:
        bag_of_word_1_idf[word] += 1
        bag_of_word_2_idf[word] += 1

        bag_of_word_1_tf[word] += 1
        
    for word in splited_sentence_2:
        bag_of_word_1_idf[word] += 1
        bag_of_word_2_idf[word] += 1

        bag_of_word_2_tf[word] += 1

    for word in splited_sentence_1:
        bag_of_word_1_tf[word] = bag_of_word_1_tf[word] / len(splited_sentence_1)
        
    for word in splited_sentence_2:
        bag_of_word_2_tf[word] = bag_of_word_2_tf[word] / len(splited_sentence_2)

    M = 3 # 2 sentenças +1
    for word in all_words:
        if bag_of_word_1_idf[word] > 0: 
            bag_of_word_1_idf[word] = np.log10( M / bag_of_word_1_idf[word] )

        if bag_of_word_2_idf[word] > 0:
            bag_of_word_2_idf[word] = np.log10( M / bag_of_word_2_idf[word] )
    
    idf_1 = np.fromiter(bag_of_word_1_idf.values(), dtype=float)
    idf_2 = np.fromiter(bag_of_word_2_idf.values(), dtype=float)

    tf_1 = np.fromiter(bag_of_word_1_tf.values(), dtype=float)
    tf_2 = np.fromiter(bag_of_word_2_tf.values(), dtype=float)

    vector_1 = tf_1 * idf_1
    vector_2 = tf_2 * idf_2

    return 1 - NLTK_cosine_distance(vector_1, vector_2)


# Pontuação de sentenças com grafo de similaridade

Documentação: https://networkx.org/documentation/stable/tutorial.html

Rank em Grafo:
 - Bushy path of the node
 - Similaridade agregada
 - [Page Pank](https://youtu.be/MG0fIXfrT9A?t=114)

In [None]:
def bush_path(similarity_graph: nx.Graph, threshold = 0.5) -> Dict[str, float]:
  G = similarity_graph

  scores = {
      # Sentença/Node: Score
  }

  for node in G.nodes():
    scores[node] = 0 # Inicializa a pontuação de todas as sentenças

    for neighbor_node in G.neighbors(node):
      similarity = G.get_edge_data(node, neighbor_node)['weight']
      
      if similarity > threshold:
        scores[node] += 1
  
  return scores

  # rows, cols = similarity_matrix.shape
      
  # apply_threshold = lambda value: 1 if value > threshold else 0
  # vectorized_apply_threshold = np.vectorize(apply_threshold)

  # thresholded_matrix = vectorized_apply_threshold(similarity_matrix)

  # print(thresholded_matrix)

  # Iteração sobre o triângulo de cima da matrix simétrica
  # https://people.revoledu.com/kardi/tutorial/VB/tips/Symmetric-Matrix.html
  # for i in range(1, rows -1):
  #   for j in range(i +1, cols):
  #     if similarity_matrix[i, j] > threshold:
  #       pass

################################################################################

def similaridade_agregada(similarity_graph: nx.Graph) -> Dict[str, float]:
  G = similarity_graph

  scores = {
      # Sentença/Node: Score
  }

  for node in G.nodes():
    scores[node] = 0 # Inicializa a pontuação de todas as sentenças

    for neighbor_node in G.neighbors(node):
      similarity = G.get_edge_data(node, neighbor_node)['weight']
      scores[node] += similarity
  
  return scores

################################################################################

def text_rank(similarity_graph: nx.Graph) -> Dict[str, float]:
  return nx.pagerank(similarity_graph)

# Sumarização com Algoritmos de Resumo em Grafo + Heurísticas

In [None]:
def generate_summary(doc_index: int, 
                     return_text: bool,
                     filter_stop_word: bool,
                     lemmatize: bool,
                     join: bool,
                     show_matrix: bool,
                     print_scores: bool,
                     heuristics: List[str], 
                     section_names: List[str], 
                     RegEx: str, 
                     algorithm: str,
                     threshold: float,
                     similarity: str,
                     top_n: int,
                     validation: str
                     ):
  
  doc = docs[doc_index] # Documento que vai ser resumido

  ### Step 1 - Junta o texto das seções do documento que vai ser resumido
  text = list(map(lambda section_name: doc[section_name], section_names))
  text = ". ".join(text)

  ### Step 2 - Divide o texto em sentenças
  if (RegEx):
    tokenizer = NLTK_RegexpTokenizer(RegEx)
    sentences = tokenizer.tokenize(text) # Com regex
  else:
    sentences = NLTK_sent_tokenize(text) # Sem regex
    sentences: List[str] = correct_sentences(sentences)
  

  # Declara dataframe de auxílio
  df = pd.DataFrame()
  df["original_sentences"]  = sentences
  df["processed_sentences"] = sentences.copy() # Começa como uma cópia do original
  df["sentence_scores"]     = [0.0] * len(sentences) 


  ### Step 3 - Aplica Pre-processamento - Remove frase pequena - Filtra stop word - Lematiza o texto
  df = df[df["original_sentences"].map(NLTK_word_tokenize).map(len) >= 5]
  df.reset_index(drop = False, inplace = True)

  if (filter_stop_word):
    df["processed_sentences"] = df["processed_sentences"].apply(
        lambda sentence: " ".join(list(filter(lambda word: word.lower() not in STOP_WORDS, NLTK_word_tokenize(sentence))))
    )

  if (lemmatize):
    df["processed_sentences"] = df["processed_sentences"].apply(
        lambda sentence: " ".join(list(map(lambda word: word.lemma_, nlp(sentence))))
    )
  
  ### Step 4 - Gerar matrix de similaridade entre sentenças
  if similarity == "BOW":
    similarity_function = sentence_similarity_by_BOW
  elif similarity == "TF":
    similarity_function = sentence_similarity_by_TF
  elif similarity == "TF-IDF":
    similarity_function = sentence_similarity_by_TFIDF

  sentence_similarity_martix = build_similarity_matrix(df["processed_sentences"], similarity_function)
  if (show_matrix): 
    print("=" + "-"*20 + "=")
    print(sentence_similarity_martix)
    print("=" + "-"*20 + "=")

  ### Step 5 - Pontua sentença usando a matrix de similaridade no algoritmo escolhido
  if algorithm == "Bush Path Node":
    algorithm_function = bush_path
  elif algorithm == "Similaridade Agregada":
    algorithm_function = similaridade_agregada
  elif algorithm == "Text Rank":
    algorithm_function = text_rank
  
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
  args = {
      "Bush Path Node":        [sentence_similarity_graph, threshold,],
      "Similaridade Agregada": [sentence_similarity_graph,],
      "Text Rank":             [sentence_similarity_graph,]
  }
  args = args[algorithm]
  raw_scores = algorithm_function(*args)

  ### Stop 6 - Extrair informações sobre cada sentença e pontuar a heurística
  sentences_info = stract_info([], df["processed_sentences"])

  ### Stop 7 - Pontuar sentenças com heurística
  final_score = apply_heuristic_score(raw_scores, sentences_info, heuristics, print_scores)
  df["sentence_scores"] = final_score
  # df["sentence_scores"] = list(raw_scores.values())

  ### Step 8 - Ordena as sentenças por pontuação
  df.sort_values(by=["sentence_scores"], ascending=False, inplace=True)

  ### Step 9 - Escolhe as melhores sentences
  summarize_text = df["original_sentences"][:top_n]
  out_text = " ⚪ ".join(summarize_text) if join else summarize_text

  ### Step 10 - Avalia a pontuação do resumo
  print("=" + "-"*20 + "=")
  if validation == "ROUGE":
    score = scorer.score(" ".join(summarize_text), doc['ementa'])
    p1, r1, f1 = score['rouge1'].precision, score['rouge1'].recall, score['rouge1'].fmeasure
    p2, r2, f2 = score['rouge2'].precision, score['rouge2'].recall, score['rouge2'].fmeasure
    pL, rL, fL = score['rougeL'].precision, score['rougeL'].recall, score['rougeL'].fmeasure
    print(f"SCORE ROUGE1 -> precision = {p1:.2f}, recall = {p1:.2f}, fmeasure = {p1:.2f}")
    print(f"SCORE ROUGE2 -> precision = {p2:.2f}, recall = {p2:.2f}, fmeasure = {p2:.2f}")
    print(f"SCORE ROUGEL -> precision = {pL:.2f}, recall = {pL:.2f}, fmeasure = {pL:.2f}")
  # elif validation == "Pyramid":
  #   pass
  # elif validation == "Sentence Mapping":
  #   pass
  print("=" + "-"*20 + "=")


  return out_text if return_text else score

################################################################################
heuristics_options = [
    ("TF+", "tf+"), ("IDF+", "idf+"), ("TF-IDF+", "tfidf+"), ("Upper+", "upper+"), ("Prop+", "prop+"), ("Cue Phrases+", "cue+"), ("Have Number+", "number+"), ("Length+", "length+"), ("Position+", "position+"),
    ("TF-", "tf-"), ("IDF-", "idf-"), ("TF-IDF-", "tfidf-"), ("Upper-", "upper-"), ("Prop-", "prop-"), ("Cue Phrases-", "cue-"), ("Have Number-", "number-"), ("Length-", "length-"), ("Position-", "position-"),
]
interact(
    generate_summary,

    filter_stop_word = Checkbox(description="Filter Stop Word", value=True),
    return_text      = Checkbox(description="Return Text", value=True),
    lemmatize        = Checkbox(description="Preprocess with lemma", value=True),
    join             = Checkbox(description="Return joined sentences", value=True),
    show_matrix      = Checkbox(description="Show similarity matrix", value=False),
    print_scores     = Checkbox(description="Show heuristic score change", value=False),
    doc_index        = IntSlider(continuous_update=False, min=0, max=len(docs)-1, step=1, value=0),
    heuristics       = SelectMultiple(options=heuristics_options, description='Heuristicas', value=[("upper+")]),
    section_names    = SelectMultiple(options=[("Ementa", "ementa"), ("Relatório", "relatorio"), ("Area", "area"), ("Relator", "relator"), ("Voto", "voto"), ("Acordão", "acordao"), ("Classe", "classe"), ("Extrato", "extrato") ], description='Ctrl + Click', value=[("relatorio")]),
    RegEx            = Text(value="", description="RegEx", placeholder="Digite RegEx aqui"),
    algorithm        = Dropdown(options=["Bush Path Node", "Similaridade Agregada", "Text Rank"], description="Algoritmo", value="Bush Path Node"),
    threshold        = FloatSlider(continuous_update=False, description="Limiar Bush", min = 0.0, max = 1.0, step = 0.05, value = 0.5), # if algorithm == "Bush Path Node" else 0.5
    similarity       = Dropdown(options=["BOW", "TF", "TF-IDF"], description="Similaridade", value="BOW"),
    top_n            = IntSlider(continuous_update=False, min=1, max=15, step=1, value=3),
    validation       = Dropdown(options=["ROUGE"], description="Validação", value="ROUGE"), # , "Pyramid", "Sentence Mapping"
)
print("#" + "-"*20 + "#") # doc_index: 1278

interactive(children=(IntSlider(value=0, continuous_update=False, description='doc_index', max=10573), Checkbo…

#--------------------#


## Resume e cálcula o score de 200 documentos

In [None]:
args_1 = [False, True, True, True, False, False, ['tfidf+', 'upper+', 'prop+', 'cue+', 'length+', 'position-'], ['relatorio'], '', 'Text Rank', 0.5, 'TF-IDF', 15, 'ROUGE']
teste_1_scores = list(map(lambda doc_index: generate_summary(doc_index, *args_1), range(200)))

args_2 = [False, True, True, True, False, False, ['tfidf+', 'upper+', 'length+', 'position-'], ['relatorio'], '', 'Text Rank', 0.5, 'TF-IDF', 15, 'ROUGE']
teste_2_scores = list(map(lambda doc_index: generate_summary(doc_index, *args_2), range(200)))

args_3 = [False, True, True, True, False, False, ['tfidf+', 'upper+', 'prop+', 'cue+'], ['relatorio'], '', 'Bush Path Node', 1.00, 'BOW', 15, 'ROUGE']
teste_3_scores = list(map(lambda doc_index: generate_summary(doc_index, *args_3), range(200)))

=--------------------=
SCORE ROUGE1 -> precision = 0.67, recall = 0.67, fmeasure = 0.67
SCORE ROUGE2 -> precision = 0.29, recall = 0.29, fmeasure = 0.29
SCORE ROUGEL -> precision = 0.41, recall = 0.41, fmeasure = 0.41
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.86, recall = 0.86, fmeasure = 0.86
SCORE ROUGE2 -> precision = 0.58, recall = 0.58, fmeasure = 0.58
SCORE ROUGEL -> precision = 0.56, recall = 0.56, fmeasure = 0.56
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.81, recall = 0.81, fmeasure = 0.81
SCORE ROUGE2 -> precision = 0.47, recall = 0.47, fmeasure = 0.47
SCORE ROUGEL -> precision = 0.70, recall = 0.70, fmeasure = 0.70
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.49, recall = 0.49, fmeasure = 0.49
SCORE ROUGE2 -> precision = 0.25, recall = 0.25, fmeasure = 0.25
SCORE ROUGEL -> precision = 0.19, recall = 0.19, fmeasure = 0.19
=--------------------=
=--------------------=
SCORE ROUGE1 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


=--------------------=
SCORE ROUGE1 -> precision = 0.74, recall = 0.74, fmeasure = 0.74
SCORE ROUGE2 -> precision = 0.33, recall = 0.33, fmeasure = 0.33
SCORE ROUGEL -> precision = 0.41, recall = 0.41, fmeasure = 0.41
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.41, recall = 0.41, fmeasure = 0.41
SCORE ROUGE2 -> precision = 0.11, recall = 0.11, fmeasure = 0.11
SCORE ROUGEL -> precision = 0.15, recall = 0.15, fmeasure = 0.15
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.26, recall = 0.26, fmeasure = 0.26
SCORE ROUGE2 -> precision = 0.13, recall = 0.13, fmeasure = 0.13
SCORE ROUGEL -> precision = 0.19, recall = 0.19, fmeasure = 0.19
=--------------------=
=--------------------=
SCORE ROUGE1 -> precision = 0.51, recall = 0.51, fmeasure = 0.51
SCORE ROUGE2 -> precision = 0.17, recall = 0.17, fmeasure = 0.17
SCORE ROUGEL -> precision = 0.27, recall = 0.27, fmeasure = 0.27
=--------------------=
=--------------------=
SCORE ROUGE1 

In [None]:
# Estatísicas dos testes

for test_score in [teste_1_scores, teste_2_scores, teste_3_scores]:
  df = pd.DataFrame()
  p1s, r1s, f1s = [], [], []
  p2s, r2s, f2s = [], [], []
  pLs, rLs, fLs = [], [], []

  for score in test_score:
    p1, r1, f1 = score['rouge1'].precision, score['rouge1'].recall, score['rouge1'].fmeasure
    p2, r2, f2 = score['rouge2'].precision, score['rouge2'].recall, score['rouge2'].fmeasure
    pL, rL, fL = score['rougeL'].precision, score['rougeL'].recall, score['rougeL'].fmeasure

    p1s.append(p1)
    r1s.append(r1)
    f1s.append(f1)

    p2s.append(p2)
    r2s.append(r2)
    f2s.append(f2)
    
    pLs.append(pL)
    rLs.append(rL)
    fLs.append(fL)

  df['ROUGE-1 Precision'] = p1s
  df['ROUGE-1 Recall']    = r1s
  df['ROUGE-1 fmeasure']  = f1s

  df['ROUGE-2 Precision'] = p2s
  df['ROUGE-2 Recall']    = r2s
  df['ROUGE-2 fmeasure']  = f2s

  df['ROUGE-L Precision'] = pLs
  df['ROUGE-L Recall']    = rLs
  df['ROUGE-L Fmeasure']  = fLs

  display(df.describe())

Unnamed: 0,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 fmeasure,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 fmeasure,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L Fmeasure
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,0.532339,0.318858,0.329544,0.250206,0.142621,0.1501,0.306936,0.171036,0.177246
std,0.199412,0.191353,0.146558,0.155868,0.120452,0.111516,0.147991,0.102693,0.070499
min,0.045279,0.022487,0.042984,0.0,0.0,0.0,0.034682,0.016667,0.032086
25%,0.381774,0.153717,0.214845,0.136725,0.058357,0.083307,0.187013,0.091293,0.132177
50%,0.548094,0.295104,0.331804,0.219307,0.109915,0.129293,0.283398,0.154151,0.173694
75%,0.708338,0.48078,0.43118,0.335466,0.189126,0.185822,0.408235,0.229795,0.212207
max,0.913043,0.829146,0.798742,0.762144,0.730479,0.716535,0.754386,0.578125,0.508511


Unnamed: 0,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 fmeasure,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 fmeasure,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L Fmeasure
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,0.531601,0.318874,0.329618,0.250299,0.142454,0.150114,0.304949,0.170118,0.176218
std,0.198954,0.190694,0.145987,0.15592,0.119611,0.110947,0.14637,0.101633,0.068443
min,0.045279,0.023352,0.044561,0.0,0.0,0.0,0.034682,0.016544,0.031858
25%,0.381774,0.153717,0.222307,0.137255,0.059302,0.083195,0.184733,0.091275,0.133605
50%,0.546886,0.295104,0.329504,0.218573,0.110636,0.129293,0.287166,0.151645,0.174112
75%,0.701848,0.477527,0.43118,0.335089,0.192491,0.185822,0.403646,0.227327,0.209632
max,0.913043,0.829146,0.798742,0.763819,0.730479,0.71811,0.754386,0.578125,0.508511


Unnamed: 0,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 fmeasure,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 fmeasure,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L Fmeasure
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,0.518173,0.326805,0.332185,0.238858,0.144729,0.148949,0.300444,0.176558,0.180393
std,0.192502,0.193153,0.141879,0.146191,0.12128,0.107066,0.145379,0.106412,0.074181
min,0.045279,0.029228,0.056112,0.0,0.0,0.0,0.034682,0.020367,0.038797
25%,0.372996,0.163346,0.231501,0.134774,0.060926,0.083497,0.185258,0.099944,0.135668
50%,0.533279,0.307157,0.333333,0.209139,0.114815,0.131477,0.281622,0.157053,0.178025
75%,0.67768,0.481385,0.433341,0.318411,0.189126,0.181793,0.400679,0.239167,0.211794
max,0.891304,0.821549,0.818792,0.725293,0.730185,0.727731,0.754386,0.572391,0.57047
