# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers transformers_interpret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
from os import chdir
import pandas as pd

In [4]:
REPO_DIR = '/content/drive/MyDrive/pantanal.dev/artificial-intelligence'
chdir(REPO_DIR)

In [5]:
seed = 42
np.random.seed(seed)

In [6]:
train_df = pd.read_csv('datasets/train_df.csv', sep='|')
val_df = pd.read_csv('datasets/val_df.csv', sep='|')
test_df = pd.read_csv('datasets/test_df.csv', sep='|')

In [49]:
# Carregar o tokenizer e o modelo BERT
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/pantanal.dev/artificial-intelligence/trainings/bert-base-multilingual-cased-06/pruned/', output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/pantanal.dev/artificial-intelligence/trainings/bert-base-multilingual-cased-06/pruned')

# Classification result insights

In [9]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993243 sha256=8b0fe920126dbf29962b6453225f6ce45cd5a4a83282c0b50e37b0338d5747b9
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
if language == 'auto':
        language = detect(text)
        if language == 'en':
            language = 'english'
        else:
            language = 'portuguese'

In [10]:
import nltk
from nltk.corpus import stopwords
from langdetect import detect
nltk.download('stopwords')
nltk.download('punkt')

# Função para remover stopwords de um texto
def remove_stopwords(text: str, language: str = "auto") -> str:
    """
    Remove stopwords de um texto em português ou inglês.

    Args:
        text (str): O texto para remover as stopwords.
        language (str, optional): O idioma das stopwords. Por padrão, é utilizado "auto" para detectar o idioma automaticamente.

    Returns:
        str: O texto sem stopwords.

    Raises:
        ValueError: Se o idioma detectado automaticamente não for suportado pela aplicação.

    Exemplo de uso:
        >>> texto = "Este é um exemplo de texto em português que será processado para remoção de stopwords."
        >>> texto_sem_stopwords = remove_stopwords(texto, language="portuguese")
        >>> print(texto_sem_stopwords)
        "exemplo texto português processado remoção stopwords."
    """
    if language == 'auto':
        language = detect(text)
        if language == 'en':
            language = 'english'
        else:
            language = 'portuguese'

    words = nltk.word_tokenize(text)
    stopwords_list = set(stopwords.words(language))
    filtered_words = [word for word in words if word.lower() not in stopwords_list]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
from string import punctuation

def merge_subtokens(tokenizer, word_attributions: list[tuple]) -> list[tuple]:
    """
    Agrupa as pontuações de tokens divididos do BERT em um único token.
    
    Args:
        tokenizer: Tokenizer BERT para usar.
        word_attributions (List[tuple]): As atribuições de palavras do tokenizador.

    Returns:
        List[tuple]: Atribuições de palavras com tokens divididos mesclados.
    """
    merged_attributions = []
    merged_token = ''
    merged_value = 0.0

    for token, value in word_attributions:
        detokenized_token = tokenizer.convert_tokens_to_string([token]).strip()
        if detokenized_token:
            if detokenized_token in punctuation or detokenized_token in {'[CLS]', '[SEP]', '[UNK]'}:
                continue
            if token.startswith('##'):
                merged_token += token[2:]
                merged_value += value
            else:
                if merged_token:
                    merged_attributions.append((merged_token, merged_value))
                    merged_token = ''
                    merged_value = 0.0
                merged_token = detokenized_token
                merged_value = value
        else:
            merged_token += token.replace('##', '')
            merged_value += value

    if merged_token:
        merged_attributions.append((merged_token, merged_value))

    return merged_attributions

In [12]:
def format_attributions(word_attributions: list[tuple[str, float]]) -> list[tuple[str, str]]:
    """
    Formata as atribuições de palavras para uma representação mais intuitiva.

    Args:
        word_attributions: Uma lista de tuplas contendo as atribuições de palavras. Cada tupla contém uma palavra/token
            e um valor float representando sua importância.

    Returns:
        Uma lista de tuplas contendo as palavras formatadas e suas atribuições em formato de porcentagem, ordenadas por
        importância.
    """
    # Obter o valor total das atribuições
    total = sum(abs(score) for _, score in word_attributions)

    # Formatar cada atribuição como uma tupla (token, porcentagem)
    formatted_attributions = []
    for token, score in word_attributions:
        # Calcular a porcentagem e arredondar para duas casas decimais
        percentage = round((abs(score) / total) * 100, 2)
        formatted_attributions.append((token, percentage))

    # Ordenar as atribuições por porcentagem descendente
    formatted_attributions = sorted(formatted_attributions, key=lambda x: x[1], reverse=True)

    return formatted_attributions

In [50]:
from transformers_interpret import SequenceClassificationExplainer
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

In [51]:
%timeit SequenceClassificationExplainer(model, tokenizer)

202 µs ± 32.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [52]:
def classify_text(input_text):
    input_text = remove_stopwords(input_text)
    tokenized_input = tokenizer(input_text, truncation=True, max_length=100, return_tensors='pt')
    truncated_input_text = tokenizer.decode(tokenized_input['input_ids'][0])
    word_attributions = cls_explainer(truncated_input_text)
    word_attributions = merge_subtokens(tokenizer, word_attributions)
    word_attributions = sorted(word_attributions, key=lambda x: (-x[1], x[0]))
    word_attributions = format_attributions(word_attributions)

    return {
        'predicition_class_name': cls_explainer.predicted_class_name,
        'prediction_index': cls_explainer.predicted_class_index,
        'prediction_probatility': cls_explainer.pred_probs,
        'influential_words': word_attributions
    }

In [53]:
index = 8
text = test_df.loc[index, 'raw_text'], test_df.loc[index, 'label']
text

('O lucro operacional, excluindo itens não recorrentes, totalizou EUR 1,0 mn, abaixo dos EUR 1,6 mn.',
 0)

In [54]:
%timeit classify_text(text[0])

12.5 s ± 629 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
classify_text(text[0])

{'predicition_class_name': 'LABEL_0',
 'prediction_index': array(0),
 'prediction_probatility': tensor(0.7438),
 'influential_words': [('abaixo', 72.11),
  ('totalizou', 6.62),
  ('EUR', 5.61),
  ('EUR', 3.87),
  ('mn', 3.07),
  ('itens', 2.71),
  ('lucro', 1.74),
  ('0', 1.01),
  ('1', 0.8),
  ('1', 0.78),
  ('mn', 0.58),
  ('excluindo', 0.45),
  ('6', 0.27),
  ('recorrentes', 0.19),
  ('operacional', 0.19)]}