In [4]:
!pip install transformers
!pip install docx2txt
!pip install rouge



In [20]:
from pprint import pprint

import docx2txt
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from nltk import tokenize
from rouge import Rouge
from sklearn.cluster import AffinityPropagation, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Funções

In [32]:
def tokenize(batch):
    """Run the module-level ``tokenizer`` over the ``'text'`` entry of ``batch``.

    Padding and truncation are both switched on, and whatever the tokenizer
    returns is handed back unchanged.

    NOTE(review): this definition rebinds the name ``tokenize`` that the
    import cell brings in with ``from nltk import tokenize``.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

def content_sentences(body):
    """Split ``body`` into sentences with NLTK's Portuguese sentence tokenizer.

    Args:
        body: Text to split.

    Returns:
        list: A new list with the sentences in the order the tokenizer
        produced them.
    """
    # Reach the splitter through the ``nltk`` package instead of the bare name
    # ``tokenize``: in this file ``def tokenize(batch)`` rebinds that name to a
    # function, so ``tokenize.sent_tokenize`` raises AttributeError.
    return list(nltk.sent_tokenize(body, language='portuguese'))

def tokenize_input(text):
    """Turn ``text`` into a tensor of token ids with a leading axis of size 1.

    The module-level ``tokenizer`` first splits the text into tokens and then
    maps those tokens to ids; the id list is wrapped in an outer list before
    being handed to ``torch.tensor``.

    NOTE(review): no special tokens are added and nothing is truncated here,
    as far as this code shows — confirm that is what the model expects.
    """
    pieces = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(pieces)
    batch_of_one = [ids]
    return torch.tensor(batch_of_one)

def extract_embeddings(text):
    """Return the last element of the model output for ``text``.

    The text is converted with ``tokenize_input`` and passed to the
    module-level ``model``; the final two entries of its output are unpacked
    and the last one (named ``pooled`` here) is returned.

    Args:
        text: Input string to embed.

    Returns:
        The last entry of ``model(tokens)``; presumably BERT's pooled output —
        verify against the model class in use.
    """
    tokens_tensor = tokenize_input(text)
    # This is inference only, so switch autograd off for the forward pass.
    # Otherwise the returned tensor keeps the whole computation graph alive.
    with torch.no_grad():
        _, pooled = model(tokens_tensor)[-2:]
    return pooled

## Classes

In [7]:
class ClusterFeatures(object):
    """Cluster feature vectors and pick the row closest to each cluster centre.

    Args:
        features: 2-D array-like with one feature vector per row.
        algorithm: ``'gmm'`` or ``'affinity'`` select those estimators; any
            other value (default ``'kmeans'``) uses k-means.
        pca_k: Number of PCA components the features are projected onto
            before clustering. A falsy value keeps ``features`` as given.
        random_state: Optional seed forwarded to PCA, KMeans and
            GaussianMixture so a run can be reproduced. ``None`` (default)
            leaves each estimator's own default in place.
    """

    def __init__(self, features, algorithm='kmeans', pca_k=2, random_state=None):
        if pca_k:
            reducer = PCA(n_components=pca_k, random_state=random_state)
            self.features = reducer.fit_transform(features)
        else:
            self.features = features
        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def __get_model(self, k):
        """Build an unfitted estimator for the configured algorithm."""
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        if self.algorithm == 'affinity':
            # ``k`` is not passed on: this estimator is built with its defaults.
            return AffinityPropagation()
        return KMeans(n_clusters=k, random_state=self.random_state)

    def __get_centroids(self, model):
        """Return the fitted cluster centres (``means_`` for 'gmm')."""
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def __find_closest_args(self, centroids):
        """Map each centroid position to the index of its nearest unused row.

        Distance is the sum of absolute coordinate differences. A row is
        assigned to at most one centroid; ties go to the lowest row index.
        Centroids left over once every row has been used are skipped, so
        ``-1`` is never stored as an index.

        Returns:
            dict: ``{centroid_position: row_index}`` with plain ``int`` values.
        """
        features = np.asarray(self.features)
        taken = np.zeros(len(features), dtype=bool)
        args = {}
        for j, centroid in enumerate(centroids):
            candidates = np.flatnonzero(~taken)
            if candidates.size == 0:
                # Every row is already assigned; nothing left to pick.
                break
            # Take the true minimum rather than comparing against a fixed
            # sentinel, so arbitrarily large distances still get a match.
            diffs = np.abs(features[candidates] - centroid)
            distances = diffs.reshape(candidates.size, -1).sum(axis=1)
            closest = int(candidates[np.argmin(distances)])
            taken[closest] = True
            args[j] = closest
        return args

    def cluster(self, ratio=0.1):
        """Cluster the features and return the rows nearest the centres.

        Args:
            ratio: Fraction of the rows to use as the number of clusters.
                The count is kept between 1 and the number of rows.

        Returns:
            list: Row indices, sorted in ascending order.
        """
        n_samples = len(self.features)
        # Clamp so a ratio above 1 cannot request more clusters than rows.
        k = max(1, min(n_samples, int(n_samples * ratio)))
        model = self.__get_model(k).fit(self.features)
        centroids = self.__get_centroids(model)
        cluster_args = self.__find_closest_args(centroids)
        return sorted(cluster_args.values())

    def create_plots(self, k=4, plot_location='./cool_model.png', title = ''):
        """Scatter-plot the 2-D features coloured by cluster and save the figure.

        Args:
            k: Number of clusters requested from the estimator.
            plot_location: Path the figure is written to.
            title: Plot title.

        Raises:
            RuntimeError: If the instance was not built with ``pca_k == 2``.
        """
        if self.pca_k != 2:
            raise RuntimeError("Must be dimension of 2")

        model = self.__get_model(k)
        model.fit(self.features)
        y = model.predict(self.features)
        plt.title(title)
        plt.scatter(self.features[:, 0], self.features[:, 1], c=y, s=50, cmap='viridis')
        # Use the shared accessor so 'gmm' (which exposes ``means_``) works too.
        centers = self.__get_centroids(model)
        plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
        plt.savefig(plot_location)

## Aplicação

In [8]:
# Mount Google Drive at /content/drive; the .docx inputs are read from under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Folder on the mounted Drive holding the source documents, and how many to read.
TEXTS_DIR = "/content/drive/My Drive/IC/textos_sumarios/textos"
N_TEXTS = 100
# Documents whose index is below this value get label 1; the rest get label 0.
N_LABEL_ONE = 50

# Maps document index -> {"label": 0 or 1, "text": extracted document text}.
texts = {}

for i in range(N_TEXTS):
    # Files are named "<index>texto_completo.docx".
    body = docx2txt.process("{0}/{1}texto_completo.docx".format(TEXTS_DIR, i))
    texts[i] = {"label": 1 if i < N_LABEL_ONE else 0, "text": body}

In [43]:
# One row per document, indexed by the integer keys of `texts`.
data = pd.DataFrame.from_dict(texts, orient='index', columns=["label", "text"])

# `.loc` slices are label-based and inclusive at both ends, so each split takes
# rows 0-24 / 25-49 from the first half and 50-74 / 75-end from the second.
# `pd.concat` replaces `DataFrame.append`, which pandas 2.0 removed.
df_train = pd.concat([data.loc[:24, :], data.loc[50:74, :]])

df_test = pd.concat([data.loc[25:49, :], data.loc[75:, :]])

# The two splits must partition the data with no row lost or duplicated.
assert len(df_train) + len(df_test) == len(data)

print("Tamanho treino: {}; tamanho teste: {}".format(len(df_train),len(df_test)))

df_train.head()

Tamanho treino: 50; tamanho teste: 50


Unnamed: 0,label,text
0,1,A SENHORA MINISTRA CÁRMEN LÚCIA - (Relatora): ...
1,1,O SENHOR MINISTRO DIAS TOFFOLI (RELATOR): Para...
2,1,A SENHORA MINISTRA CÁRMEN LÚCIA - (Relatora): ...
3,1,A Senhora Ministra Rosa Weber (Relatora): Cont...
4,1,A Senhora Ministra Rosa Weber (Relatora): Trat...


In [33]:
# initial_sentences = content_sentences(body)

# One checkpoint identifier for both objects, so the model and its tokenizer
# cannot drift apart when the checkpoint is changed.
MODEL_NAME = 'neuralmind/bert-large-portuguese-cased'

model = AutoModel.from_pretrained(MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=648.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1342014951.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=209528.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=155.0, style=ProgressStyle(description_…




In [47]:
# train_dataset = df_train.to_dataset()

# test_dataset = df_test.to_dataset()

# train_dataset.head()

In [48]:
# train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))

# test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset), language='portuguese')

# train_dataset.head()

In [None]:
# Ask both datasets for 'torch' format limited to the three listed columns.
# NOTE(review): `train_dataset` and `test_dataset` are only assigned in the
# commented-out cells above, so on a fresh kernel this cell would raise
# NameError unless they are defined somewhere not shown here — confirm.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
def compute_metrics(pred):
    """Summarise a prediction object as accuracy, F1, precision and recall.

    ``pred.label_ids`` supplies the reference labels and the predicted class
    is the argmax over the last axis of ``pred.predictions``. Precision,
    recall and F1 are requested with ``average='binary'``.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    # The fourth entry of the result is not reported.
    precision, recall, f1, _unused = scores
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Training configuration: one epoch, batch size 16 for training and 64 for
# evaluation, with outputs under ./results and logs under ./logs.
# NOTE(review): `evaluate_during_training` is, as far as I know, only accepted
# by older transformers releases, and the install cell does not pin a version —
# confirm against the installed release.
# NOTE(review): if `train_dataset` is built from the 50-row training split
# printed earlier, one epoch at batch size 16 is far fewer than 500 steps, so
# the warm-up would never finish — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

# Wire the model, arguments, metric callback and both datasets into a Trainer.
# NOTE(review): `model` is created with `AutoModel.from_pretrained` earlier in
# the notebook; presumably training against `label` needs a model with a
# classification head — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [None]:
# Start training with the `trainer` built in the previous cell.
trainer.train()

In [None]:
# Run evaluation; presumably this uses the `eval_dataset` given to the Trainer — confirm.
trainer.evaluate()

In [None]:
# Load the TensorBoard notebook extension and point it at the `logs` directory,
# which matches the `logging_dir='./logs'` passed to TrainingArguments.
%load_ext tensorboard
%tensorboard --logdir logs