In [1]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
file_path = './casos.csv'
intermediate_file_path = './casos_with_embedding.csv'

In [3]:
main_dataframe = pd.read_csv(file_path, sep=';', header=0, index_col=False)

In [4]:
work_dataframe = main_dataframe.copy()

In [5]:
work_dataframe.drop_duplicates(subset='descricao', inplace=True)
work_dataframe.dropna(axis=0, subset='descricao', inplace=True)

In [6]:
work_dataframe.shape[0]

1198

In [7]:
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

In [8]:
embedding_inputs = list(work_dataframe['descricao'].values)

In [9]:
max(list(map(lambda x: len(x),embedding_inputs))) # Tamanho do maior texto

2687

In [12]:
inputs = tokenizer(embedding_inputs, padding="max_length", truncation=True, max_length = 100, return_tensors="pt")
simple_array = []
embeddings = []
with torch.no_grad():
    outs = model(**inputs)    
    print(outs.last_hidden_state.shape)    
    for i in range(0, len(outs.last_hidden_state)):
      encoded = outs.last_hidden_state[i, 1:-1]    
      embedding = (torch.mean(encoded, 0))      
      embeddings.append(embedding.tolist())
    work_dataframe['embedding'] = embeddings
      

torch.Size([1198, 100, 768])


In [13]:
work_dataframe['embedding']

0       [-0.11463958024978638, 0.10435016453266144, 0....
6       [-0.10816885530948639, -0.0386231355369091, 0....
13      [0.14696739614009857, 0.0133186811581254, 0.62...
14      [-0.14168746769428253, -0.001603385666385293, ...
25      [-0.11508359760046005, -0.15865623950958252, 0...
                              ...                        
1308    [-0.13071759045124054, -0.23188351094722748, 0...
1309    [-0.01808825321495533, -0.11963064968585968, 0...
1310    [-0.038996048271656036, -0.30379369854927063, ...
1312    [-0.20098061859607697, -0.25110238790512085, 0...
1313    [-0.23962900042533875, -0.21648769080638885, 0...
Name: embedding, Length: 1198, dtype: object

In [15]:
work_dataframe.to_pickle(intermediate_file_path)