In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import numpy as np
import re,unicodedata,nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, TFBertForSequenceClassification 
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import keras
# import transformers
# import tensorflow_hub as hub
# from tqdm import tqdm
# import pickle
# from keras.models import Model
# import keras.backend as K
# from sklearn.metrics import confusion_matrix,f1_score,classification_report
# import matplotlib.pyplot as plt
# from keras.callbacks import ModelCheckpoint
# import itertools
# from keras.models import load_model
# from transformers import TrainingArguments, Trainer

# Make sure the NLTK stop-word corpus is available before it is read.
nltk.download('stopwords')
# Portuguese stop words as returned by NLTK. `stopwords` is the same corpus
# reader object as `nltk.corpus.stopwords`, imported at the top of the file.
stops_nltk = stopwords.words('portuguese')


In [None]:
# Ask TensorFlow which physical GPU devices it can see and report them.
gpus = tf.config.list_physical_devices("GPU")
if not gpus:
    print("Failed to detect a GPU.")
# The loop body never runs when the list is empty, so the two messages are
# mutually exclusive.
for gpu in gpus:
    print("Found a GPU with the name:", gpu)

In [None]:
def unicode_to_ascii(s):
    """Return ``s`` with its combining marks (accents) removed.

    The text is decomposed with NFD normalisation and every character whose
    Unicode category is ``Mn`` is dropped. Characters that have no
    decomposition are left untouched, so the result is not guaranteed to be
    pure ASCII despite the function's name.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def clean_stopwords_shortwords(w):
    """Remove Portuguese stop words and very short words from ``w``.

    Args:
        w: Text to filter. It is split on whitespace.

    Returns:
        The surviving words joined by single spaces. A word survives when it
        is longer than two characters and is not in NLTK's Portuguese
        stop-word list. Matching is exact, so the caller decides the casing
        and accent form of the words being compared.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every single word.
    stopword_set = set(stopwords.words('portuguese'))
    kept_words = [
        word for word in w.split()
        if len(word) > 2 and word not in stopword_set
    ]
    return " ".join(kept_words)

def preprocess_sentence(w):
    """Normalise one sentence before it is tokenised.

    Steps, in order: lower-case and strip the text, remove accents with
    ``unicode_to_ascii``, delete ``@mentions``, replace ``? . ! , ¿`` with a
    space, collapse runs of spaces/double quotes into one space, and replace
    every remaining run of non-letter characters with a space.

    Args:
        w: Raw sentence (``str``).

    Returns:
        The normalised sentence. It may still start or end with a space.
    """
    w = unicode_to_ascii(w.lower().strip())
    # Mentions must be removed before the letter-only filter below: that
    # filter turns '@' into a space, so running this substitution afterwards
    # (as the code did before) could never match anything.
    w = re.sub(r'@\w+', '', w)
    w = re.sub(r"([?.!,¿])", r" ", w)
    # The character class holds a double quote and a space.
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    # w=clean_stopwords_shortwords(w)
    return w

In [None]:
# Read the 'Planilha1' worksheet of the query workbook.
df = pd.read_excel("querys.xlsx", sheet_name='Planilha1')

# Keep only the question and intent columns and discard rows where either
# value is missing.
df_to_use = df[["Question", "NEW INTENT"]].dropna()

# Use the generic column names that the rest of the notebook works with.
column_names = {'NEW INTENT': 'label', 'Question': 'text'}
data = df_to_use.rename(columns=column_names)

In [None]:
SHUFFLE_SEED = 42  # Fixed seed so a kernel restart reproduces the same row order

data = data.dropna()                                  # Drop NaN values, if any
data = shuffle(data, random_state=SHUFFLE_SEED)       # Shuffle the dataset reproducibly
data = data.reset_index(drop=True)                    # Renumber rows 0..n-1 in their shuffled order
print('Available labels: ', data.label.unique())      # Print all the unique labels in the dataset

# preprocess_sentence calls str methods, so cast first: a spreadsheet cell
# holding a number would otherwise raise AttributeError.
data['text'] = data['text'].astype(str).map(preprocess_sentence)

# Integer class code per row. factorize(sort=True) numbers the sorted unique
# labels from 0; the +1 shifts the codes to 1..K, so code 0 is never used.
data['gt'] = pd.factorize(data['label'], sort=True)[0] + 1

In [None]:
# Model inputs (cleaned text) and integer targets.
sentences = data['text']
labels = data['gt']

# One output unit per distinct label plus one extra, because the codes in
# data['gt'] start at 1 and the largest code therefore equals the label count.
num_classes = len(data['label'].unique()) + 1

# Display both lengths as a quick sanity check.
(len(sentences), len(labels))

In [None]:
# Hugging Face Hub identifier of the checkpoint used for both the tokenizer
# and the classifier.
model_id = 'neuralmind/bert-base-portuguese-cased'

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sequence-classification model sized to `num_classes` output labels.
model = TFBertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_classes,
)

In [None]:
# Encode every sentence in a single batched tokenizer call.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` states explicitly that longer sentences are cut to 64
# tokens. Every row therefore has the same length and the arrays below are
# regular 2-D arrays.
bert_inp = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=64,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
)

input_ids = np.asarray(bert_inp['input_ids'])
attention_masks = np.asarray(bert_inp['attention_mask'])
labels = np.asarray(labels)

In [None]:
# Hold out 20% of the rows for validation. The three arrays are split with
# the same row permutation, so ids, labels and masks stay aligned.
# A fixed random_state keeps the validation rows the same across kernel
# restarts; otherwise the evaluation cell could score checkpoint weights on
# rows they were trained on.
(
    train_inp, val_inp,
    train_label, val_label,
    train_mask, val_mask,
) = train_test_split(
    input_ids,
    labels,
    attention_masks,
    test_size=0.2,
    random_state=42,
)

In [None]:
log_dir = 'tensorboard_data/tb_bert'
model_save_path = './models/bertimbau_model.h5'

# Save only the weights of the epoch with the lowest validation loss, and
# write TensorBoard logs. Both callbacks are taken from `tf.keras`, like the
# loss, metric and optimizer below; the standalone `keras` import can resolve
# to a different Keras version than `tf.keras`.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
]

# model.summary() prints the table itself and returns None, so it must not be
# passed to print() (that appended a stray "None" after the header).
print('\nBERTimbau Model')
model.summary()

# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Fine-tune for 4 epochs, one example per batch, validating on the held-out
# split after every epoch. The callbacks checkpoint the best weights and log
# to TensorBoard.
history = model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=1,
    epochs=4,
    validation_data=([val_inp, val_mask], val_label),
    callbacks=callbacks,
)

In [None]:
model_id = 'neuralmind/bert-base-portuguese-cased'
log_dir = 'tensorboard_data/tb_bert'
model_save_path = './models/bertimbau_model.h5'
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

# Rebuild the architecture with the same number of outputs the training model
# was created with (`num_classes`), instead of a hard-coded 12 that only
# matches one particular dataset, then load the checkpointed weights.
trained_model = TFBertForSequenceClassification.from_pretrained(model_id, num_labels=num_classes)
trained_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

preds = trained_model.predict([val_inp, val_mask], batch_size=1)
pred_labels = np.argmax(preds.logits, axis=1)
f1 = f1_score(val_label, pred_labels, average='micro')
print('F1 score', f1)

# Map each integer code in data['gt'] to its label name, ordered by code.
# data.label.unique() is in order of appearance, which does not match the
# codes, so using it as target_names mislabelled the report rows (or raised
# when the class counts differed).
code_to_label = data.drop_duplicates('gt').set_index('gt')['label'].sort_index()

print('Classification Report')
print(classification_report(
    val_label,
    pred_labels,
    labels=code_to_label.index.tolist(),
    target_names=[str(name) for name in code_to_label],
))

# NOTE(review): this message is printed after evaluation, not training.
print('Training and saving built model.....')

In [None]:
# Sample question used to try the trained classifier on a single input.
teste = "Qual total de departamentos"

In [None]:
# Encode the single test sentence the same way the training data was encoded.
# The training text went through preprocess_sentence, so the test sentence
# must too; otherwise the model sees casing and accents it was never trained
# on.
teste_clean = preprocess_sentence(teste)

# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
# `truncation=True` makes the 64-token cut-off explicit.
bert_inp = tokenizer(
    teste_clean,
    add_special_tokens=True,
    max_length=64,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
)

# Wrap in a list to get a batch of one. These names overwrite the training
# arrays of the same name.
input_ids = np.asarray([bert_inp['input_ids']])
attention_masks = np.asarray([bert_inp['attention_mask']])

In [None]:

# Run the reloaded model on the encoded test sentence (ids plus mask).
preds = trained_model.predict(
    [input_ids, attention_masks],
    batch_size=1,
)


In [None]:
# Display the raw (pre-softmax) scores returned by predict().
preds.logits

In [None]:
# Index of the highest logit in each row, i.e. the predicted class code.
class_scores = preds.logits
pred_labels = np.argmax(class_scores, axis=1)

In [None]:
# Display the predicted class indices computed by the argmax above.
pred_labels

In [None]:
# Lookup table from integer class code to label name: guia[code] is the label
# whose data['gt'] value equals `code`. Codes that no row uses (code 0 with
# the 1-based encoding) map to NaN.
# data.label.unique() is in order of appearance and has no slot for the
# largest code, so indexing it with predicted codes gave the wrong intent or
# raised IndexError.
guia = (
    data.drop_duplicates('gt')
    .set_index('gt')['label']
    .reindex(range(int(data['gt'].max()) + 1))
    .to_numpy()
)

In [None]:
# Index the label array with the predicted class indices to display the
# corresponding label name(s).
# NOTE(review): this is only correct if position i of `guia` holds the label
# whose code in data['gt'] is i; confirm against how `guia` is built.
guia[pred_labels]