# Etapa 1: Importação das bibliotecas

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

In [None]:
!pip install bert-for-tf2

In [None]:
!pip install sentencepiece

In [None]:
%tensorflow_version 2.x

In [None]:
import tensorflow as tf
tf.__version__

In [None]:
import tensorflow_hub as hub

In [None]:
from tensorflow.keras import layers
import bert

# Etapa 2: Carregamento da base de dados

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')

In [None]:
# Column names to assign to the CSV, which is loaded without a header row.
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']

In [None]:
# Load the tweets CSV from the mounted Drive into a DataFrame.
# header=None with names=cols: the first row is read as data and columns are named from `cols`.
# NOTE(review): encoding='latin-1' presumably because the file is not valid UTF-8 — confirm against the dataset.
data = pd.read_csv('/content/drive/MyDrive/ProjetoIA/training.1600000.processed.noemoticon.csv',
                   header = None,
                   names = cols,
                   engine = 'python',
                   encoding='latin-1')

In [None]:
# Display the (rows, columns) shape of the loaded DataFrame.
data.shape

In [None]:
# Display the first rows of the DataFrame.
data.head()

In [None]:
# Display the last rows of the DataFrame.
data.tail()

In [None]:
# Drop the four metadata columns in place, leaving only `sentiment` and `text`.
data.drop(['id', 'date', 'query', 'user'],
          axis=1,
          inplace=True)

# Etapa 3: Limpeza dos textos

In [None]:
def limpar_tweet(tweet):
  """Clean a raw tweet so that only ASCII letters and basic punctuation remain.

  Steps, in order: convert HTML markup to plain text, remove http(s) URLs,
  remove @mentions, replace every character other than ASCII letters and
  . ! ? ' with a space, then collapse runs of spaces into one.

  Args:
    tweet: Raw tweet text.

  Returns:
    The cleaned text. A single leading or trailing space may remain; the
    result is not stripped.
  """
  # Parse as HTML first so the text is in plain form before the regex passes.
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  # URLs are removed before mentions: a URL such as ".../@user/post" would
  # otherwise be cut in half by the mention pass and leave "/post" behind.
  # \S+ consumes the URL up to the next whitespace; a narrow character class
  # stops at '-', '_', '?', '=' and similar, leaving fragments in the text.
  tweet = re.sub(r"https?://\S+", ' ', tweet)
  # '_' is part of the handle: without it "@some_user" only loses "@some"
  # and "user" leaks into the cleaned text.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  tweet = re.sub(r" +", ' ', tweet)

  return tweet

In [None]:
# Take the tweet stored under index label 0 of the text column as a sample and display it.
teste = data['text'][0]
teste

In [None]:
# Run the cleaning function on the sample tweet and display the result.
result = limpar_tweet(teste)
result

In [None]:
# Apply limpar_tweet to every tweet in the text column.
data_limpo = [limpar_tweet(tweet) for tweet in data.text]

In [None]:
# Display the first four cleaned tweets.
data_limpo[0:4]

In [None]:
# Take the sentiment column as an array and recode the positive label 4 as 1.
# NOTE(review): `.values` may be a view of the DataFrame's data, in which case the
# in-place assignment also changes data.sentiment — confirm if `data` is reused later.
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

# Etapa 4: Tokenização

In [None]:
# Build a tokenizer from the vocabulary and casing setting exposed by the BERT hub module.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Load the BERT model from TF Hub with its weights frozen (trainable=False).
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=False)
# Path of the vocabulary file shipped with the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# Lower-casing flag stored in the hub module.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(sentence):
  """Tokenize `sentence` and wrap the tokens in "[CLS]" ... "[SEP]" markers.

  Args:
    sentence: Text to tokenize with the module-level `tokenizer`.

  Returns:
    A list of tokens starting with "[CLS]" and ending with "[SEP]".
  """
  wordpieces = tokenizer.tokenize(sentence)
  framed = ["[CLS]"]
  framed = framed + wordpieces
  return framed + ["[SEP]"]

In [None]:
# Tokenize every cleaned tweet, adding the [CLS]/[SEP] markers.
data_entradas = [encode_sentence(sentence) for sentence in data_limpo]

# Etapa 5: Criação da base de dados

In [None]:
def get_ids(tokens):
  """Convert a sequence of tokens to vocabulary ids.

  Args:
    tokens: Tokens as produced by the module-level `tokenizer`.

  Returns:
    Whatever `tokenizer.convert_tokens_to_ids` returns for `tokens`.
  """
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  return token_ids

In [None]:
def get_mask(tokens):
  """Build the attention mask for a token sequence.

  Args:
    tokens: Sequence of token strings.

  Returns:
    An integer NumPy array with 0 where the token is '[PAD]' and 1 elsewhere.
  """
  is_real_token = np.char.not_equal(tokens, '[PAD]')
  return is_real_token.astype(int)

In [None]:
def get_segments(tokens):
  """Assign a segment id (0 or 1) to every token.

  The id starts at 0 and flips after each "[SEP]" token; the "[SEP]" itself
  keeps the id of the segment it closes.

  Args:
    tokens: Sequence of token strings.

  Returns:
    A list of ints, one per token.
  """
  segment_ids = []
  separators_seen = 0
  for token in tokens:
    # Parity of the separators seen so far is the current segment.
    segment_ids.append(separators_seen % 2)
    if token == "[SEP]":
      separators_seen += 1
  return segment_ids

In [None]:
# Tokenize a sample sentence by hand, with [CLS]/[SEP] markers, and display the tokens.
my_sent = ["[CLS]"] + tokenizer.tokenize("Rose are red") + ['[SEP]']
my_sent

In [None]:
# Try the BERT layer on the sample sentence: ids, mask and segment ids are each
# cast to int32 and given a leading axis of size 1 before being passed in.
bert_layer([
    tf.expand_dims(tf.cast(get_ids(my_sent), tf.int32), 0),
    tf.expand_dims(tf.cast(get_mask(my_sent), tf.int32), 0),
    tf.expand_dims(tf.cast(get_segments(my_sent), tf.int32), 0),
])

In [None]:
# One [tokens, label, token_count] entry per tweet; labels are matched to tweets by position.
data_with_len = [[sent, data_labels[i], len(sent)] for i, sent in enumerate(data_entradas)]

In [None]:
# Shuffle the examples in place, then build ([ids, mask, segment_ids], label) pairs.
# NOTE(review): despite the name `sorted_all`, nothing in this cell sorts the data,
# and the stored token count (index 2) is not read here.
random.shuffle(data_with_len)
sorted_all = [([get_ids(sent_lab[0]),
                get_mask(sent_lab[0]),
                get_segments(sent_lab[0])],
                sent_lab[1])
              for sent_lab in data_with_len]

In [None]:
# Wrap the in-memory examples in a tf.data.Dataset so TensorFlow can consume them.
# output_types declares both members of each (inputs, label) pair as int32.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
# Number of examples per batch.
BATCH_SIZE = 32
# Batch the examples. padded_shapes=(3, None) keeps the three input rows and leaves
# the second axis free so each batch can be padded to a common length; labels are
# scalars. Padding value is 0 for the inputs.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((3,None), ()), padding_values=(0, 0))

In [None]:
# Display the first batch as a check.
next(iter(all_batched))

In [None]:
# Number of full batches in the whole dataset (integer division ignores a final partial batch).
NB_BATCHES = len(sorted_all) // BATCH_SIZE
NB_BATCHES

In [None]:
# Number of batches reserved for testing: one tenth of the full-batch count.
NB_BATCHES_TEST = NB_BATCHES // 10
NB_BATCHES_TEST

In [None]:
# Shuffle the batches. Dataset.shuffle returns a new dataset instead of shuffling
# in place, so the result has to be assigned back or the call has no effect.
# reshuffle_each_iteration=False keeps one fixed order across iterations, so a
# later take()/skip() split selects the same batches every epoch and test batches
# cannot drift into the training portion.
all_batched = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)

In [None]:
# The first NB_BATCHES_TEST batches form the test set.
test_dataset = all_batched.take(NB_BATCHES_TEST)
# Every batch after those forms the training set.
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Etapa 6: Criação do modelo

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
  """Text classifier: frozen BERT embeddings followed by parallel 1-D convolutions.

  The BERT output is passed through three Conv1D branches (kernel sizes 2, 3
  and 4), each reduced with global max pooling. The pooled vectors are
  concatenated and fed to a dense layer, dropout and a final output layer.
  """

  def __init__(self,
               nb_filters = 50,
               FFN_units=512,
               nb_classes = 2,
               dropout_rate=0.1,
               name="dcnn"):
    """Create the layers.

    Args:
      nb_filters: Number of filters in each convolution branch.
      FFN_units: Number of units in the hidden dense layer.
      nb_classes: Number of target classes. 2 gives a single sigmoid unit;
        any other value gives a softmax over `nb_classes` units.
      dropout_rate: Rate of the dropout layer applied after the dense layer.
      name: Name given to the Keras model.
    """
    super(DCNNBERTEmbedding, self).__init__(name=name)

    # BERT is used as a fixed feature extractor (trainable=False).
    self.bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1',
                                     trainable=False)

    # Convolution filters over windows of 2, 3 and 4 consecutive positions.
    self.bigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 2,
                                padding = 'valid',
                                activation = 'relu')

    self.trigram = layers.Conv1D(filters = nb_filters,
                                 kernel_size = 3,
                                 padding = 'valid',
                                 activation = 'relu')

    self.fourgram = layers.Conv1D(filters = nb_filters,
                                  kernel_size = 4,
                                  padding = 'valid',
                                  activation = 'relu')

    # Pooling layer, shared by the three branches.
    self.pool = layers.GlobalMaxPool1D()

    # Hidden dense layer.
    self.dense_1 = layers.Dense(units = FFN_units, activation = 'relu')

    # Dropout layer.
    self.dropout = layers.Dropout(rate = dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units = 1, activation = 'sigmoid')
    else:
      # Must be stored under the same attribute that call() reads; a
      # differently named attribute makes call() fail with AttributeError
      # for every multi-class model.
      self.last_dense = layers.Dense(units = nb_classes, activation = 'softmax')

  def embed_with_bert(self, all_tokens):
    """Run the BERT layer and return the second of its two outputs.

    Args:
      all_tokens: Rank-3 tensor whose second axis holds, in order, the token
        ids, the mask and the segment ids.

    Returns:
      The second output of the BERT layer.
      NOTE(review): presumably the per-token sequence output, since it is fed
      to Conv1D layers — confirm against the hub module's documentation.
    """
    _, emb = self.bert_layer([all_tokens[:,0,:],
                              all_tokens[:,1,:],
                              all_tokens[:,2,:]])
    return emb

  def call(self, inputs, training):
    """Forward pass.

    Args:
      inputs: Tensor in the layout expected by `embed_with_bert`.
      training: Passed to the dropout layer to select training or inference
        behaviour.

    Returns:
      The output of the final dense layer.
    """
    x = self.embed_with_bert(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    # Join the pooled features of the three branches along the feature axis.
    merged = tf.concat([x_1, x_2, x_3], axis = -1)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output


# Etapa 7: Treinamento do modelo

In [None]:
# Hyperparameters for the model and the training run.
NB_FILTERS = 100
# Number of neurons in the dense layer.
FFN_UNITS = 256
NB_CLASSES = 2
# Dropout rate (fraction of neurons zeroed).
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [None]:
# Instantiate the model with the hyperparameters defined above.
Dcnn = DCNNBERTEmbedding(nb_filters = NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [None]:
# Compile with a loss and metric that match the output layer: binary
# cross-entropy for the two-class case, sparse categorical otherwise.
if NB_CLASSES == 2:
  Dcnn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
else:
  # The Keras identifiers are spelled "sparse_..."; a misspelled name is not a
  # registered loss/metric and is rejected when the model is compiled.
  Dcnn.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['sparse_categorical_accuracy'])

In [None]:
# Directory on the mounted Drive where checkpoints are kept.
checkpoint_path = "/content/drive/MyDrive/embedding"

In [None]:
# Checkpoint object tracking the model under the key "Dcnn".
cktp = tf.train.Checkpoint(Dcnn=Dcnn)

In [None]:
# Manager that writes checkpoints to checkpoint_path, keeping at most 10 of them.
cktp_manager = tf.train.CheckpointManager(cktp, checkpoint_path, max_to_keep=10)

In [None]:
# If the manager reports an existing checkpoint, restore it before training.
if cktp_manager.latest_checkpoint:
  cktp.restore(cktp_manager.latest_checkpoint)
  print('Ultimo checkpoint resturado!')

In [None]:
class MyCustomCallBack(tf.keras.callbacks.Callback):
  """Keras callback that saves a checkpoint through `cktp_manager` after each epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Save a checkpoint and report where checkpoints are stored.

    Args:
      epoch: Index of the epoch that just ended (unused).
      logs: Metrics dict supplied by Keras (unused).
    """
    cktp_manager.save()
    # The message needs a "{}" placeholder; without one, format() silently
    # discards the path it is given.
    print("Checkpoint salvo em {}!".format(checkpoint_path))

In [None]:
# Train the model: NB_EPOCHS epochs of 100 steps each, saving a checkpoint
# through the callback at the end of every epoch.
history = Dcnn.fit(train_dataset,
                   epochs = NB_EPOCHS,
                   steps_per_epoch = 100,
                   callbacks=[MyCustomCallBack()])

# Etapa 8: Avaliação do modelo

In [None]:
# List the metric names recorded during training.
history.history.keys()

In [None]:
# Plot the recorded training loss, one point per epoch.
plt.plot(history.history['loss'])
plt.title('Loss progress')

In [None]:
# Plot the recorded training accuracy, one point per epoch.
plt.plot(history.history['accuracy'])
plt.title('Accuracy progress')

In [None]:
# Evaluate the trained model on the held-out test batches and print the result.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
  """Print the model output and the predicted sentiment for one sentence.

  The sentence is tokenized with `encode_sentence`, converted to ids, mask
  and segment ids, stacked into one int32 tensor with a leading batch axis of
  size 1, and passed to `Dcnn` with training=False.

  This assumes the binary model, whose output is a single sigmoid value.

  Args:
    sentence: Text to classify.

  Returns:
    None; the result is printed.
  """
  tokens = encode_sentence(sentence)

  input_ids = get_ids(tokens)
  input_mask = get_mask(tokens)
  segment_ids = get_segments(tokens)

  inputs = tf.stack(
     [
      tf.cast(input_ids, dtype=tf.int32),
      tf.cast(input_mask, dtype=tf.int32),
      tf.cast(segment_ids, dtype=tf.int32),
     ], axis=0)
  inputs = tf.expand_dims(inputs, 0)
  output = Dcnn(inputs, training=False)

  # Threshold the probability at 0.5. Flooring output*2 gives the same answer
  # below 1.0 but yields 2 when the sigmoid saturates to exactly 1.0, which
  # matches neither label and prints nothing.
  score = float(output[0, 0])
  label = "positive" if score >= 0.5 else "negative"

  print("Output of the model: {}\nPredicted sentiment: {}".format(output, label))

In [None]:
# Try the prediction helper on a sample sentence.
get_prediction("I beautiful girl!")