In [1]:
# https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630

import pandas as pd
import numpy as np
import os
import tensorflow_hub as hub
import tensorflow_text as text 
import tensorflow as tf

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances



from tqdm import tqdm

%matplotlib inline


# Load the comments CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/transformed/comments_clean_sem_pequenos_e_grande.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marccost\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# https://www.youtube.com/watch?v=7kLi8u2dJz0
# Loading the models through a local cache: TFHUB_CACHE_DIR is set before the
# layers are created so TF Hub uses ../data/model/ as its cache directory.
# (The notebook also kept an "online" variant: the same two KerasLayer calls
# below, without setting TFHUB_CACHE_DIR first.)
os.environ["TFHUB_CACHE_DIR"] = "../data/model/"

# Preprocessing layer and encoder layer, both referenced by their tfhub.dev handles.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")


In [3]:
# Select the comment(s) whose word count equals the column maximum and run
# them through the BERT preprocessing layer as a smoke test.
text_test = df.loc[
    df["clean_comment_word_count"] == df["clean_comment_word_count"].max(),
    "comment_clean",
]
text_preprocessed = bert_preprocess(text_test)


In [4]:
# Encode the preprocessed batch, then print a summary of the preprocessing
# output and the keys of the encoder output.
bert_results = bert_encoder(text_preprocessed)

print(f"Keys       : {list(text_preprocessed.keys())}")
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
# First 12 positions of the first example, for each preprocessed tensor.
for label, field in (
    ("Word Ids   ", "input_word_ids"),
    ("Input Mask ", "input_mask"),
    ("Type Ids   ", "input_type_ids"),
):
    print(f"{label}: {text_preprocessed[field][0, :12]}")
print(f"Bert Results   : {bert_results.keys()}")


Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (5, 128)
Word Ids   : [  101 31266 32965 29514 23601 10280 10173 73657 14356 13000 68868 12772]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]
Bert Results   : dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])


In [5]:
def prepara_texto(texto):
    """Return the BERT pooled output for ``texto``.

    ``texto`` is passed to ``bert_preprocess`` and the result is passed to
    ``bert_encoder``. The value stored under ``"pooled_output"`` in the
    encoder's output is returned; because ``.get`` is used, ``None`` is
    returned if that key is absent.
    """
    encoder_outputs = bert_encoder(bert_preprocess(texto))
    return encoder_outputs.get("pooled_output")


In [None]:
# Build the DataFrame of pooled BERT outputs for a sample of comments.
# Skipped entirely when both cached CSV files already exist.
if not (
    os.path.exists("../data/tmp/tmp120000.csv")
    and os.path.exists("../data/tmp/sample120000.csv")
):
    NUM_CASES = 120000
    MAX_CASES_LOOP = 400
    # dropna() runs after sampling, so sample_df may hold fewer than
    # NUM_CASES rows.
    sample_df = df.sample(NUM_CASES, random_state=1).dropna()
    comments = sample_df["comment_clean"].values

    # Encode in chunks of at most MAX_CASES_LOOP comments. The chunk starts
    # are derived from the actual number of comments, so no empty chunk is
    # sent to the model when dropna() removed rows, and a final partial chunk
    # is still processed.
    batches = []
    for start in tqdm(range(0, len(comments), MAX_CASES_LOOP)):
        pooled = prepara_texto(comments[start : start + MAX_CASES_LOOP])
        batches.append(pd.DataFrame(pooled.numpy()))

    # Concatenate once instead of growing the frame inside the loop.
    # pd.concat raises on an empty list, so fall back to an empty frame.
    # The index is not reset: it restarts at 0 for every chunk, as before.
    tmp_df = pd.concat(batches) if batches else pd.DataFrame()


In [7]:
# Write the pooled outputs and the sampled rows to CSV unless both cache
# files are already present on disk.
if not all(
    os.path.exists(path)
    for path in ("../data/tmp/tmp120000.csv", "../data/tmp/sample120000.csv")
):
    tmp_df.to_csv("../data/tmp/tmp120000.csv", index=False)
    sample_df.to_csv("../data/tmp/sample120000.csv", index=False)
