In [10]:
import os
import re
import nltk
import numpy as np
import json
import csv
import random
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paoebom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Run configuration.
#   TOTAL = True  -> process every document listed in not_prune.csv.
#   TOTAL = False -> process a random subset of at most SAMPLE_SIZE of them.
TOTAL = True
SAMPLE_SIZE = 10000

# Only the first CSV row is used; its fields become the list of document names.
# newline="" is the mode the csv module documents for files handed to csv.reader.
with open("../pre/global/not_prune.csv", "r", newline="") as f:
    not_prune = list(csv.reader(f, delimiter=","))[0]

directory = "../data"

if TOTAL:
    data_list = not_prune
else:
    # random.sample raises ValueError when k exceeds the population size,
    # so the requested size is capped at the number of available documents.
    data_list = random.sample(not_prune, k=min(SAMPLE_SIZE, len(not_prune)))
    # Earlier alternative, kept for reference: use the files of a sample directory.
    # directory = "../pre/sample"
    # data_list = os.listdir(directory)
    # data_list = [data for data in data_list if data in not_prune]

In [12]:
# Embedding backend switch:
#   True  -> a SentenceTransformer checkpoint loaded on the CUDA device.
#   False -> the Hugging Face BERT model together with its tokenizer.
USE_SENTENCE_TRANSFORMERS = True

if not USE_SENTENCE_TRANSFORMERS:
    model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
    tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
else:
    # Alternative checkpoint, currently disabled:
    # model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device='cuda')
    model = SentenceTransformer('ulysses-camara/legal-bert-pt-br', device='cuda')

In [None]:
""" with open("../pre/zipf/stopwords-0.10-0.50.csv", "r") as f:
    context_stopwords = csv.reader(f)
    context_stopwords = list(context_stopwords)
    context_stopwords = [words[0] for words in context_stopwords] """

# Builds doc_embeddings: document name -> embedding as a plain list of floats.
#
# The optional text clean-up is disabled. It used to live in non-raw
# triple-quoted strings, where the "\w" of the regex is an invalid escape
# sequence; it is kept below as real comments instead.
# stop_words = set(stopwords.words('portuguese') + context_stopwords)


def _read_lowercased(path):
    """Return the whole content of the file at `path`, lower-cased."""
    with open(path, 'r') as f:
        text = f.read()
    text = text.lower()
    # Disabled clean-up (needs stop_words, see the top of this cell):
    # text = re.sub(r'[^\w0-9- ]+', '', text, flags=re.UNICODE)
    # text = [x.strip() for x in text.split() if len(x) > 0 and x not in stop_words]
    # text = ' '.join(text)
    return text


def _embed_sentence_transformer(text):
    """Encode `text` with the global SentenceTransformer `model` as a list of floats."""
    return list(map(float, model.encode(text, device='cuda')))


def _embed_mean_pooled_bert(text):
    """Encode `text` as the mean of the global BERT `model` outputs over token positions."""
    # Input is truncated to at most 512 token ids.
    input_ids = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outs = model(input_ids)
        # First sequence of the first output, without its first and last positions
        # (presumably the special start/end tokens added by the tokenizer — TODO confirm).
        encoded = outs[0][0, 1:-1]
    mean_pool_encoded = np.array(encoded).mean(axis=0)
    return list(map(float, mean_pool_encoded))


# Pick the backend once instead of duplicating the whole loop per backend.
_embed_text = _embed_sentence_transformer if USE_SENTENCE_TRANSFORMERS else _embed_mean_pooled_bert

doc_embeddings = {}
for file in tqdm(data_list, total=len(data_list)):
    # The file is closed before the (slow) model call runs.
    doc_embeddings[file] = _embed_text(_read_lowercased(f'{directory}/{file}'))

In [None]:
# Write the embeddings to disk; the target file depends on whether the full
# document list or only a random sample was processed.
with open(
    "../pre/global/full_embed.json" if TOTAL else "../pre/global/partial_embed_no_clean.json",
    'w',
) as f:
    json.dump(doc_embeddings, f)

In [None]:
def get_similar_docs(document_file:str, embed_json_path:str="../pre/global/full_embed.json", max_recs:int|str=5, simplified:bool=False) -> list|int:

    if max_recs is not int and max_recs != 'MAX':
        print(f'Invalid value of maximum recommendations! It must  be either \'MAX\' or an interger. Current value = {max_recs}')
        return -1
    
    with open(embed_json_path, 'r') as f:
        embed_dict = json.load(f)

    if simplified:
        doc_embeds = np.array(embed_dict[document_file])
    else: 
        # stop_words = set(stopwords.words('portuguese'))
        with open(document_file, 'r') as f:
            text = f.read()
            text = text.lower()
            """ text = re.sub(r'[^\w0-9- ]+', '', text, flags=re.UNICODE)
            text = [x.strip() for x in text.split() if len(x) > 0 and x not in stop_words]
            text = ' '.join(text) """

            doc_embeds = np.array(model.encode(text, device='cuda'))
    
    
    results = []
    for file in embed_dict.keys():
        embed_value = np.array(embed_dict[file])
        similarity = cosine_similarity(doc_embeds.reshape(1, -1), embed_value.reshape(1, -1))
        results.append((file, similarity[0][0]))


    ordered_results = sorted(results, reverse=True, key=lambda x:x[1])

    if simplified: ordered_results = ordered_results[1:]

    if max_recs is int and len(ordered_results) >= max_recs+1:
        return ordered_results[: max_recs]

    return ordered_results

  """ text = re.sub(r'[^\w0-9- ]+', '', text, flags=re.UNICODE)


In [15]:
# Similarity overview: for a random sample of documents, rank the stored
# embeddings against each one and average, over the sample, the highest,
# the lowest and the mean similarity found.
sample_num = 100
test_docs = random.sample(data_list, k=sample_num)

max_total, min_total, total = 0, 0, 0
for doc in tqdm(test_docs, total=len(test_docs)):
    results = get_similar_docs(
        doc,
        embed_json_path="../pre/global/full_embed.json",
        max_recs='MAX',
        simplified=True,
    )
    similarities = [value[1] for value in results]
    average = sum(similarities) / len(results)
    # The ranking is in descending order, so its two ends hold the extremes.
    max_total += results[0][1]
    min_total += results[-1][1]
    total += average

print()
print(f'Maxima geral de similaridade entre documentos: {max_total/sample_num}')
print(f'Minima geral de similaridade entre documentos: {min_total/sample_num}')
print(f'Média geral de similaridade entre documentos: {total/sample_num}')

100%|██████████| 100/100 [21:58<00:00, 13.19s/it]


Maxima geral de similaridade entre documentos: 0.8923135436801104
Minima geral de similaridade entre documentos: 0.027243755740902462
Média geral de similaridade entre documentos: 0.4762353500585687



