# Importações

In [1]:
import os
from getpass import getpass
import torch
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import SDKError
from unstructured.staging.base import dict_to_elements
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from huggingface_hub.hf_api import HfFolder
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

# Usando Ferramenta de Tratamento de Dados Não Estruturados

In [2]:
os.environ["UNSTRUCTURED_API_KEY"] = getpass("Cole sua api aqui: ")

Cole sua api aqui:  ········


In [3]:
unstructured_api_key = os.environ.get("UNSTRUCTURED_API_KEY")

In [4]:
client = UnstructuredClient(api_key_auth= unstructured_api_key, server_url="https://api.unstructuredapp.io")

## Extraindo dados de Textos de Arquivos PDF

In [5]:
path_file = "ArticleDSA.pdf"

In [6]:
with open(path_file, "rb") as f:
    files = shared.Files(content=f.read(), file_name=path_file)
    req = shared.PartitionParameters(files = files, chunking_strategy = "by_title", max_characters = 512)
    partition_request = operations.PartitionRequest(partition_parameters=req)

    try:
        resp = client.general.partition(request=partition_request)
    except SDKError as e:
        print(e)

INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


In [7]:
elements = dict_to_elements(resp.elements)

In [8]:
elements

[<unstructured.documents.elements.CompositeElement at 0x7dc94bfe0670>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfe0820>,
 <unstructured.documents.elements.CompositeElement at 0x7dca78390550>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfaf8b0>,
 <unstructured.documents.elements.CompositeElement at 0x7dca78393790>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfe9570>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfea4a0>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfea680>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfea860>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfeaa40>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfeaec0>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfeaf80>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfeb160>,
 <unstructured.documents.elements.CompositeElement at 0x7dc94bfeb340>,
 <unst

## Carregando os Vetores dos Dados de Texto no Banco de Dados Vetorial

In [9]:
documents = []

In [10]:
for element in elements:
    metadados = element.metadata.to_dict()
    documents.append(Document(page_content= element.text, metadata= metadados))

In [11]:
metadados

{'filetype': 'application/pdf',
 'languages': ['eng'],
 'page_number': 2,
 'orig_elements': 'eJy1k09r3DAQxb+K0TkSlqx/7i2lPaYEtrdtWGRpdlfFllxb28029LtXshsKZSk0JEf9Rk8z8x7aPiHoYYCQdt6hdxVqlTRGNwrLVnWYOy6wpkrjjjqh9wJo2zl0U6EBknEmmax5QjbGyflgEszLuTeXeEq7I/jDMWXCmqbNmt/47F06Zkql4JmO0YdUdNstFYzki1RSTtqHm+oPEILIBVDdEHadLKKM0HyZEwxlm3v/CP1mNBbQz1xwkMAmH8PO9maed+MUu3ytJlqxMuDe95AuIyzS+zu0zBwOJ3NYFtsiCAdUOoyZ7MJp6GAq65W3EzyWVdGXE6up3cBXU72PQwVDdTtOEBxMS8WR6s6EBOFo8AylPEQXKxvDnAqvHFRmEfgfxkVSZnge6ZOZJpP8d/hceuWmf4e3V5JrJjusrNljLh3HXcMkFk4rKZmley3eNDxectAt4c/hFaAatqbZNILQa2CVvDA6UevXyu7jt5Mfofqwuf0v2xnjlqnW4TwJYK44w7ruNLZNx6hpudAAb2g7J7KYKhWpV9tXoGu9/hBJBdHXwCp5oe264fyVbD+fz6T4MlsPwYKxxsFwITYOpJv+kcTDL+B/aJA=',
 'filename': 'ArticleDSA.pdf'}

In [12]:
documents

[Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'orig_elements': 'eJy1Uk1v2zAM/SuEzrFhx9+7FduA5dCtwIpdsiKgLcbRIEueJGctiv73UU5XDENPw3bTe+SjSD7uHwVpmsiEg5LiDYgcOzw2sk6akrKkxIGSvq3LpMt6KZtWdtWxERsQEwWUGJA1j2Kw1kllMJBfscYHu4TDidR4Csxsi6JjzTP9Q8lwYjavq5LZ2SoTom6/z6sy3W4gr6q0uNvAC+6atF1xluWvE6uCGeEffKApTnKj7kl/nnkC8cQBSYGGoKw5DBq9P8zO9pyWpVVdNVxCHJWm8DDTqr25FmvDZlxwXKfaCzKjiF/MzBzMMvXk4hSxeKD7OKe4gg/YK60kSoJrVB5202xdQBMIDMJ7hyARdgy1Gr8uWUZoBoVw5YI6Kn7p+O+vNm5V0Gv3f7pUF0PedP0xwaLaJuV2wKSlvk3yruiGsqio6vv/6FKVNhtOz9P84tKKi7J9dq2smzR/hbgo/s6lrsjb+p+5NKORNPHepYW3n77s3vHmgHvQ5OwCFpwKk41BSZ7M2eqzittnrEYVUANNECzHLUyLkXYD31Yzc/i+EIQlMizlI3C0GBVjx4o8YFjTOhis8YsO6GG6MFIN6DfANxAbMFoZSmHnvYXZchlv0bGIm0I9WvDLTI4jXgV1tunvN/MRnUNm6TZO+3T3EyxqMkw=', 'filename': 'ArticleDSA.pdf'}, page_content='A Habilidade Mais Importante na Era da Inteligência Artificial\n\nA pandemia do COVID-19 acelerou o ritmo do desenvolvimento digital em todo o mundo, já que tudo, desde reuniõ

In [13]:
vector_db = FAISS.from_documents(documents, HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-m3"))

INFO: Use pytorch device_name: cuda
INFO: Load pretrained SentenceTransformer: BAAI/bge-m3


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

INFO: Loading faiss with AVX2 support.
INFO: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
INFO: Loading faiss.
INFO: Successfully loaded faiss.


In [14]:
type(vector_db)

langchain_community.vectorstores.faiss.FAISS

In [15]:
print(vector_db.__doc__)

FAISS vector store integration.

    See [The FAISS Library](https://arxiv.org/pdf/2401.08281) paper.

    Setup:
        Install ``langchain_community`` and ``faiss-cpu`` python packages.

        .. code-block:: bash

            pip install -qU langchain_community faiss-cpu

    Key init args — indexing params:
        embedding_function: Embeddings
            Embedding function to use.

    Key init args — client params:
        index: Any
            FAISS index to use.
        docstore: Docstore
            Docstore to use.
        index_to_docstore_id: Dict[int, str]
            Mapping of index to docstore id.

    Instantiate:
        .. code-block:: python

            import faiss
            from langchain_community.vectorstores import FAISS
            from langchain_community.docstore.in_memory import InMemoryDocstore
            from langchain_openai import OpenAIEmbeddings

            index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query("hello world")))

     

In [16]:
retriver = vector_db.as_retriever(search_type = 'similarity', search_kwargs = {'k': 4})

## Carregando o LLM Open-Source com Processo de Quantização

In [17]:
hf_api = getpass("Cole sua api aqui: ")

Cole sua api aqui:  ········


In [18]:
HfFolder.save_token(hf_api)

In [19]:
nome_llm = "meta-llama/Meta-Llama-3-8B-Instruct"

In [20]:
bnb_config = BitsAndBytesConfig(load_in_4bit= True,
                                bnb_4bit_use_double_quant= True,
                                bnb_4bit_quant_type= 'nf4',
                                bnb_4bit_compute_dtype= torch.bfloat16)

In [21]:
model = AutoModelForCausalLM.from_pretrained(nome_llm, quantization_config = bnb_config)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [22]:
tokenizador = AutoTokenizer.from_pretrained(nome_llm)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

## Preparando o Pipeline

In [23]:
tokens_terminators = [tokenizador.eos_token_id, tokenizador.convert_tokens_to_ids('<|eot_id|>')]

In [25]:
text_generation_pipeline = pipeline(model = model,
                                   tokenizer = tokenizador,
                                   task = 'text-generation',
                                   temperature = 0.2,
                                   do_sample = True,
                                   repetition_penalty = 1.1,
                                   return_full_text = False,
                                   max_new_tokens = 200,
                                   eos_token_id = tokens_terminators)

In [26]:
llm = HuggingFacePipeline(pipeline = text_generation_pipeline)

## Definindo o Prompt Template com LangChain

Formato do prompt:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ model_answer_1 }}<|eot_id|>

In [27]:
prompt = """
<|start_header_id|>user<|end_header_id|>
Você é um assistente para tirar dúvidas sobre Inteligência Artificial.
Você recebe as partes extraídas de um documento longo e uma pergunta. Forneça uma resposta coloquial.
Se você não souber a resposta, basta dizer “Não sei”. Não invente uma resposta. Responda em português do Brasil.
Pergunta: {question}
Contexto: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

In [29]:
prompt = PromptTemplate(input_variables = ['context', 'question'], template = prompt)

In [30]:
llm_chain = prompt | llm | StrOutputParser()

In [31]:
chain = {'context': retriver, 'question': RunnablePassthrough()} | llm_chain

## Usando o LLM para Responder Perguntas Usando o VectorDB como Fonte de Consulta

In [32]:
pergunta1 = 'O que a pandemia do Covid-19 gerou no desenvolvimento digital?'

In [33]:
chain.invoke(pergunta1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'A pandemia do Covid-19 gerou um impulso significativo no desenvolvimento digital em todo o mundo. Com a necessidade de mudanças rápidas em como as pessoas trabalham, se comunicam e se conectam, muitas empresas e indivíduos foram obrigados a adaptar-se às tecnologias digitais para sobreviver.\n\nIsso incluiu a expansão do uso de ferramentas de colaboração remota, como videoconferências e aplicativos de comunicação instantânea, bem como a implementação de soluções de trabalho em home office. Além disso, a pandemia também acelerou a adoção de tecnologias emergentes, como inteligência artificial, machine learning e blockchain, para melhorar a eficiência e reduzir custos.\n\nNo entanto, essa rápida transição digital também trouxe desafios, como a falta de habilidades digitais em muitos'

In [34]:
pergunta2 = 'Quanto as principais economias do mundo podem perder até 2028 em crescimento potencial?'

In [35]:
chain.invoke(pergunta2)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'Não sei. A informação fornecida não especifica quantos trilhões de dólares as principais economias do mundo podem perder em crescimento potencial até 2028. No entanto, a fonte menciona que a empresa global de consultoria e serviços profissionais Accenture estima que as principais economias do mundo podem perder US$ 11,5 trilhões em crescimento potencial até 2028 se não conseguirem preencher a lacuna de habilidades.'