In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from the .env file located by find_dotenv() into the process
# environment. override=True is passed so values from the file take precedence
# over variables that are already set. Later cells read PINECONE_API_KEY and
# PINECONE_ENV from os.environ; presumably the OpenAI key is supplied the same
# way -- TODO confirm.
load_dotenv(find_dotenv(), override=True)

In [None]:
def load_document(file):
    """Load a PDF file with LangChain's PyPDFLoader.

    Args:
        file: Path of the PDF to load; it is passed unchanged to PyPDFLoader.

    Returns:
        Whatever ``PyPDFLoader(file).load()`` returns (the loaded documents).
    """
    # Imported lazily so the notebook cell can be defined without langchain
    # being importable until the function is actually called.
    from langchain.document_loaders import PyPDFLoader

    print(f'Carregando {file}')
    return PyPDFLoader(file).load()
# Load the CLT PDF into `data`.
# NOTE(review): a later cell repeats this exact call, so on Run All the PDF is
# loaded twice.
data = load_document('../dataset/CLT.pdf')

In [None]:
def chunk_data(data, chunk_size=1000, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        The result of ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [None]:
# Load the PDF and split it with chunk_data's default chunk_size (1000).
# `chunks` is also read as a notebook global by insert_embeddings further down.
data = load_document('../dataset/CLT.pdf')
chunks = chunk_data(data)

In [None]:
def embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0001):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick the
            tokenizer.
        usd_per_1k_tokens: Price in USD per 1000 tokens used for the estimate.
            The default is the value this notebook previously hard-coded;
            verify it against current pricing before relying on the figure.

    Returns:
        None. The totals are printed, not returned.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator instead of a temporary list: only the sum is needed.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Tokens Total: {total_tokens}')
    print(f'Embedding cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

In [None]:
# Print the token total and estimated embedding cost for the chunks produced
# above; this cell only counts tokens locally with tiktoken.
embedding_cost(chunks)

In [None]:
def insert_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is and nothing is inserted.
    Otherwise the index is created (dimension 1536, cosine metric) and
    ``documents`` are embedded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        documents: Documents to embed when the index has to be created. When
            omitted, the notebook-level ``chunks`` variable is used, which is
            what this function always did before the parameter existed.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.

    Raises:
        NameError: If the index must be created, ``documents`` is omitted and
            no global ``chunks`` exists. This is raised before the index is
            created, so no empty index is left behind.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name}')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
    else:
        # Resolve the documents BEFORE creating the index: otherwise a missing
        # `chunks` would leave an empty index that later runs treat as populated.
        if documents is None:
            documents = chunks
        print(f'Criando index {index_name}')
        # NOTE(review): 1536 presumably matches the default OpenAI embedding
        # size -- confirm if the embedding model is ever changed.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(documents, embeddings, index_name=index_name)
    print('OK')
    return vector_store
        

In [None]:
def delete_index(index_name='all'):
    """Delete one Pinecone index, or every index when called with 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all'
            deletes every index returned by ``pinecone.list_indexes()``.

    Returns:
        None.
    """
    import pinecone

    api_key = os.environ.get('PINECONE_API_KEY')
    environment = os.environ.get('PINECONE_ENV')
    pinecone.init(api_key=api_key, environment=environment)

    # Single-index case first, so the bulk path below reads straight through.
    if index_name != 'all':
        print(f'Deletando_indice {index_name}...')
        pinecone.delete_index(index_name)
        return

    existing = pinecone.list_indexes()
    print('Deletando todos os indices...')
    for name in existing:
        pinecone.delete_index(name)

In [None]:
# Name of the Pinecone index used by the rest of the notebook.
index_name = 'linuxtips'
# Opens the index if it already exists; otherwise creates it and embeds the
# notebook-level `chunks` into it.
vector_store = insert_embeddings(index_name)

In [None]:
def get_answer(vector_store, q):
    """Answer a question using documents retrieved from ``vector_store``.

    Builds a ``RetrievalQA`` chain (``chain_type='stuff'``) whose retriever
    does a similarity search returning the 3 best matches, then runs it on
    the question.

    Args:
        vector_store: Object exposing ``as_retriever(...)``, e.g. the store
            returned by ``insert_embeddings``.
        q: The question text.

    Returns:
        The value returned by ``chain.run(q)``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    # Fixed model name: it was misspelled as 'gtp-3.5-turbo'.
    # NOTE(review): temperature=1 is kept from the original; a lower value may
    # suit factual Q&A better -- confirm the intent.
    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

    answer = chain.run(q)

    # Fixed: the original returned the undefined name `asnwer` (NameError).
    return answer

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask a question through a ConversationalRetrievalChain, keeping history.

    Args:
        vector_store: Object exposing ``as_retriever(...)``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. A list passed in is appended to in place. When omitted, a
            fresh empty list is used for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (its
        ``'answer'`` entry is what gets recorded) and the history list
        including the new turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be one shared list reused by every call that omits
    # the argument, leaking history between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

In [None]:
# One-off question against the vector store (the text asks, in Portuguese,
# what the "thirteenth salary" is).
q = 'O que é o décimo terceiro salário?'
answer = get_answer(vector_store,q)
print(answer)

In [None]:
import time

# Interactive question loop: keeps prompting until the user types "sair".
i = 1
print('Digite sair para encerrar.')
while True:
    # Strip surrounding whitespace so inputs like "sair " or " Sair" still exit.
    q = input(f'Pergunta: #{i}: ').strip()
    if q.lower() == 'sair':
        print('Encerrando ...')
        time.sleep(2)
        break
    # Skip blank input rather than sending an empty question to the model.
    if not q:
        continue
    # Only questions that are actually asked advance the counter.
    i += 1
    answer = get_answer(vector_store, q)
    print(f'\nResposta: {answer} ')
    print(f'\n {"_" * 50} \n')

In [None]:
# Start a conversation with an empty history. ask_with_memory appends the
# (question, answer) pair to the list and returns it alongside the result.
chat_history = []
question = 'em que ano a clt foi criada?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])