In [1]:
# # if you get an error when connecting to pinecone, uncomment this block and run it first
# !pip install -qU \
#     openai==0.27.7 \
#     "pinecone-client[grpc]"==2.2.1 \
#     datasets==2.12.0 \
#     tqdm

In [2]:
# set pdf path and define query
# NOTE(review): pdf_path is an absolute path on one user's machine; point it at
# your own copy of the document before running this notebook.
pdf_path = "/Users/seeker/Desktop/Axé Engineering/sample documents/APPLE_US_TERMS_COND-0056.pdf"
# natural-language question that is later passed to retrieve()
query = "What is the supplier code of conduct?"

In [3]:
import os
import dotenv
import openai

# authenticate with openai
# Load ./.env.local, then read the OpenAI key from the "gpt_api_secret"
# environment variable; the lookup raises KeyError if that variable is unset.
dotenv.load_dotenv(dotenv_path="./.env.local")
openai.api_key = os.environ["gpt_api_secret"]

# openai.Engine.list()  # check we have authenticated

In [4]:
import pinecone

# initialize connection to pinecone
# The API key comes from the "pinecone_api_key" environment variable (KeyError
# if unset); the environment name is fixed to the value below.
api_key = os.environ["pinecone_api_key"]
env = "us-west1-gcp-free"

pinecone.init(api_key=api_key, environment=env)
# Displayed as the cell output to confirm the connection works.
# NOTE(review): the rendered output contains the account's username and project
# name; clear it before sharing the notebook.
pinecone.whoami()

  from tqdm.autonotebook import tqdm


WhoAmIResponse(username='b42c5d9', user_label='default', projectname='4796d1f')

In [5]:
# write a function to query the engine
def complete(prompt, engine='text-davinci-003', max_tokens=400):
    """Send ``prompt`` to an OpenAI completion model and return its answer.

    Args:
        prompt: Full prompt text to complete (for example the string built by
            ``retrieve``).
        engine: Name of the completion model to query. Defaults to
            ``'text-davinci-003'``.
        max_tokens: Upper bound on the number of tokens generated. Defaults
            to 400.

    Returns:
        The text of the first returned choice, with leading and trailing
        whitespace stripped.
    """
    res = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        # temperature=0 / top_p=1 with no penalties: ask for the least-random
        # completion so repeated runs of the notebook stay comparable
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

def retrieve(query, top_k=1):
    """Build a context-augmented prompt for ``query`` from the Pinecone index.

    The query is embedded with OpenAI, the nearest stored vectors are looked
    up, and the ``text`` metadata of the matches is placed in front of the
    question.

    This function reads two module-level names that are assigned in other
    cells and must exist before it is called: ``embed_model`` (the embedding
    model name) and ``index`` (the connected Pinecone index).

    Args:
        query: The user's question.
        top_k: Number of nearest matches to include as context. Defaults to 1.

    Returns:
        The prompt string: instruction header, retrieved context, then the
        question followed by ``Answer:``.

    Raises:
        IndexError: If the index returns no matches (for example, nothing has
            been upserted yet).
    """
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(xq, top_k=top_k, include_metadata=True)
    matches = res['matches']
    if not matches:
        # Same exception type the bare [0] lookup would raise, but with a
        # message that says what actually went wrong.
        raise IndexError(
            "no matches returned from the index; upsert document embeddings "
            "before calling retrieve()"
        )
    # With top_k=1 this is exactly the single match's text; with more matches
    # the passages are separated so the model can tell them apart.
    context = "\n\n---\n\n".join(
        match['metadata']['text'] for match in matches
    )

    # build our prompt with the retrieved context
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\n\nAnswer:"
    )
    prompt = prompt_start + context + prompt_end
    return prompt

# function to extract text from pdf
import PyPDF2
import nltk

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Each page's extracted text is concatenated in page order, with no
    separator inserted between pages.
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # pull the text out of each page while the file handle is still open
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

In [6]:
# extract text from pdf
text = extract_text_from_pdf(pdf_path)

## convert text to knowledge base
# split the text into windows of up to 10000 characters; consecutive windows
# start 5000 characters apart, so neighbouring chunks overlap by 5000 characters
chunk_size = 10000
overlap_size = 5000
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size - overlap_size)
]

# embed every chunk in a single request; `res` is reused by the cells that
# create the index and upsert the vectors
embed_model = "text-embedding-ada-002"
res = openai.Embedding.create(engine=embed_model, input=chunks)

In [7]:
# create index
index_name = "regqa"

# check if index already exists, so re-running this cell does not try to
# create it a second time
if index_name not in pinecone.list_indexes():
    # if it does not exist, create the index; its dimension is taken from the
    # length of the first embedding in `res` (computed in the embedding cell)
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine' # optional
        # metadata_config={}
    )
# connect to index; `index` is the module-level handle that retrieve() queries
index = pinecone.GRPCIndex(index_name)
# view index stats (last expression, so the result is shown as the cell output)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [8]:
# build one (id, embedding, metadata) tuple per embedded chunk; the metadata
# keeps the chunk's text so it can be read back from query results
to_upsert = [
    (f"id{pos}", item['embedding'], {"text": chunks[pos]})
    for pos, item in enumerate(res['data'])
]

# upsert embeddings
index.upsert(vectors=to_upsert)

upserted_count: 7

In [9]:
# build the context-augmented prompt for `query`, then ask the completion
# model to answer it; the answer string is shown as the cell output
query_with_context = retrieve(query)
complete(query_with_context)

'The Apple Supplier Code of Conduct is a set of standards that suppliers must adhere to in order to do business with Apple. It covers topics such as labor and employment, anti-discrimination and anti-harassment, freedom of association, environmental protection, hazardous substances management, pollution prevention and resource sustainability, waste management, recycling, protection of intellectual property, and anti-corruption.'

In [10]:
# delete index
# Clean-up step: removes the "regqa" index created earlier, including every
# vector upserted into it. Re-run the index-creation and upsert cells before
# querying again.
pinecone.delete_index(index_name)