In [39]:
import os
import dotenv
import openai

# Authenticate with OpenAI: load variables from the local dotenv file, then
# read the API key from the "gpt_api_secret" environment variable
# (os.environ[...] raises KeyError if that variable is not set).
dotenv.load_dotenv(dotenv_path="./.env.local")
openai.api_key = os.environ["gpt_api_secret"]

# Listing the available engines doubles as a check that the key is accepted.
openai.Engine.list()  # check we have authenticated

<OpenAIObject list at 0x17fe63530> JSON: {
  "data": [
    {
      "created": null,
      "id": "whisper-1",
      "object": "engine",
      "owner": "openai-internal",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-edit-001",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-similarity-babbage-001",
      "object": "engine",
      "owner": "

In [40]:
# write a function to query the engine
def complete(prompt, engine='text-davinci-003', max_tokens=400, temperature=0):
    """Send ``prompt`` to an OpenAI completion engine and return the reply text.

    Args:
        prompt: Full prompt string passed to the completion endpoint.
        engine: Id of the completion engine to query. Defaults to
            'text-davinci-003'.
        max_tokens: Upper bound on the number of generated tokens.
            Defaults to 400.
        temperature: Sampling temperature. Defaults to 0.

    Returns:
        The ``text`` of the first returned choice with leading and trailing
        whitespace stripped.
    """
    res = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    # Only the first choice is used; the response is indexed like a dict.
    return res['choices'][0]['text'].strip()



In [50]:
import PyPDF2
import nltk

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string: the per-page ``extract_text()`` results joined in
        page order with no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # Iterate the pages directly and join once instead of growing a
        # string with ``+=`` per page. ``or ""`` keeps the join safe should a
        # page yield None rather than a string. The join runs inside the
        # ``with`` block so the file is still open while pages are read.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

pdf_path = "/Users/seeker/Desktop/Axé Engineering/sample documents/APPLE_US_TERMS_COND-0056.pdf"
text = extract_text_from_pdf(pdf_path)

# Report the document size in NLTK word tokens and in raw characters.
print(f"tokens: {len(nltk.word_tokenize(text))}")
print(f"characters: {len(text)}")
# Preview a 200-character slice of the text (character offsets 1900 to 2099).
print(text[1900:2100])

tokens: 10738
characters: 59034
eller to 
the terms hereof and shipment of the Goods or begin ning performance of any Services by Seller shall constitute such 
assent. Apple hereby reserves the right to reschedule any delivery or ca


In [51]:
# Slice the text into windows of 10000 characters. Consecutive windows start
# (chunk_size - overlap_size) characters apart, so neighbours share 300
# characters.
chunk_size = 10000
overlap_size = 300
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size - overlap_size)
]

print(f"number of chunks: {len(chunks)}")



number of chunks: 7


In [None]:
# Embedding model id; reused later when embedding the query.
embed_model = "text-embedding-ada-002"

# Embed every chunk in a single request. `res` is read by later cells
# (embedding length, usage, index dimension, upsert payload).
res = openai.Embedding.create(
    input=chunks,
    engine=embed_model
)

# res  (uncomment to display the full response)

<OpenAIObject list at 0x2a2e7d0d0> JSON: {
  "data": [
    {
      "embedding": [
        0.00816996768116951,
        -0.027662744745612144,
        -0.020424921065568924,
        0.0026713465340435505,
        -0.020822452381253242,
        0.014407108537852764,
        -0.007230970077216625,
        -0.007820414379239082,
        0.009869759902358055,
        -0.025346092879772186,
        0.010932129807770252,
        0.012864958494901657,
        -0.008163114078342915,
        0.014078116044402122,
        -0.0037114410661160946,
        -0.005342693068087101,
        0.01230293046683073,
        -0.020315255969762802,
        -0.024674400687217712,
        -0.005586009938269854,
        -0.009958862327039242,
        0.011590113863348961,
        -0.028265895321965218,
        0.005150781013071537,
        0.010143919847905636,
        0.008108282461762428,
        0.03331043943762779,
        -0.04060309752821922,
        0.03311852738261223,
        0.012858103960752487,
      

In [68]:
# Length of the first embedding vector returned for the chunks.
len(res['data'][0]['embedding'])

1536

In [64]:
# Token usage reported on the embeddings response.
res.usage

<OpenAIObject at 0x2ac20df70> JSON: {
  "prompt_tokens": 12609,
  "total_tokens": 12609
}

In [71]:
import pinecone

# Initialize the connection to Pinecone. The API key is read from the
# "pinecone_api_key" environment variable (KeyError if it is not set);
# the environment name is fixed here.
api_key = os.environ["pinecone_api_key"]
env = "us-west1-gcp-free"

pinecone.init(api_key=api_key, environment=env)
# Displaying the whoami response shows which user/project the key maps to.
pinecone.whoami()

WhoAmIResponse(username='b42c5d9', user_label='default', projectname='4796d1f')

In [52]:
# Name of the Pinecone index that later cells create, connect to and delete.
index_name = "regqa"

In [155]:
# Create the index only if its name is not already listed.
if index_name not in pinecone.list_indexes():
    # The dimension is taken from the length of the first chunk embedding,
    # so `res` must already be populated when this branch runs.
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine' # optional
        # metadata_config={}
    )
# Connect to the index (whether it was just created or already existed).
index = pinecone.GRPCIndex(index_name)
# Display the index stats (dimension, fullness, vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 7}},
 'total_vector_count': 7}

In [120]:
# index.describe_index_stats()

In [122]:
# Build the upsert payload: one (id, embedding, metadata) tuple per embedding,
# with the matching chunk text stored under the "text" metadata key.
to_upsert = [
    (f"id{pos}", res['data'][pos]['embedding'], {"text": chunks[pos]})
    for pos in range(len(res['data']))
]

In [123]:
# Send all prepared (id, embedding, metadata) tuples to the index in one call.
index.upsert(vectors=to_upsert)

upserted_count: 7

In [124]:
# Example question, embedded with the same model that embedded the chunks.
query = "What is the definition of a 'consumer'?"
query_embedding = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# query_embedding
# Keep only the embedding vector; it is passed to the index query below.
xq = query_embedding['data'][0]['embedding']

In [142]:
# Query the index with the question embedding, asking for up to 7 matches
# and requesting both the stored vector values and the metadata.
results = index.query(xq, top_k=7, include_values=True, include_metadata=True)

In [143]:
# Display the raw query response (matches with ids and metadata).
results

{'matches': [{'id': 'id4',
              'metadata': {'text': 'a view toward securing business \n'
                                   'from Apple or influencing the terms, cond '
                                   'itions or performance of this Agreement or '
                                   'any PO.   \n'
                                   '15. TERMINATION. Apple may terminate this '
                                   'Agreement upon written notice to Seller if '
                                   'Seller fails to perform or \n'
                                   'otherwise breaches this Agreement, files a '
                                   'petition in bankruptcy, becomes insolvent, '
                                   'or di ssolves. In the event of  \n'
                                   'OL-AMR -56 v. 2. 9 \n'
                                   ' such termination, Apple shall pay Seller '
                                   'for the portion of the Services '
                   

In [149]:
# Number of matches returned by the query.
len(results['matches'])

7

In [148]:
# Chunk text stored in the metadata of the first returned match.
results['matches'][0]['metadata']['text']

In [151]:
def retrieve(query, top_k=1):
    """Build a context-augmented question-answering prompt for ``query``.

    The query is embedded with the module-level ``embed_model``, the
    module-level Pinecone ``index`` is queried with that embedding, and the
    ``text`` metadata of the returned matches is placed in the prompt.

    Args:
        query: The natural-language question.
        top_k: Number of matches to request from the index. Defaults to 1,
            which yields the same single-context prompt as before this
            parameter existed.

    Returns:
        The prompt string: an instruction header, the retrieved context(s)
        (separated by a "---" line when there are several), then the
        question and an "Answer:" cue. If the index returns no matches the
        context section is left empty instead of raising IndexError.
    """
    embedding_res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )
    xq = embedding_res['data'][0]['embedding']

    # Retrieve from Pinecone; a separate name keeps the embedding response
    # and the query response from shadowing each other.
    query_res = index.query(xq, top_k=top_k, include_metadata=True)
    contexts = [match['metadata']['text'] for match in query_res['matches']]
    context = "\n\n---\n\n".join(contexts)

    # Build the prompt around the retrieved context.
    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "context:\n"
    )
    prompt_end = f"\n\nQuestion: {query}\n\nAnswer:"
    return prompt_start + context + prompt_end

In [152]:
# Build the context-augmented prompt for the example question and display it.
query_with_context = retrieve(query)
query_with_context

'Answer the question based on the context below.\n\ncontext:\na view toward securing business \nfrom Apple or influencing the terms, cond itions or performance of this Agreement or any PO.   \n15. TERMINATION. Apple may terminate this Agreement upon written notice to Seller if Seller fails to perform or \notherwise breaches this Agreement, files a petition in bankruptcy, becomes insolvent, or di ssolves. In the event of  \nOL-AMR -56 v. 2. 9 \n such termination, Apple shall pay Seller for the portion of the Services satisfactorily performed and those conforming \nGoods delivered to Apple through the date of termination, less appropriate offsets, including any additional cos ts to \nbe incurred by Apple in completing the Services. Apple may terminate this Agreement for any other reason upon ten \n(10) days\' written notice to Seller. Seller shall cease to perform Services and/or provide Goods under this Agreement \non the date of ter mination specified in such notice. In the event of su

In [153]:
# Send the context-augmented prompt to the completion engine and show the answer.
complete(query_with_context)

"There is no definition of a 'consumer' in the context provided."

In [118]:
# Delete the Pinecone index named by `index_name`.
# NOTE(review): cells that query `index` stop working once this has run.
pinecone.delete_index(index_name)