# Open AI Embeddings

In [10]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()  # take environment variables from .env.

OPENKEY_API = os.getenv("OPENAI_KEY")
ORGANIZATION_ID = os.getenv("ORGANIZATION_ID")
openai.organization = ORGANIZATION_ID
# get this from top-right dropdown on OpenAI under organization > settings
openai.api_key = OPENKEY_API
# get API key from top-right dropdown on OpenAI website

openai.Engine.list()  # check we have authenticated

<OpenAIObject list at 0x7f34798b6410> JSON: {
  "data": [
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "ada",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-embedding-ada-002",
      "object": "engine",
      "owner": "openai-internal",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-similarity-babbage-001",
      "object": "engine",
      "owner": "op

In [11]:
MODEL = "text-similarity-babbage-001"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=MODEL
)

In [12]:
# extract embeddings to a list
embeds = [record['embedding'] for record in res['data']]

In [19]:
print( len(embeds[0]    ) )

2048


# Populate the Index

## Load the Dataset

In [41]:
from datasets import load_dataset
from rich import print

faq_dataset = load_dataset("csv" , data_files="/home/null/code/utd_chatbot/FAQ on UTD Student Accesebility - Sheet1.csv")
print(faq_dataset)
## remove all the None values
faq_dataset = faq_dataset.filter(lambda x: x['Question'] is not None and x['Answering'] is not None)

Using custom data configuration default-b5279144beba40c5
Found cached dataset csv (/home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 371.84it/s]


Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d8b863d6763fbde1.arrow


In [42]:
print(len(faq_dataset['train']))

## Init the pinecon client

In [1]:
import pinecone
PINECONE_APIKEY = os.getenv("PINECONE_APIKEY")
# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key = "3bfba84d-b7a4-43f2-8931-c61ddddd69b1",
    environment="us-east1-gcp"
)

2141478c-ff8b-4881-b0b9-b78aa5bed0b9


In [44]:
index = pinecone.Index('elements')

# Populating the Index

In [45]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 12  # process everything in batches of 32
for i in tqdm(range(0, len(faq_dataset['train']), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(faq_dataset['train']))
    # get batch of lines and IDs
    lines_batch = faq_dataset["train"]['Question'][i: i+batch_size]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    print(lines_batch)
    res = openai.Embedding.create(input=lines_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))


  0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 1/2 [00:00<00:00,  1.28it/s]

100%|██████████| 2/2 [00:01<00:00,  1.75it/s]


In [47]:
query = "I have ADHD. What do I do?"

xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']
res = index.query([xq], top_k=5, include_metadata=True)
print(res)

In [63]:
from rich import print
print( res["matches"][0]["metadata"]["text"] )
template = """
Please act as a University of Texas Counselor. I will provide you with an individual 
looking for guidance at the University of Texas at Dallas, and your task is to help them 
solve their problem\n
"""
for query_response in  res["matches"]:
    answer = faq_dataset['train'].filter(lambda x: x['Question'] == query_response['metadata']['text'])['Answering'][0]
    template += f"Q: {query_response['metadata']['text']}\nA: {answer}\n"
prompt = template + f"Q: {query}\nA: "

100%|██████████| 1/1 [00:00<00:00, 355.45ba/s]
100%|██████████| 1/1 [00:00<00:00, 556.42ba/s]
100%|██████████| 1/1 [00:00<00:00, 442.02ba/s]
100%|██████████| 1/1 [00:00<00:00, 255.58ba/s]
100%|██████████| 1/1 [00:00<00:00, 192.80ba/s]


In [66]:
print(prompt)

In [64]:
response = openai.Completion.create(
  model="text-davinci-002",
  prompt =  prompt,
  temperature=0.3,
  max_tokens=150,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)

In [65]:
print(response["choices"][0]["text"])