# Import the required packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import ast
import os
import pandas as pd

# Get API key
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

# Setup pinecone
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Drop any index left over from a previous run so this cell can be re-run
# and always starts from an empty index.
if INDEX_NAME in [existing.name for existing in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)

# create_index takes `dimension` (singular) and `spec`; 1536 is the vector
# size of text-embedding-ada-002, the model used for queries in this notebook.
pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
                      spec=ServerlessSpec(cloud='aws', region='us-west-2'))
index = pinecone.Index(INDEX_NAME)

ModuleNotFoundError: No module named 'openai'

In [None]:
# Load the Dataset
# Only the first `max_articles_num` rows are read to keep the demo fast.
max_articles_num = 500
# Path is relative to the notebook's working directory ('./data/'), not the
# filesystem root.
df = pd.read_csv('./data/wiki.csv', nrows=max_articles_num)
df.head()

In [None]:
# The 'metadata' and 'values' columns are stored as Python-literal strings,
# so both are parsed with ast.literal_eval before being sent to the index.
batch_size = 250  # number of records sent per upsert call
prepped = []
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    meta = ast.literal_eval(row['metadata'])
    prepped.append({'id':row['id'],
                    'values':ast.literal_eval(row['values']),
                    'metadata':meta})
    if len(prepped) >= batch_size:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch; without this, rows beyond the last full
# multiple of batch_size would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []

index.describe_index_stats()

In [None]:
# Connect to OpenAI
OPEN_AI_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPEN_AI_KEY)

def get_embeddings(articles, model='text-embedding-ada-002'):
    """Request embeddings for a batch of texts from the OpenAI API.

    Args:
        articles: The text(s) to embed, passed through as the `input`
            argument of the embeddings endpoint (callers here pass a list
            of strings).
        model: Name of the embedding model to use.

    Returns:
        The raw response of `openai_client.embeddings.create`; the vector
        for the i-th input is available as `response.data[i].embedding`.
    """
    return openai_client.embeddings.create(input=articles, model=model)

query = "What is the Berlin wall?"
embed = get_embeddings([query])
# Each item in `embed.data` is an embedding object; the list of floats the
# index needs is on its `.embedding` attribute.
res = index.query(vector=embed.data[0].embedding, top_k=3, include_metadata=True)
# Show the stored text of the top matches, one per line.
text = [r['metadata']['text'] for r in res['matches']]
print('\n'.join(text))

In [None]:
# Build the prompt
query = "Write an article titled: What is the Berlin Wall?"
embed = get_embeddings([query])
# Pass the float vector (`.embedding`), not the embedding object itself.
res = index.query(vector=embed.data[0].embedding, top_k=3, include_metadata=True)
# Text of the retrieved matches becomes the context section of the prompt.
contexts = [
    x['metadata']['text'] for x in res['matches']
]
prompt_start = (
    "Answer the question based on the context below.\n\n"+
    "Context:\n"
)
prompt_end = f"\n\nQuestion: {query}\nAnswer:"
# Contexts are separated by a '---' rule so the passages stay distinguishable.
prompt = prompt_start + "\n\n---\n\n".join(contexts) + prompt_end
print(prompt)

In [None]:
# Get the Summary
# Gather the completion settings in one place, then send the request.
completion_args = dict(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    temperature=0,
    max_tokens=636,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)
res = openai_client.completions.create(**completion_args)

# Print a separator line followed by the text of the first returned choice.
separator = '-' * 80
print(separator)
print(res.choices[0].text)