<a href="https://colab.research.google.com/github/Pavun-KumarCH/Research-Notebooks/blob/main/RAG_VDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title requirements
%pip install --q openai pinecone datasets

In [None]:
# Load dependencies
import os
import ast
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from IPython.display import display, Markdown

import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Read the API keys from Colab's secret store when running in Colab; otherwise
# fall back to environment variables so the notebook also runs in plain
# Jupyter (the same fallback create_dlai_index_name uses).
try:
  from google.colab import userdata
  OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
  PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
except ImportError:
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
  PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [None]:
def create_dlai_index_name(index_name):
  """Build an index name by suffixing `index_name` with part of the OpenAI key.

  The key is read from Colab's `userdata` store when `google.colab` can be
  imported, otherwise from the `OPENAI_API_KEY` environment variable.

  Args:
    index_name: Prefix for the generated name.

  Returns:
    `index_name`, a hyphen, and the last 36 characters of the key lower-cased
    with underscores replaced by hyphens.

  Raises:
    ValueError: If no key is found (missing or empty).
  """
  # NOTE(review): the returned name embeds the tail of the secret key, so the
  # index name itself should be treated as sensitive.
  try:
    # Google Colab: secrets live in the userdata store.
    from google.colab import userdata
    api_key = userdata.get("OPENAI_API_KEY")
  except ImportError:
    # Jupyter or any other environment: use the process environment.
    api_key = os.getenv("OPENAI_API_KEY")

  if not api_key:
    raise ValueError("OpenAI API key is missing.")

  key_suffix = api_key[-36:].lower().replace("_", "-")
  return f'{index_name}-{key_suffix}'

In [None]:
# Set up Pinecone

pinecone = Pinecone(api_key = PINECONE_API_KEY)

INDEX_NAME = create_dlai_index_name("rag-dlai")

# Start from a clean slate: if an index with this name already exists
# (e.g. from a previous run), delete it before creating a new one.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
  pinecone.delete_index(INDEX_NAME)

# dimension = 1536 has to match the length of the vectors upserted later.
pinecone.create_index(
  name = INDEX_NAME,
  dimension = 1536,
  metric = "cosine",
  spec = ServerlessSpec(cloud = "aws", region = "us-east-1"),
)

# Create a handle to the index and display it as the cell output
index = pinecone.Index(INDEX_NAME)
index

In [None]:
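# Download the data (the commands below are disabled; uncomment them on a first run — the next cell expects wiki.csv to exist)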

#!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

#!unzip lesson2-wiki.csv.zip

In [None]:
#@title load The Dataset
# Cap on how many rows are read from the CSV.
max_articles_num = 500

# Read only the first `max_articles_num` rows of the local file.
df = pd.read_csv(
  'wiki.csv',
  nrows = max_articles_num,
)

# Preview the first rows as the cell output.
df.head()

In [None]:
#@title Prepare the Embeddings and Upsert(upload) to Pinecone
# Number of vectors sent to Pinecone per upsert call.
batch_size = 250
prepared = []

for i, row in tqdm(df.iterrows(), total = df.shape[0]):
  # The 'metadata' and 'values' columns hold Python literals serialized as
  # strings; literal_eval parses them without executing arbitrary code.
  meta = ast.literal_eval(row['metadata'])
  values = ast.literal_eval(row['values'])
  prepared.append({'id' : row['id'],
                   'values' : values,
                   'metadata': meta})
  # Upload in batches rather than one request per row.
  if len(prepared) >= batch_size:
    index.upsert(vectors = prepared)
    prepared = []

# Flush the final partial batch; without this, rows left over when the row
# count is not a multiple of batch_size would never reach the index.
if prepared:
  index.upsert(vectors = prepared)
  prepared = []

In [None]:
# Display the statistics Pinecone reports for the index (a quick way to check
# that the upserted vectors are present).
index.describe_index_stats()

In [None]:
#@title Connect to OpenAI
openai_client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(articles, model = "text-embedding-ada-002"):
  """Embed `articles` with the OpenAI embeddings endpoint.

  Args:
    articles: Input passed straight through as the request's `input`.
    model: Name of the embedding model to use.

  Returns:
    A list holding the `.embedding` of every record in the response's `data`,
    in response order.
  """
  response = openai_client.embeddings.create(input = articles, model = model)
  embeddings = []
  for record in response.data:
    embeddings.append(record.embedding)
  return embeddings

In [None]:
#@title Run Your Query
query = "What is the berlin wall ?"

# get_embedding returns a list with one embedding per input text.
embed = get_embedding([query])

# Pinecone expects a single flat vector, so pass the first (and only)
# embedding rather than the enclosing list.
res = index.query(vector = embed[0], top_k = 4, include_metadata = True)

# Print the stored text of each match, one per line.
text = [r['metadata']['text'] for r in res['matches']]
print("\n".join(text))

In [None]:
## Build the Prompt
query = "write an article titled: what is the berlin wall?"

# get_embedding returns a list with one embedding per input text.
embed = get_embedding([query])

# Pinecone expects a single flat vector, so pass the first (and only)
# embedding rather than the enclosing list.
res = index.query(vector = embed[0],
                  top_k = 3,
                  include_metadata = True)

# Text of each retrieved match; these become the context section of the prompt.
contexts = [x['metadata']['text'] for x in res['matches']]

prompt_start = ("Answer the question based on the context below.  \n\n"+
                "Context:\n")

prompt_end = (f"\n\nQuestion: {query}\nAnswer:")

# Layout: instruction header, contexts separated by '---' rules, then the question.
prompt = (prompt_start + "\n\n---\n\n".join(contexts) + "\n\n---\n\n" + prompt_end)

print(prompt)

In [None]:
# Get the Summary
# Send the assembled prompt to the completions endpoint; the sampling
# parameters are spelled out explicitly so the request is self-describing.
res = openai_client.completions.create(
    prompt = prompt,
    model = "gpt-3.5-turbo-instruct",
    max_tokens = 636,
    temperature = 0.3,
    top_p = 1,
    presence_penalty = 0,
    frequency_penalty = 0,
    stop = None,
)

# Print the first choice's text framed by 80-character rules.
print("-" * 80, res.choices[0].text, "-" * 80, sep = "\n")