# Lesson 2 - Retrieval Augmented Generation (RAG)

### Import the Needed Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from datasets import load_dataset

from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

import ast
import os
import pandas as pd

import uuid

from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Read both service credentials from the process environment.
# A variable that is not set yields None rather than raising.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

### Setup Pinecone

In [13]:
# Client for the Pinecone service, authenticated with the key from the environment.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# A random UUID suffix gives each run of this cell its own index name.
INDEX_NAME = f'vedclove-{uuid.uuid4()}'

# If an index with this name already exists, delete it before re-creating it.
existing_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_names:
    pinecone.delete_index(INDEX_NAME)

# The index dimension must equal the length of the vectors upserted and queried below.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=serverless_spec,
)

# Handle used by the rest of the notebook for upserts and queries.
index = pinecone.Index(INDEX_NAME)

### Load the Dataset

**Note:** To access the dataset outside of this course, copy the following two lines of code and run them (remember to uncomment them first):

#!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

#!unzip lesson2-wiki.csv.zip

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>max_articles_num = 500</code>):</b> To achieve a more comprehensive context for the Large Language Model (LLM), a larger number of articles is generally more beneficial. In this lab, we've initially set <code>max_articles_num</code> to 500 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 750 or 1,000. You'll likely notice that the context provided to the LLM becomes richer and better. You can experiment by gradually raising this variable for different queries to observe the improvements in the LLM's contextual understanding.</p>

In [62]:
# Upper bound on how many CSV rows (pre-embedded article chunks) are loaded.
max_articles_num = 500

# Only the first `max_articles_num` rows are read from the file.
df = pd.read_csv(
    './data/wiki.csv',
    nrows=max_articles_num,
)
df.head()


Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


### Prepare the Embeddings and Upsert to Pinecone

In [69]:
# Number of vectors sent to Pinecone per upsert request.
BATCH_SIZE = 250

prepped = []

for _idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # The 'metadata' and 'values' columns hold Python literals serialized as
    # strings (a dict and a list of floats); parse them back into objects.
    meta = ast.literal_eval(row['metadata'])
    prepped.append({
        'id': row['id'],
        'values': ast.literal_eval(row['values']),
        'metadata': meta,
    })
    # Send a full batch and start collecting the next one.
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Without this, the trailing rows are silently
# dropped whenever the row count is not an exact multiple of BATCH_SIZE.
if prepped:
    index.upsert(prepped)
    prepped = []


100%|██████████| 500/500 [00:16<00:00, 30.17it/s] 


In [70]:
# Sanity check: the reported total_vector_count should equal the number of rows upserted above.
index.describe_index_stats()

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '186',
                                    'content-type': 'application/json',
                                    'date': 'Mon, 29 Dec 2025 03:21:18 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '44',
                                    'x-pinecone-request-id': '1192017300727192398',
                                    'x-pinecone-request-latency-ms': '48',
                                    'x-pinecone-response-duration-ms': '49'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 500}},
 'storageFullness': 0.0,
 'total_vector_count': 500,
 'vector_type': 'dense'}

### Connect to OpenAI

In [50]:
# Shared OpenAI client, authenticated with the key read from the environment.
openai_client = OpenAI(api_key=OPENAI_API_KEY)


def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for `articles` from the OpenAI embeddings endpoint.

    Args:
        articles: Value forwarded unchanged as the `input` argument of the
            request (this notebook passes a list of strings).
        model: Name of the embedding model to use.

    Returns:
        The response object returned by `openai_client.embeddings.create`;
        callers in this notebook read vectors via `.data[i].embedding`.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

### Run Your Query

In [55]:
query = "Tell me about the game Cricket?"

# Embed the question, then fetch the three closest vectors with their metadata.
embed = get_embeddings([query])
query_vector = embed.data[0].embedding
res = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Collect the stored text of every match and print them one per line.
text = []
for match in res['matches']:
    text.append(match['metadata']['text'])
print('\n'.join(text))


Sport 

Sport is an important part of Australian culture because the climate is good for outdoor activities. 23.5% Australians over the age of 15 regularly take part in organised sporting activities. In international sports, Australia has very strong teams in cricket, hockey, netball, rugby league and rugby union, and performs well in cycling, rowing and swimming. Local popular sports include Australian Rules Football, horse racing, soccer and motor racing. Australia has participated in every summer Olympic Games since 1896, and every Commonwealth Games. Australia has hosted the 1956 and 2000 Summer Olympics, and has ranked in the top five medal-winners since 2000. Australia has also hosted the 1938, 1962, 1982 and 2006 Commonwealth Games and are to host the 2018 Commonwealth Games. Other major international events held regularly in Australia include the Australian Open, one of the four Grand Slam tennis tournaments, annual international cricket matches and the Formula One Australian G

### Build the Prompt

In [57]:
query = "write an article titled: what is the game cricket?"

# Retrieve the three stored chunks closest to the embedded query.
embed = get_embeddings([query])
res = index.query(vector=embed.data[0].embedding, top_k=3, include_metadata=True)

# Text of each retrieved chunk, in the order the matches were returned.
contexts = []
for match in res['matches']:
    contexts.append(match['metadata']['text'])

# Fixed instruction header placed before the retrieved context.
prompt_start = (
    "Answer the question based on the context below.\n\n"
    "Context:\n"
)

# Trailer that restates the question and cues the model to answer.
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

# Header, then the chunks joined by a horizontal-rule separator, then the trailer.
context_separator = "\n\n---\n\n"
prompt = "".join([prompt_start, context_separator.join(contexts), prompt_end])

print(prompt)

Answer the question based on the context below.

Context:
Sport 

Sport is an important part of Australian culture because the climate is good for outdoor activities. 23.5% Australians over the age of 15 regularly take part in organised sporting activities. In international sports, Australia has very strong teams in cricket, hockey, netball, rugby league and rugby union, and performs well in cycling, rowing and swimming. Local popular sports include Australian Rules Football, horse racing, soccer and motor racing. Australia has participated in every summer Olympic Games since 1896, and every Commonwealth Games. Australia has hosted the 1956 and 2000 Summer Olympics, and has ranked in the top five medal-winners since 2000. Australia has also hosted the 1938, 1962, 1982 and 2006 Commonwealth Games and are to host the 2018 Commonwealth Games. Other major international events held regularly in Australia include the Australian Open, one of the four Grand Slam tennis tournaments, annual inte

### Get the Summary 

In [58]:
# Request parameters for the completion call, gathered in one place.
completion_params = {
    'model': "gpt-3.5-turbo-instruct",
    'prompt': prompt,
    'temperature': 0,
    'max_tokens': 636,
    'top_p': 1,
    'frequency_penalty': 0,
    'presence_penalty': 0,
    'stop': None,
}
res = openai_client.completions.create(**completion_params)

# Print a separator rule, then the text of the first returned choice.
print('-' * 80)
print(res.choices[0].text)

--------------------------------------------------------------------------------

Cricket is a popular sport that originated in England and has since spread to many countries around the world, including Australia. It is a bat-and-ball game played between two teams of eleven players each on a large oval-shaped field. The objective of the game is for one team to score more runs than the other team while the other team tries to get all of the opposing players out.

The game is played with a hard leather ball and a wooden bat. The batting team takes turns sending two players, known as batsmen, onto the field to face the bowling of the opposing team. The bowler, who is a member of the fielding team, throws or bowls the ball towards the batsman, who attempts to hit the ball with their bat and score runs by running between two sets of three wooden stumps, called wickets, at either end of the field.

The fielding team tries to get the batsmen out by hitting the wickets with the ball, catching 