In [1]:
! pip install sentence-transformers chromadb wikipedia-api

Collecting chromadb
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import wikipediaapi

In [3]:
# English-language Wikipedia client; the user agent string names this project and a contact address.
wiki = wikipediaapi.Wikipedia(
    language="en",
    user_agent="RAGUsingWikipediaAPI (usamapk7861@gmail.com)",
)

In [4]:
# Wikipedia page titles that make up the knowledge base; each one is fetched and
# split into paragraphs by the loop further down.
PAGE_TITLES = [
    "Artificial intelligence",
    "Machine learning",
    "Deep learning",
    "Transformer (machine learning model)",
    "Generative artificial intelligence"
]

In [5]:
# Parallel accumulators: paragraph text, its metadata dict, and its unique id
# share the same index across the three lists.
documents, metadatas, ids = [], [], []

# Running total of paragraphs kept.
doc_counter = 0

In [6]:
# Lines whose stripped length is at or below this threshold are dropped.
MIN_PARAGRAPH_CHARS = 50

# Reset the accumulators in this cell so that re-running it does not append
# duplicate paragraphs / ids on top of the results of a previous run.
documents, metadatas, ids = [], [], []

for title in PAGE_TITLES:
    print(f"Fetching {title}...")
    page = wiki.page(title)

    # Skip titles that do not resolve to a page instead of failing the whole run.
    if not page.exists():
        print(f"Page {title} does not exist")
        continue

    # One candidate paragraph per line of the page text. The id uses the line's
    # position within the page, so it is unique as long as titles are unique.
    for i, raw_line in enumerate(page.text.split("\n")):
        para_text = raw_line.strip()
        if len(para_text) > MIN_PARAGRAPH_CHARS:
            documents.append(para_text)
            metadatas.append({"source": title})
            ids.append(f"{title}_{i}")

# Derive the total from the data itself so it can never drift from the lists.
doc_counter = len(documents)
assert len(set(ids)) == len(ids), "paragraph ids must be unique (duplicate title in PAGE_TITLES?)"

print(f"  Successfully extracted {doc_counter} paragraphs.")

Fetching Artificial intelligence...
Fetching Machine learning...
Fetching Deep learning...
Fetching Transformer (machine learning model)...
Fetching Generative artificial intelligence...
  Successfully extracted 821 paragraphs.


In [7]:
from sentence_transformers import SentenceTransformer

In [8]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model; the recorded output of this
# cell shows the model files being downloaded from the Hugging Face Hub.
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Encode every extracted paragraph in one call. The printed shape is
# (number of paragraphs, embedding dimension) - (821, 384) in the recorded run.
embeddings = model.encode(documents)
print(embeddings.shape)

(821, 384)


In [10]:
import chromadb
# Default Chroma client with no path or settings supplied.
# NOTE(review): per the Chroma docs this is an in-memory (non-persistent) client - confirm
# that losing the collection on kernel restart is intended.
chromadb_client = chromadb.Client()

In [12]:
collection_name = "wikipedia_collection"

# Start from an empty collection on every run. Check *all* existing collections,
# not just the first one, otherwise create_collection fails when the target
# exists but is not first in the list. Depending on the Chroma release,
# list_collections() yields Collection objects or plain name strings, so accept both.
existing_names = {
    getattr(existing, "name", existing)
    for existing in chromadb_client.list_collections()
}
if collection_name in existing_names:
    chromadb_client.delete_collection(name=collection_name)

collection = chromadb_client.create_collection(name=collection_name)

In [13]:
# Index everything in a single call: ids, raw text, provenance metadata and
# the precomputed vectors are parallel sequences of equal length.
collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas,
    embeddings=embeddings,
)

In [14]:
# The question to answer; it is embedded for retrieval and also inserted into the LLM prompt.
USER_QUERY = "What is Artificial Intelligence?"

In [15]:
# Embed the question with the same model used for the documents. It is passed as a
# one-element list, i.e. a batch containing a single query.
query_embedding = model.encode([USER_QUERY])

# Ask Chroma for the single closest paragraph (n_results=1) to the query embedding.
results = collection.query(
    query_embeddings=query_embedding,
    n_results=1
)

In [16]:
# Results are nested per query, then per hit: take the hits for the first
# (only) query, then the first (only) hit.
first_query_documents = results["documents"][0]
first_query_metadatas = results["metadatas"][0]

retrieved_context = first_query_documents[0]
source_page = first_query_metadatas[0]["source"]

In [17]:
# Final prompt sent to the LLM: grounding instructions, the retrieved paragraph,
# then the user's question.
# This is an f-string, so it is rendered immediately with the current values of
# retrieved_context and USER_QUERY; re-run this cell after changing either of them.
PROMPT_TEMPLATE = f"""
Based *only* on the following context, please answer the question.
If the context does not contain the answer, say "The context does not provide this information."

Context:
{retrieved_context}

Question:
{USER_QUERY}

Answer:
"""

In [18]:
import openai

In [19]:
import os
from getpass import getpass

# Never keep the API key in the notebook source: read it from the GROQ_API_KEY
# environment variable, and fall back to an interactive hidden prompt when it is unset.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

# The base_url points the OpenAI client at Groq's endpoint.
client = openai.Client(base_url="https://api.groq.com/openai/v1", api_key=groq_api_key)

In [20]:
# Single-turn conversation: the fully rendered prompt is the only user message.
chat_messages = [{"role": "user", "content": PROMPT_TEMPLATE}]

completion = client.chat.completions.create(
    messages=chat_messages,
    model="llama-3.3-70b-versatile",
    temperature=0.2,
)

In [21]:
# Text of the first choice returned by the model.
final_answer = completion.choices[0].message.content

# Summarise the whole round trip: question, retrieved evidence, its source, and the answer.
report_lines = [
    f"User Query: {USER_QUERY}",
    f"Retrieved Context {retrieved_context}",
    f"Source Page {source_page}",
    f"LLM Response {final_answer}",
]
print("\n".join(report_lines))

User Query: What is Artificial Intelligence?
Retrieved Context Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.
Source Page Artificial intelligence
LLM Response Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making.
