In [3]:
import wikipedia
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [2]:
pip install "urllib3<2.0"

Defaulting to user installation because normal site-packages is not writeable
Collecting urllib3<2.0
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Downloading urllib3-1.26.20-py2.py3-none-any.whl (144 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.3.0
    Uninstalling urllib3-2.3.0:
      Successfully uninstalled urllib3-2.3.0
Successfully installed urllib3-1.26.20
Note: you may need to restart the kernel to use updated packages.


__Step 1: Retrieving Knowledge__

In [4]:
def get_wikipedia_content(topic):
    """Return the text content of the Wikipedia page for ``topic``.

    Args:
        topic: Title of the page to look up.

    Returns:
        The page's ``content`` attribute, or ``None`` when ``topic`` is empty
        or whitespace-only, when no page matches, or when the title is
        ambiguous (the candidate options are printed in that case).
    """
    # A blank title cannot name a page, so skip the lookup entirely instead
    # of handing it to the library.
    if not topic or not str(topic).strip():
        return None
    try:
        page = wikipedia.page(topic)
        return page.content
    except wikipedia.exceptions.PageError:
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        # handle cases where the topic is ambiguous
        print(f"Ambiguous topic. Please be more specific. Options: {e.options}")
        return None

# user input
topic = input("Enter a topic to learn about: ")
document = get_wikipedia_content(topic)

if not document:
    # Raise SystemExit directly rather than calling exit(): exit() is an
    # interactive-shell helper and, inside a notebook, asks the kernel to shut
    # down. Raising stops this cell (and a "Run All") while the kernel stays up.
    raise SystemExit("Could not retrieve information.")

__Since Wikipedia articles can be long, we split the text into smaller, overlapping chunks for better retrieval.__

In [5]:
# Tokenizer for the sentence-transformers/all-mpnet-base-v2 checkpoint;
# split_text() reads this module-level name to measure chunks in tokens.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def split_text(text, chunk_size=256, chunk_overlap=20):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is tokenized with the module-level ``tokenizer``; each window of
    tokens is converted back to a string with
    ``tokenizer.convert_tokens_to_string``.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order (empty if ``text`` yields
        no tokens).

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    # Validate first: a non-positive window or an overlap that is not smaller
    # than the window would keep the start position from ever advancing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    stride = chunk_size - chunk_overlap  # how far each new window advances

    chunks = []
    for start in range(0, total, stride):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == total:
            # The final window reached the end of the text; a further window
            # would only repeat its overlapping tail.
            break
    return chunks

# Chunk the retrieved article with split_text's default sizes and report
# how many chunks were produced.
chunks = split_text(document)
print(f"Number of chunks: {len(chunks)}")

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (17553 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 75


__Step 2: Storing and Retrieving Knowledge__

In [6]:
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# FAISS expects a C-contiguous float32 matrix. Convert once here so the dtype
# is guaranteed and the same array is handed to the index without a second copy.
embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)

# One row per chunk; the column count is the embedding dimension.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

__Step 3: Querying the RAG Pipeline__

In [23]:
query = input("Ask a question about the topic: ")
query_embedding = embedding_model.encode([query])

k = 3
# FAISS expects float32 queries, so make the dtype explicit.
distances, indices = index.search(
    np.ascontiguousarray(query_embedding, dtype=np.float32), k
)
# When the index holds fewer than k vectors FAISS pads the result with -1.
# Left unfiltered, chunks[-1] would silently return the last chunk as a "hit".
retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
print("Retrieved chunks:")
for chunk in retrieved_chunks:
    print("- " + chunk)

Retrieved chunks:
- s leadership. on august 19, 2020, apple ' s share price briefly topped $ 467. 77, making it the first us company with a market capitalization of us $ 2 trillion. during its annual wwdc keynote speech on june 22, 2020, apple announced it would move away from intel processors, and the mac would transition to processors developed in - house. the announcement was expected by industry analysts, and it has been noted that macs featuring apple ' s processors would allow for big increases in performance over current intel - based models. on november 10, 2020, the macbook air, macbook pro, and the mac mini became the first macs powered by an apple - designed processor, the apple m1. in april 2022, it was reported that samsung electro - mechanics would be collaborating with apple on its m2 chip instead of lg innotek. developer logs showed that at least nine mac models with four different m2 chips were being tested. the wall street journal reported that apple ' s effort to dev

In [24]:
# Load the deepset/roberta-base-squad2 checkpoint (model and tokenizer) and
# wrap both in a question-answering pipeline.
qa_model_name = "deepset/roberta-base-squad2"
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_pipeline = pipeline(
    task="question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer,
)

# The reader receives every retrieved chunk joined into one passage.
context = " ".join(retrieved_chunks)
answer = qa_pipeline(context=context, question=query)
print(f"Answer: {answer['answer']}")

Device set to use mps:0


Answer: apple m1
