In [1]:
!pip install transformers faiss-gpu torch accelerate sentencepiece nltk pypdf2 pymupdf
from IPython.display import clear_output
clear_output()

In [2]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModel
import requests
from bs4 import BeautifulSoup
import faiss
import nltk
import numpy as np

In [3]:
# Fetch the Punkt sentence-tokenizer data that `sent_tokenize` relies on.
# Older NLTK releases load the `punkt` resource, while newer releases (3.9+) look up
# `punkt_tab` instead. nltk is installed without a version pin, so request both to keep
# the notebook working on a fresh kernel whichever version is present.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Build the generation pipeline for the dolly-v2-3b checkpoint; it is stored in the
# global `llm`, which `generate_rag_response` calls to produce the final answer.
# NOTE(security): `trust_remote_code=True` downloads and executes Python code shipped
# with the model repository (the download log shows `instruct_pipeline.py`). The
# library's own warning recommends pinning a revision so that a changed code file is
# not picked up silently — pass `revision=<reviewed commit>` once one has been vetted.
# NOTE(review): `device_map="auto"` presumably delegates device placement to
# accelerate — confirm this matches the intended runtime (GPU vs. CPU).
llm = pipeline(model="databricks/dolly-v2-3b",
               torch_dtype=torch.bfloat16,
               trust_remote_code=True,
               device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

instruct_pipeline.py:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/databricks/dolly-v2-3b:
- instruct_pipeline.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



In [5]:
from PyPDF2 import PdfReader

# Read the source document and flatten it into a single string for sentence chunking.
PDF_PATH = "/content/sample-story.pdf"  # path used in this session; edit to load another file

reader = PdfReader(PDF_PATH)
# A page without a text layer can yield no text, so fall back to "" for such pages.
# Pages are joined with a newline: joining with "" would glue the last word of one
# page to the first word of the next and blur sentence boundaries for the tokenizer.
content = "\n".join((page.extract_text() or "") for page in reader.pages)
# Fail early with a readable message instead of indexing an empty document.
assert content.strip(), f"No extractable text found in {PDF_PATH}"
print(content)

The Secret of Elysium Woods
 
It all started with a missing dog. Benny, a sprightly golden retriever with a tendency to wander off, had disappeared from the yard again. For 
twelve-year-old Sarah, this was becoming a frustrating routine. Her parents had warned her countless times to keep the gate locked, but 
somehow Benny always found a way to slip away. This time, however, Benny hadn’t returned by evening. The sun had long since set, casting a 
deep blue shadow over the small town of Elysium, and Sarah felt a growing knot of worry in her stomach. 
 
"He's probably just chasing squirrels in the woods," her older brother Jack had said, though he didn't seem too concerned. Jack, at sixteen, had a 
way of brushing off everything like it didn’t matter.
 
But for Sarah, it did matter. Benny wasn’t just a dog—he was her best friend, her only companion during the long, lazy summer days when 
school was out. So, despite her parents' protests, Sarah grabbed a flashlight, pulled on her hoodie, 

In [6]:
# Load the sentence encoder and its matching tokenizer from the same checkpoint.
EMBEDDING_CHECKPOINT = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [7]:
def chunk_content_by_sentence(content):
    """Split ``content`` into sentence-level chunks.

    Args:
        content: The text to split.

    Returns:
        The result of ``sent_tokenize(content)`` — one entry per detected sentence.
    """
    sentences = sent_tokenize(content)
    return sentences

# Split the extracted document text into one chunk per sentence.
content_chunks = chunk_content_by_sentence(content)
# Fail fast with a clear message: an empty chunk list would otherwise only surface
# later as an obscure error when the per-chunk embeddings are stacked.
assert content_chunks, "No sentences found - did the PDF text extraction return anything?"
print(f"Number of content chunks (sentences): {len(content_chunks)}")

Number of content chunks (sentences): 81


In [8]:
# Embed every chunk on its own: tokenize (truncating at 512 tokens), run the encoder,
# and average the last hidden state over dim 1 to get one vector per chunk.
chunk_embeddings = []
with torch.no_grad():  # inference only, no gradients needed
    for chunk in content_chunks:
        encoded = tokenizer(chunk, return_tensors='pt', max_length=512, truncation=True)
        hidden_states = model(**encoded).last_hidden_state
        chunk_embeddings.append(hidden_states.mean(dim=1).numpy())

# Stack the per-chunk arrays row-wise into a single matrix for indexing.
embeddings_np = np.vstack(chunk_embeddings)
print(f"Shape of embeddings: {embeddings_np.shape}")

Shape of embeddings: (81, 768)


In [9]:
# Build a flat L2-distance FAISS index sized to the embedding width and load all
# chunk vectors into it; row i of the index corresponds to content_chunks[i].
dimension = embeddings_np.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)
print(f"Number of vectors in the FAISS index: {index.ntotal}")

Number of vectors in the FAISS index: 81


In [25]:
def generate_rag_response(query, model, tokenizer, index, content_chunks, k=2):
    """Answer ``query`` with the LLM, grounded in the most similar content chunks.

    The query is embedded the same way as the indexed chunks (mean of the encoder's
    last hidden state), the ``k`` nearest chunks are looked up in ``index`` and
    inserted into a prompt, and that prompt is sent to the module-level ``llm``
    pipeline.

    Args:
        query: The user's question.
        model: Encoder called as ``model(**inputs)``; its ``last_hidden_state`` is
            mean-pooled into the query vector.
        tokenizer: Tokenizer matching ``model``.
        index: Search index exposing ``search(vectors, k)`` (e.g. a FAISS index).
        content_chunks: Text chunks, in the same order they were added to ``index``.
        k: Number of chunks to retrieve as context. Defaults to 2.

    Returns:
        The ``generated_text`` of the first LLM result, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Truncate exactly like the chunk-embedding step so an over-long query cannot
    # exceed the encoder's maximum sequence length.
    query_inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        query_embedding = model(**query_inputs).last_hidden_state.mean(dim=1).numpy()

    _, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when fewer than k neighbours exist. Indexing a
    # list with -1 would silently pick the *last* chunk, so drop those placeholders.
    relevant_contexts = [content_chunks[int(i)] for i in indices[0] if i >= 0]
    combined_context = " ".join(relevant_contexts)

    input_text = (
        f"### Context Overview:\n"
        f"This is the {combined_context}\n"
        f"### Instructions:\n"
        f"Using the provided context, please answer the following question. Your response should:\n"
        f"- Be as clear and concise as possible.\n"
        f"- If the answer is not present in the context, respond with 'I don't know' instead of providing an incorrect answer.\n"
        f"- Make a proper, meaningful sentence for answering.\n\n"
        f"### Question:\n"
        f"{query}\n\n"
        f"### Your Answer:"
    )

    # `llm` is the text-generation pipeline created earlier in the notebook.
    response = llm(input_text, temperature=0.3)
    return response[0]['generated_text'].strip()

In [27]:
# Ask a question about the indexed story and print the model's reply.
query = "What was the missing dog's name?"
response = generate_rag_response(
    query=query,
    model=model,
    tokenizer=tokenizer,
    index=index,
    content_chunks=content_chunks,
)
print("Generated Response:", response)

Generated Response: The missing dog's name was Benny.
