In [1]:
!pip install transformers datasets faiss-cpu sentence-transformers



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
import requests
import PyPDF2

In [3]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load a smaller GPT-Neo model
model_name = "EleutherAI/gpt-neo-125M"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the device to CPU or GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [4]:
# Load a Sentence Transformer model to convert sentences to embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Dummy corpus of documents for retrieval
documents = [
    "The Eiffel Tower is located in Paris.",
    "The Great Wall of China is visible from space."
    # "Artificial Intelligence is transforming the world.",
    # "The Grand Canyon is one of the seven wonders of the world."
]

# Convert the documents into embeddings
document_embeddings = embedder.encode(documents)

# Initialize FAISS index
dimension = document_embeddings.shape[1]  # Dimension of the embeddings
faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance index
faiss_index.add(np.array(document_embeddings))  # Add document embeddings to index


In [5]:
def generate_without_rag(query):
    """Generate a response from the LLM without any external document retrieval."""
    # Tokenize the query and generate response
    inputs = tokenizer(query, return_tensors="pt").to(device)

    # Generate output without any retrieved document
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and return the generated response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


def generate_with_rag(query):
    """Generate a response from the LLM with document retrieval using FAISS."""
    # Retrieve the relevant document using FAISS
    query_embedding = embedder.encode([query])
    _, indices = faiss_index.search(np.array(query_embedding), k=1)
    relevant_document = documents[indices[0][0]]
    
    # Augment the query with the retrieved document
    augmented_prompt = f"Using the information from the document, answer the question.\n\nDocument: {relevant_document}\n\nQuestion: {query}\n\nAnswer:"
    
    # Tokenize the augmented prompt and generate response
    inputs = tokenizer(augmented_prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3  # This prevents repeating the same n-grams
    )

    # Decode and return the generated response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
query = "Where is the Eiffel Tower located?"

# Generate response without RAG
response_without_rag = generate_without_rag(query)
print("Response WITHOUT RAG:")
print(response_without_rag)

# Generate response with RAG
response_with_rag = generate_with_rag(query)
print("\nResponse WITH RAG:")
print(response_with_rag)


Response WITHOUT RAG:
Where is the Eiffel Tower located?

The Eiffel Tower is located in the city of Berlin, Germany. The tower is located in the city of Berlin, Germany. The tower is a part of the city of Berlin, Germany

Response WITH RAG:
Using the information from the document, answer the question.

Document: The Eiffel Tower is located in Paris.

Question: Where is the Eiffel Tower located?

Answer: The Tower is in Paris, France.


In [6]:
def download_pdf(url, output_path='manual.pdf'):
    response = requests.get(url)
    with open(output_path, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded and saved as {output_path}")

def extract_text_from_pdf(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    text = ""
    for page in range(len(reader.pages)):
        text += reader.pages[page].extract_text()
    return text

# Download the PDF and extract text
url = "https://techinfo.honda.com/rjanisis/pubs/OM/AH/BTYA2222OM/enu/BTYA2222OM.pdf"
download_pdf(url, 'acura_mdx_manual.pdf')
manual_text = extract_text_from_pdf('acura_mdx_manual.pdf')
print("PDF text extracted.")


PDF downloaded and saved as acura_mdx_manual.pdf
PDF text extracted.


In [7]:
def chunk_text(text, chunk_size=500):
    """Chunk the text into smaller pieces for embedding."""
    text_chunks = []
    words = text.split()
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        text_chunks.append(chunk)
    return text_chunks

# Chunk the extracted text
text_chunks = chunk_text(manual_text)
print(f"Manual text chunked into {len(text_chunks)} chunks.")

Manual text chunked into 273 chunks.


In [8]:
# Load a pre-trained sentence transformer model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each text chunk
chunk_embeddings = embedder.encode(text_chunks)

# Store embeddings using FAISS for efficient similarity search
dimension = chunk_embeddings.shape[1]  # Dimension of the embeddings
faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance index
faiss_index.add(np.array(chunk_embeddings))

print("Embeddings created and indexed.")

Embeddings created and indexed.


In [31]:
def generate_response_with_llm(query, context):
    """Generate a response based on the relevant manual chunk using the LLM."""
    prompt = f"Using the information from the owner's manual section below, answer the question concisely:\n\nManual Section: {context}\n\nQuestion: {query}\n\nAnswer:"
    
    # Set pad_token to eos_token
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the input and set attention_mask
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    attention_mask = inputs['attention_mask']
    
    # Set the `pad_token_id` to `eos_token_id`
    pad_token_id = tokenizer.eos_token_id
    
    # Generate the response with limited new tokens, attention mask, and no_repeat_ngram_size to prevent repetitions
    outputs = model.generate(
        inputs['input_ids'], 
        attention_mask=attention_mask, 
        max_new_tokens=100,  # Control the number of new tokens generated
        pad_token_id=pad_token_id,
        no_repeat_ngram_size=3  # Prevent repetition of 3-grams
    )
    
    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [37]:
def search_owners_manual(query, top_k=1):
    """Search for the most relevant chunk based on the query in the owner's manual."""
    query_embedding = embedder.encode([query])
    distances, indices = faiss_index.search(np.array(query_embedding), top_k)
    results = [text_chunks[idx] for idx in indices[0]]
    return results

# Use the existing function `generate_response_with_llm` to generate responses

def query_acura_manual(query):
    """Query the Acura MDX owner's manual using embeddings and generate a response with the LLM."""
    # Search the owner's manual for relevant chunks
    result_chunks = search_owners_manual(query)
    context = result_chunks[0] if result_chunks else "No relevant information found."
    
    # Generate a response using the existing LLM function
    response = generate_response_with_llm(query, context)
    return response

# Example query
user_query = "When does the Auto high beam mode turn ON?"
response = query_acura_manual(user_query)
print(f"Final Response: {response}")

Final Response: Using the information from the owner's manual section below, answer the question concisely:

Manual Section: beam: All of the following conditions must be met before the high beams turn on. ●Your vehicle speed is 25mph (40 km/h) or more. ●There are no preceding or oncoming vehicle with headlights or taillights turned on. ●There are few street lights on the road ahead.One of the following conditions must be met before the low beams turn on. ●Your vehicle speed is 15 mph (24 km/h) or less. ●There is a preceding or oncoming vehicle with headlights or taillights turned on. ●There are many street lights on the road ahead.1How to Use the Auto High-Beam In the following cases, th e auto high-beam system may not switch the head lights properly or the switching timing may be ch anged. In case of the automatic switching operati on does not fit for your driving habits, please swit ch the headlights manually. •The brightness of the lights from the preceding or oncoming vehicle is i