*Step 1 : Import Necessary Libraries*

In [1]:
!pip install transformers sentence-transformers faiss-cpu PyMuPDF



*Step 2 : Load the folder with documents/Data and chunk it*

In [2]:
import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

def chunk_text(text, chunk_size=500):
    """Split text into smaller chunks."""
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Read and chunk PDF documents
pdf_folder = "/teamspace/studios/this_studio/PDFs"  
document_chunks = []
chunk_index_mapping = {}

for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        text = extract_text_from_pdf(os.path.join(pdf_folder, pdf_file))
        chunks = chunk_text(text)
        for i, chunk in enumerate(chunks):
            document_chunks.append(chunk)
            chunk_index_mapping[len(document_chunks) - 1] = {
                "file": pdf_file,
                "section": chunk
            }


*Step 3 : Convert Chunks into Embeddings*

In [3]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Generate embeddings for document chunks
chunk_embeddings = embedding_model.encode(document_chunks, convert_to_numpy=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

*Step 4 :  Store Embeddings in a Vector Database*

In [4]:
import faiss
import numpy as np

# InitializeingFAISS index
dimension = chunk_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)

# Add embeddings to the FAISS index
faiss_index.add(chunk_embeddings)


*Step 5 :   Retrieve Relevant Chunks with similarity threshold of 0.5  Based on User Query*

In [6]:
from sklearn.metrics.pairwise import cosine_similarity


def retrieve_relevant_sections(query, top_k=3, similarity_threshold=0.5):
    """Retrieve top-k relevant sections based on the query."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, top_k)
    
    # Extract relevant sections from data with a similarity threshold filter
    relevant_sections = []
    for idx in indices[0]:
        if idx != -1:
            section = chunk_index_mapping[idx]
            # Calculate similarity between query and the retrieved section text
            section_embedding = embedding_model.encode([section['section']], convert_to_numpy=True)
            similarity_score = cosine_similarity([query_embedding[0]], [section_embedding[0]])[0][0]
            if similarity_score > similarity_threshold:
                relevant_sections.append(section)
    return relevant_sections


*Step 6 :   Selection of Model, Uses a Hugging Face LLM to generate a human-like response.*

In [7]:
 from transformers import AutoModelForCausalLM, AutoTokenizer


# Load Hugging Face model
model_name = "EleutherAI/gpt-neo-1.3B"  
response_model = AutoModelForCausalLM.from_pretrained(model_name)
response_tokenizer = AutoTokenizer.from_pretrained(model_name)


if response_tokenizer.pad_token is None:
    response_tokenizer.pad_token = response_tokenizer.eos_token  




    
   


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [8]:
def evaluate_similarity(query, response):
    """Calculate similarity score between the query and the response."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    response_embedding = embedding_model.encode([response], convert_to_numpy=True)
    similarity_score = cosine_similarity(query_embedding, response_embedding)[0][0]
    return similarity_score


*Step 7 :   Generate answer/response function to query*

In [10]:
def generate_answer(query, relevant_sections):
    """Generate a response using the query and retrieved sections."""
    # Combine the relevant sections into context
    context = "\n".join([f"From {section['file']}:\n{section['section']}" for section in relevant_sections])
    
    # Custom prompt 
    input_text = f"Based on the context below,  answer to the query in less than 500 words in length without repeating any information.\n\nContext: {context}\n\nQuery: {query}\n\nAnswer:"
    
    # Tokenize input text
    inputs = response_tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024
    )
    
    # Generate the response
    outputs = response_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        pad_token_id=response_tokenizer.pad_token_id
    )
    
    # Decode and return the generated answer
    answer = response_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


*Step 8 :  To filter the response with perplexity, similarity score and custom prompts - to set quality of response in prior to presenting to user*

In [11]:
def filter_response(response, query, max_perplexity=15, min_similarity=0.4):
    """Auto-check and filter the generated response based on perplexity and similarity."""
    # Check perplexity
    perplexity_score = evaluate_perplexity(response)
    if perplexity_score > max_perplexity:
        return "The response generated is too uncertain. Please try again or please reframe your question."

    # Check similarity with the query
    similarity_score = evaluate_similarity(query, response)
    if similarity_score < min_similarity:
        return "The response seems irrelevant to your question. Please try again or please reframe your question."

    # Return the valid response
    return response

def answer_query(query, top_k=1):
    """Retrieve relevant sections, generate a response, and filter it."""
    # Retrieve relevant sections
    relevant_sections = retrieve_relevant_sections(query, top_k=top_k)
    if not relevant_sections:
        return "Sorry, I couldn't find relevant information.Please try another query or  reframe your question"
    
    # Generate the response
    response = generate_answer(query, relevant_sections)
    
    # Auto-check and filter the response
    filtered_response = filter_response(response, query)
    
    return filtered_response


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load model and tokenizer for perplexity evaluation
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def evaluate_perplexity(response):
    inputs = perplexity_tokenizer(response, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    perplexity = torch.exp(loss).item()
    return perplexity

# Example
#response = "The best exercises for muscle gain include squats, deadlifts, and bench presses."
perplexity_score = evaluate_perplexity(response)
print(f"Perplexity Score: {perplexity_score:.2f}")


*Step 9 :  User Query Samples Section*

In [15]:
user_query = "what are the expert tips for beginners at gym?"
response = answer_query(user_query)
print("\nResponse:\n", response)


Response:
 Based on the context below,  answer to the query in less than 500 words in length without repeating any information.

Context: From fitness_guide.pdf:
Fitness Workout Guide 1.​ Fitness Guide 1.1.​ A Beginner Workout Plan Starting a fitness journey as a beginner is a transformative step towards a healthier, stronger you. Having the right mindset and strategies can significantly impact your success. Here are some valuable tips to get you started: ●​ Start Slowly. Don't try to do too much too soon. If you're new to working out, start with a few days a week and gradually increase the frequency and intensity of your workouts as you get stronger. ●​ Listen to Your Body. If you're feeling pain, stop and rest. Don't push yourself too hard, especially in the beginning. ●​ Find a Workout Buddy. Working out with a friend can help you stay motivated and accountable. ●​ Make it Fun. You're less likely to stick with your workouts if you're not enjoying them. Find activities that you enjo

*Step 10 :  Performance metrics response time , similarity score and perplexity score can be viewed for each query*

In [16]:
import time

# Measure response time
start_time = time.time()
response = answer_query(user_query)
end_time = time.time()

print("\nResponse Time: {:.2f} seconds".format(end_time - start_time))



Response Time: 39.06 seconds


In [17]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load embedding model
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def evaluate_similarity(query, response):
    query_embedding = similarity_model.encode(query, convert_to_numpy=True)
    response_embedding = similarity_model.encode(response, convert_to_numpy=True)
    similarity = cosine_similarity([query_embedding], [response_embedding])
    return similarity[0][0]

similarity_score = evaluate_similarity(user_query, response)
print(f"Similarity Score: {similarity_score:.2f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity Score: 0.59


In [18]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load model and tokenizer for perplexity evaluation
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def evaluate_perplexity(response):
    inputs = perplexity_tokenizer(response, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    perplexity = torch.exp(loss).item()
    return perplexity

# Example
#response = "The best exercises for muscle gain include squats, deadlifts, and bench presses."
perplexity_score = evaluate_perplexity(response)
print(f"Perplexity Score: {perplexity_score:.2f}")


Perplexity Score: 8.87
