In [2]:
pip install PyPDF2

Collecting PyPDF2Note: you may need to restart the kernel to use updated packages.

  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [13]:
import os
import warnings
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

In [14]:
# Notebook-wide noise reduction.
# Tell huggingface_hub not to emit its symlink-cache warning.
os.environ.update({'HF_HUB_DISABLE_SYMLINKS_WARNING': '1'})
# Blanket-ignore every Python warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the ML libraries.
warnings.simplefilter("ignore")

In [15]:
class RestaurantChatbot:
    """Retrieval-augmented question answering over the text of one PDF.

    Construction extracts the PDF text, splits it into overlapping word
    chunks, embeds the chunks into a FAISS L2 index and loads a GPT-2
    text-generation pipeline. ``query`` then retrieves the chunks closest to
    a question and asks the generator to answer from them.

    Class attributes (may be overridden per instance):
        model_max_length: Total token window assumed for the generator
            (prompt plus generated tokens).
        max_context_tokens: Upper bound on tokens spent on retrieved context.
        max_new_tokens: Number of tokens the generator may produce.
    """

    # Length management. Kept as class attributes so they are available even
    # before ``_setup_generation`` has run.
    model_max_length = 1024
    max_context_tokens = 800
    max_new_tokens = 100

    def __init__(self, pdf_path):
        """Build the full pipeline for the PDF at ``pdf_path``.

        Raises:
            ValueError: If no text could be extracted from the PDF, since an
                empty corpus cannot be embedded or indexed.
        """
        self.pdf_path = pdf_path
        self.text = self._extract_text()
        self.chunks = self._chunk_text()
        if not self.chunks:
            raise ValueError(f"No extractable text found in {pdf_path!r}")
        self._setup_retrieval()
        self._setup_generation()

    def _extract_text(self):
        """Extract text from every page of the PDF, joined by newlines."""
        reader = PdfReader(self.pdf_path)
        # Guard against pages for which the extractor yields nothing
        # (e.g. image-only pages) so the join never sees a non-string.
        return "\n".join((page.extract_text() or "") for page in reader.pages)

    def _chunk_text(self, chunk_size=300, overlap=50):
        """Split ``self.text`` into overlapping chunks of whitespace-separated words.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must
                satisfy ``0 <= overlap < chunk_size``.

        Returns:
            List of chunk strings (empty when the text has no words).

        Raises:
            ValueError: If ``chunk_size``/``overlap`` would give a
                non-positive or gapped stride.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = self.text.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a chunk reaches the final word, any further window would
            # only repeat words already contained in this chunk.
            if start + chunk_size >= len(words):
                break
        return chunks

    def _setup_retrieval(self):
        """Embed all chunks and load them into a FAISS flat L2 index."""
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(self.chunks), dtype=np.float32
        )
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    def _setup_generation(self):
        """Load the GPT-2 tokenizer/model and wrap them in a CPU pipeline."""
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a pad token; reuse EOS so padding is defined.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')

        self.generator = pipeline(
            'text-generation',
            model=self.model,
            tokenizer=self.tokenizer,
            device=-1,  # CPU
            max_new_tokens=self.max_new_tokens,
            truncation=True
        )

    def _retrieve_relevant_chunks(self, query, k=3):
        """Return up to ``k`` chunks nearest to ``query`` in embedding space.

        Fewer than ``k`` chunks are returned when the corpus is smaller than
        ``k``; an empty list is returned for ``k <= 0``.
        """
        if k <= 0 or not self.chunks:
            return []
        k = min(k, len(self.chunks))

        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype=np.float32
        )
        _, indices = self.index.search(query_embedding, k)

        # FAISS pads missing neighbours with -1; indexing with that would
        # silently return the *last* chunk, so drop out-of-range labels.
        total = len(self.chunks)
        return [self.chunks[int(i)] for i in indices[0] if 0 <= int(i) < total]

    @staticmethod
    def _build_prompt(context, question):
        """Format the generation prompt for a context/question pair."""
        return (
            f"Answer based on restaurant information:\n"
            f"Context: {context}\nQuestion: {question}\nAnswer:"
        )

    def _truncate_context(self, context, question):
        """Trim ``context`` so prompt plus generation fits the model window.

        The context budget is whatever remains of ``model_max_length`` after
        reserving ``max_new_tokens`` and the tokens of the prompt template
        with ``question`` filled in, capped at ``max_context_tokens``.

        Returns:
            ``context`` unchanged if it fits, otherwise its leading tokens
            decoded back to text (``""`` when no budget is left).
        """
        # Tokens consumed by everything in the prompt except the context.
        fixed_tokens = len(self.tokenizer.encode(self._build_prompt("", question)))
        budget = min(
            self.max_context_tokens,
            self.model_max_length - self.max_new_tokens - fixed_tokens,
        )
        if budget <= 0:
            return ""

        # NOTE(review): encoding an over-long context may log a tokenizer
        # length warning; it is harmless here because we slice right after.
        context_ids = self.tokenizer.encode(context)
        if len(context_ids) > budget:
            context = self.tokenizer.decode(context_ids[:budget])
        return context

    def generate_answer(self, question, context):
        """Generate an answer to ``question`` grounded in ``context``.

        Returns:
            The generated continuation with surrounding whitespace removed,
            or an ``"Error generating answer: ..."`` string if generation
            raised (best-effort: callers always receive a string).
        """
        try:
            context = self._truncate_context(context, question)
            prompt = self._build_prompt(context, question)

            response = self.generator(
                prompt,
                max_new_tokens=self.max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                truncation=True,
                do_sample=True,
                # Ask the pipeline for the continuation only, instead of
                # splitting the echoed prompt on "Answer:" (which breaks if
                # the model itself emits another "Answer:").
                return_full_text=False
            )

            return response[0]['generated_text'].strip()
        except Exception as e:
            return f"Error generating answer: {str(e)}"

    def query(self, question):
        """Run the complete RAG workflow: retrieve context, then generate."""
        context = "\n".join(self._retrieve_relevant_chunks(question))
        return self.generate_answer(question, context)



In [16]:
if __name__ == "__main__":
    # Build the chatbot from the restaurant PDF (path is relative to the
    # notebook's working directory).
    chatbot = RestaurantChatbot("respdfdata.pdf")

    # Sample questions used to smoke-test the RAG pipeline.
    queries = [
        "Which restaurants serve Biryani in Roorkee?",
        "What's the price range at Tamarind Restaurant?",
        "Which restaurants are open today?",
        "Where can I find Italian food in Roorkee?",
    ]

    # Echo each question first, then its generated answer and a separator.
    for query in queries:
        print(f"\nQ: {query}")
        answer = chatbot.query(query)
        print(f"A: {answer}")
        print("-" * 50)

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1454 > 1024). Running this sequence through the model will result in indexing errors



Q: Which restaurants serve Biryani in Roorkee?
A: 7. T.C. Restaurant - Address: 2318, Civil Lines, Roorkee - Contact: +919930689564 - Cuisine: North Indian, Chinese, Chinese - Opening Info: Open today (11am–1am, 9:45am–12am) - Price Range: ₹30-₹350 - Menu Highlights: - Main Course: Chicken (₹280), Chicken Biryani (₹240) - Sandwich
--------------------------------------------------

Q: What's the price range at Tamarind Restaurant?
A: Price range: ₹40-₹250 - Menu Highlights: - Middle Course: Chicken (₹250) - Main Course: Desserts: Vegetable Spring Rolls (₹120), Paneer Tikka (₹220), Chicken Biryani (₹250) - Margherita Pizza (₹250) - Margherita Pizza (₹250) - Desserts: Gulab Jamun (�
--------------------------------------------------

Q: Which restaurants are open today?
A: A. C. R. C. - Address: 3921 Haridwar Highway, Roorkee, Roorkee Locality, Roorkee - Contact: +91910853345 - Cuisine: Indian, German, Fried Chicken, Pasta, Hamburger, Vegetable Salad, Pasta, Cheese, Pasta, Ketchup, Toma