In [None]:
import os
import time
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# === Step 0: Load API keys from .env ===
# load_dotenv() populates the process environment from a local .env file so
# the keys can be read with os.getenv below.
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message: without this check a missing key only
# surfaces later as an authentication error from Pinecone or OpenAI, after
# the PDFs have already been loaded and embedded.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# === Step 1: Load and Split PDF Data ===
def load_pdf_file(data_path, glob="*.pdf"):
    """Load the PDF files found in ``data_path``.

    Args:
        data_path: Directory to search for PDF files.
        glob: File pattern handed to ``DirectoryLoader``. The default
            ``"*.pdf"`` keeps the original behaviour (files directly inside
            ``data_path``); a pattern such as ``"**/*.pdf"`` can be passed
            when the PDFs live in sub-directories.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    loader = DirectoryLoader(data_path, glob=glob, loader_cls=PyPDFLoader)
    return loader.load()

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``load_pdf_file``.
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter`` (defaults to the original 500).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (defaults to the original 20).

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# Load the PDFs from Data/ and split them into chunks for embedding.
extracted_data = load_pdf_file('Data/')
text_chunks = text_split(extracted_data)
print("✅ Text Chunks Loaded:", len(text_chunks))

# Stop here when nothing was loaded: the steps below delete and recreate the
# Pinecone index, and an index rebuilt from zero chunks can never return any
# context to the LLM.
if not text_chunks:
    raise RuntimeError(
        "No text chunks were produced from 'Data/'. "
        "Check that the directory contains readable *.pdf files."
    )

# === Step 2: Embedding ===
# Embedding model shared by the rest of the script.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Smoke test: embed a short probe string and report the vector length.
_probe_vector = embeddings.embed_query("hello world")
print("✅ Test Embedding Length:", len(_probe_vector))

# === Step 3: Pinecone Setup ===
pc = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = "medicalchatbot"
INDEX_READY_TIMEOUT_S = 300  # upper bound on the readiness wait below

# Size the index from the embedding model itself so the two cannot drift
# apart if the model is ever swapped.
index_dimension = len(embeddings.embed_query("dimension probe"))

# Every run starts from a fresh index. Checking for existence first keeps the
# first run quiet and lets real failures (bad API key, network) surface
# instead of being reduced to a warning by a blanket except.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

pc.create_index(
    name=INDEX_NAME,
    dimension=index_dimension,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Poll until the index reports ready, but give up after a deadline rather
# than spinning forever if it never becomes ready.
_ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(INDEX_NAME).status['ready']:
    if time.monotonic() >= _ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{INDEX_NAME}' was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds"
        )
    print("⏳ Waiting for Pinecone to be ready...")
    time.sleep(2)
print("✅ Pinecone is ready.")

# === Step 4: Create VectorStore & Retriever ===
# Build the vector store from the chunks with the embedding model, targeting
# the "medicalchatbot" index.
docsearch = PineconeVectorStore.from_documents(
    index_name="medicalchatbot",
    embedding=embeddings,
    documents=text_chunks,
)

# Similarity retriever configured with k=3.
_search_settings = {"k": 3}
retriever = docsearch.as_retriever(
    search_kwargs=_search_settings,
    search_type="similarity",
)

# === Step 5: Load OpenAI LLM ===
# The model name can be overridden with the OPENAI_MODEL environment variable
# (or a line in .env), so an account without access to the default can pick
# another model without editing the code; the default stays "gpt-4".
llm = ChatOpenAI(
    model_name=os.getenv("OPENAI_MODEL", "gpt-4"),
    temperature=0.4,
    openai_api_key=OPENAI_API_KEY
)

# === Step 6: RAG Prompt ===
# System instructions; "{context}" is left as a template placeholder and the
# blank entry yields the empty line that separates it from the instructions.
system_prompt = "\n".join([
    "You are an assistant for question-answering tasks.",
    "Use the following pieces of context to answer the question concisely.",
    "If you don't know the answer, say 'I don't know.'",
    "",
    "{context}",
])

# Chat template: the system instructions first, then the user's question
# supplied through the "{input}" placeholder.
_chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(_chat_messages)

# qa_chain combines the LLM with the prompt; rag_chain puts the retriever in
# front of it.
qa_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, qa_chain)

# === Step 7: Ask a Question ===
# NOTE(review): the question text looks like an unfinished placeholder
# (nothing follows "What is") — confirm the intended query.
_question = "What is ?"
response = rag_chain.invoke({"input": _question})
print("🧠 ANSWER:", response["answer"])


  from .autonotebook import tqdm as notebook_tqdm
  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


✅ Text Chunks Loaded: 0
✅ Test Embedding Length: 384
✅ Pinecone is ready.


  llm = ChatOpenAI(


NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}