In [None]:
# Smoke test: prints "OK" to confirm the kernel is up and executing cells.
print("OK")

In [None]:
# Show the kernel's working directory before the next cell changes it.
%pwd

In [None]:
import os
# Step up one directory so later relative paths (e.g. "data") resolve from the parent folder.
# NOTE: the path is relative, so re-running this cell moves up one more level each time;
# restart the kernel before running it again.
os.chdir("../")

In [None]:
# Confirm the working directory after the change above.
%pwd

In [None]:
# Dependency check: these imports raise immediately if pydantic-core or
# langchain-core cannot be imported; the success message prints only when both load.
import pydantic_core
from langchain_core._api import deprecation
print("✅ pydantic-core & LangChain core OK")


In [None]:
# ===============================
# Imports
# ===============================
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# ===============================
# Function to load PDFs
# ===============================
def load_pdf_files(directory_path: str):
    """
    Read the PDF files found in a directory into LangChain documents.

    Files are selected with the glob pattern "*.pdf" and each one is parsed
    with PyPDFLoader via a DirectoryLoader.

    Args:
        directory_path (str): Folder that holds the PDF files.

    Returns:
        The list returned by DirectoryLoader.load().

    Raises:
        FileNotFoundError: If directory_path does not exist.
    """
    # Happy path first: only build the loader once the path is known to exist.
    if os.path.exists(directory_path):
        pdf_loader = DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
        return pdf_loader.load()

    raise FileNotFoundError(f"Directory not found: {directory_path}")

# ===============================
# Example usage
# ===============================
# Load every PDF under ./data (relative to the current working directory).
# Failures are reported with a friendly message and then re-raised: later cells
# need `extracted_data`, so continuing after a failed load would only surface
# as an unrelated NameError further down the notebook.
try:
    extracted_data = load_pdf_files("data")
    # len() counts the Document objects returned by the loader
    # (presumably one per PDF page rather than one per file — confirm).
    print(f"✅ Loaded {len(extracted_data)} PDF documents successfully.")
except FileNotFoundError as e:
    print(f"❌ {e}")
    raise
except Exception as e:
    print(f"❌ An error occurred while loading PDFs: {e}")
    raise


In [None]:
# Preview only the first few documents; displaying the whole list floods the
# notebook output and bloats the saved file.
extracted_data[:3]

In [None]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            RecursiveCharacterTextSplitter.split_documents().
        chunk_size (int): Forwarded to the splitter as `chunk_size`.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap (int): Forwarded to the splitter as `chunk_overlap`.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks returned by split_documents().
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Chunk every loaded document and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [None]:
# Build the Hugging Face embedding model used for both indexing and querying
def download_hugging_face_embeddings():
    """Create the sentence-embedding model wrapper.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        'sentence-transformers/all-MiniLM-L6-v2' model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )


In [None]:
# Instantiate the embedding model once; later cells reuse `embeddings` for the vector store.
embeddings = download_hugging_face_embeddings()

In [None]:
# Embed a sample string and print the vector length.
# NOTE(review): this length should equal the `dimension` the Pinecone index is
# created with further down (384) — confirm against the printed value.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the
# API keys below can be read with os.environ / os.getenv.
load_dotenv()
import os

In [None]:
# Read API credentials from the environment; never hardcode them in the notebook.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
# Environment variable names are case-sensitive on POSIX systems. Prefer the
# upper-case GROQ_API_KEY (the name the next cell reads) and fall back to the
# legacy mixed-case name so an existing .env that uses it keeps working.
groq_API_KEY=os.environ.get('GROQ_API_KEY') or os.environ.get('groq_API_KEY')

In [None]:
# Groq key under its upper-case name. The ChatGroq(...) call later in the notebook
# is not given a key explicitly, so it presumably reads GROQ_API_KEY from the
# environment itself — confirm.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # Use env variable


In [None]:
from pinecone import ServerlessSpec,Pinecone

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index that stores the document embeddings.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell is safe.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`
        # (all-MiniLM-L6-v2 is expected to produce 384 values — confirm with the length printed earlier).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, used by later cells.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import PineconeVectorStore



# Wrap the existing Pinecone index handle (`index`) so LangChain can write to it,
# using `embeddings` to turn text into vectors.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): re-running this cell presumably upserts the same chunks again
# under new IDs, duplicating them in the index — confirm, and run it only once
# per data load.
vector_store.add_documents(text_chunks)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Create the vector store from an existing Pinecone index.
# The index is looked up by name, and the same `embeddings` model is used so that
# queries are embedded the same way as the stored chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)


In [None]:
# Similarity-search retriever over the vector store; k=3 requests the top 3 matches per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Retrieve documents for the query
retrieved_docs = retriever.invoke("What is Acne?")

# Combine all retrieved document contents into a single context string.
# This preview joins with a single newline; rag_chain() below builds its own
# context with a blank line between documents.
context_text = "\n".join(doc.page_content for doc in retrieved_docs)


In [None]:
# Inspect the combined text of the retrieved documents.
context_text

In [None]:
from langchain_groq import ChatGroq

# Initialize the Groq chat model.
# No API key is passed here; the client presumably picks up GROQ_API_KEY from the
# environment — confirm.
# NOTE(review): max_tokens=100 caps every response; check that answers are not
# being cut off at this limit.
llm = ChatGroq(
    model="openai/gpt-oss-20b",  # model name
    temperature=0.0,             # deterministic responses
    max_tokens=100              # optional: set max tokens per response
)

In [None]:
# Ask the model directly, without any retrieved context, as a baseline for the RAG answer below.
llm.invoke("What is Acne?")

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# -------------------------------
# 1. Define the system prompt
# -------------------------------
# The trailing "{context}" placeholder is filled with the retrieved document
# text when the chain is invoked (see the "context" key in rag_chain).
system_prompt = (
    "You are a question answering assistant. "
    "Use the retrieved context below. "
    "If you don't know the answer, say you don't know. "
    "Be concise (3 sentences max).\n\n"
    "{context}"
)

# -------------------------------
# 2. Create the chat prompt
# -------------------------------
# Two messages: the system instructions (with context) and the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),  # Must match the key in invoke()
    ]
)

# -------------------------------
# 3. Initialize output parser
# -------------------------------
# Last stage of the chain built in rag_chain (prompt | llm | output_parser).
output_parser = StrOutputParser()

# -------------------------------
# 4. Define the RAG chain
# -------------------------------
def rag_chain(question):
    """Answer a question with retrieval-augmented generation.

    The question is used twice: as the retrieval query and as the `input`
    variable of the prompt. The retrieved documents' page contents are joined
    with blank lines and supplied as the prompt's `context` variable.

    Args:
        question: The user's question.

    Returns:
        The result of invoking the `prompt | llm | output_parser` pipeline.
    """
    # Fetch supporting passages and pack the prompt variables
    hits = retriever.invoke(question)
    passages = (hit.page_content for hit in hits)
    payload = {
        "context": "\n\n".join(passages),
        "input": question,  # fills {input} in the human message
    }

    # Compose prompt -> LLM -> output parser, then run it on the payload
    pipeline = prompt | llm | output_parser
    return pipeline.invoke(payload)

# -------------------------------
# 5. Test
# -------------------------------
# End-to-end check: retrieve context for the question, run the chain, print the answer.
response = rag_chain("What is acne?")
print(response)


In [None]:
jupyter nbconvert --clear-output --inplace backend/notebook/trials.ipynb
