In [None]:
import os
%pwd

os.chdir('../')
%pwd

In [None]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_classic.text_splitter import  RecursiveCharacterTextSplitter
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_classic.schema import Document
from src.logger import logging
from dotenv import load_dotenv
from src.exception import CustomException
from typing import List
from pinecone_text.sparse import BM25Encoder
import sys,boto3,json
from langchain_mistralai import MistralAIEmbeddings,ChatMistralAI
from pinecone import Pinecone,ServerlessSpec

In [None]:
# Load settings from the project's .env file (if present) so that the
# os.environ lookups in later cells can find the API keys.
load_dotenv()

In [None]:
def load_document(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Folder handed to ``DirectoryLoader``; files are selected
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        The list returned by ``DirectoryLoader.load()``.

    Raises:
        CustomException: Wraps any exception raised while loading or logging.
    """
    try:
        loader = DirectoryLoader(directory, glob='*.pdf', loader_cls=PyPDFLoader)
        pages = loader.load()
        logging.info('document loaded successfully')
        logging.info(f'total pages loaded: {len(pages)}')
    except Exception as err:
        raise CustomException(err, sys)
    return pages

In [None]:
# Load every PDF from the "data" directory (relative to the current working directory).
files=load_document("data")

In [None]:
# Spot-check on the page at index 14: its 'page_label' metadata converts to int.
int(files[14].metadata['page_label'])

In [None]:
# NOTE(review): hasattr() looks for an *attribute* named 'source', not a mapping
# key. metadata is read with [] and `in` elsewhere in this notebook, so the
# membership test in the following cell is the meaningful check.
hasattr(files[0].metadata,'source')

In [None]:
# Key-membership check: does the first page's metadata carry a 'source' entry?
'source' in files[0].metadata

In [None]:
def extract_useful_text(files:List[Document]) -> List:
    """Rebuild each page as a Document carrying only ``source`` and ``page_no`` metadata.

    Pages without ``page_content``/``metadata`` attributes, or whose metadata
    lacks the ``source`` or ``page_label`` keys, are skipped.

    ``page_label`` is not guaranteed to be numeric (front matter is commonly
    labelled with roman numerals such as ``"iv"``). A label that cannot be
    converted to ``int`` no longer aborts the whole extraction; the page is kept
    with ``page_no`` set to ``-1``, the same "unknown page" sentinel that
    ``upload_file`` uses as its default.

    Args:
        files: Documents as returned by the PDF loader.

    Returns:
        A list of new ``Document`` objects, one per accepted page.

    Raises:
        CustomException: Wraps any unexpected error raised while processing a page.
    """
    filtered_docs=[]
    for doc in files:
        try:
            if not (hasattr(doc,'page_content') and hasattr(doc,'metadata')):
                continue

            meta = doc.metadata
            if 'source' not in meta or 'page_label' not in meta:
                continue

            try:
                page_no = int(meta['page_label'])
            except (TypeError, ValueError):
                # Non-numeric label: keep the page, mark its number as unknown.
                page_no = -1
                logging.warning(
                    f"non-numeric page_label {meta['page_label']!r} in {meta['source']}; page_no set to -1"
                )

            filtered_docs.append(
                Document(
                    page_content=doc.page_content,
                    metadata={
                        'source':meta['source'],
                        'page_no':page_no
                    }
                )
            )

        except Exception as e:
            raise CustomException(e,sys)

    logging.info('Completed extraction of page_content & source')
    logging.info(f'total document created: {len(filtered_docs)}')
    return filtered_docs

In [None]:
# Strip the loaded pages down to text plus source/page_no metadata.
filtered_docs=extract_useful_text(files)

In [None]:
def split_text(doc:List[Document], chunk_size:int=800, chunk_overlap:int=200):
    """Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        doc: Documents to split.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 800,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            200, the value this function previously hard-coded.

    Returns:
        The list of chunks returned by ``split_documents``.

    Raises:
        CustomException: Wraps any error raised while splitting or logging.
    """
    try:
        text_splitter= RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        text=text_splitter.split_documents(doc)
        logging.info('Splitting completed')
        logging.info(f'total chunks: {len(text)}')
        return text

    except Exception as e:
        raise CustomException(e,sys)

In [None]:
# Chunk the filtered pages; `final_doc` is what gets embedded and uploaded below.
final_doc=split_text(filtered_docs)

In [None]:
# Accept the misspelled MISTERAL_API_KEY as a fallback for MISTRAL_API_KEY.
# os.environ only accepts strings, so assigning a missing (None) fallback would
# raise an opaque TypeError; fail with an actionable message instead.
if not os.getenv("MISTRAL_API_KEY"):
    _fallback_key = os.environ.get('MISTERAL_API_KEY')
    if not _fallback_key:
        raise EnvironmentError(
            "MISTRAL_API_KEY is not set (and no MISTERAL_API_KEY fallback was found)"
        )
    os.environ["MISTRAL_API_KEY"] = _fallback_key

In [None]:
# Turn LangSmith tracing on only when an API key is configured.
# The previous self-assignment of LANGSMITH_API_KEY changed nothing when the key
# was set and raised TypeError (None is not a valid os.environ value) when it was not.
if os.environ.get('LANGSMITH_API_KEY'):
    os.environ["LANGSMITH_TRACING"] = "true"
else:
    print("LANGSMITH_API_KEY is not set; LangSmith tracing left disabled")

In [None]:
# Show only whether the key is configured; echoing the value would store the
# secret in the notebook's saved output.
bool(os.environ.get('LANGSMITH_API_KEY'))

In [None]:
# Dense embedding model used by the dimension check and the vector store cells.
embeddings = MistralAIEmbeddings(model="mistral-embed")

In [None]:
# Length of one embedding vector; compare with the `dimension` passed to create_index below.
len(embeddings.embed_query(final_doc[15].page_content))

In [None]:
# Pinecone client built without explicit arguments.
# NOTE(review): presumably picks the API key up from the environment, whereas
# upload_file() passes os.environ["PINECONE_API_KEY"] explicitly - confirm.
pc=Pinecone()

In [None]:
# Names of the indexes that currently exist in this Pinecone project.
[i['name'] for i  in pc.list_indexes()]

In [None]:
index_name = "medical-chatbot"

# Create the serverless dot-product index only when no index with this name exists yet.
if not any(entry['name'] == index_name for entry in pc.list_indexes()):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    

In [None]:
# Peek at the first chunk (content and metadata).
final_doc[0]

In [None]:
import uuid

In [None]:
# S3 bucket name handed to upload_file(); None when BUCKET_NAME is not set.
bucket=os.environ.get('BUCKET_NAME')

In [None]:
def upload_file(
    documents: List[Document],user_id: str,bucket_name: str):
    """Archive documents to S3, index them in Pinecone, and return a hybrid retriever.

    Steps, in order:
      1. Keep only documents with non-empty ``page_content``.
      2. Embed the first text once to learn the embedding dimension.
      3. Fit a BM25 sparse encoder on all kept texts.
      4. Create the Pinecone index named ``user_id`` if it does not exist.
      5. For each document: compute dense + sparse vectors, store a JSON copy
         under ``{user_id}/{doc_id}.json`` in the bucket, then queue the vector.
         Queued vectors are upserted in batches of 50.

    Per-document failures (embedding, sparse encoding, S3 write) are reported
    and skipped. The vectors are computed *before* the S3 write so that a failed
    embedding does not leave an orphan object in the bucket.

    A failed batch upsert raises instead of being reported as a failure of the
    document that happened to trigger the flush; previously the failed batch was
    kept and re-sent, ever larger, with every subsequent document.

    Args:
        documents: Chunks to index; ``metadata`` may carry ``source`` and ``page_no``.
        user_id: Used as the Pinecone index name and as the S3 key prefix.
        bucket_name: Target S3 bucket.

    Returns:
        A ``PineconeHybridSearchRetriever`` bound to the index, the embedding
        model, and the fitted BM25 encoder.

    Raises:
        ValueError: If ``bucket_name`` is empty or no document has usable text.
        RuntimeError: If the S3 client cannot be created, the embedding probe
            fails, or a batch upsert fails.
    """
    # A missing bucket would otherwise surface as one swallowed failure per document.
    if not bucket_name:
        raise ValueError("bucket_name is required to archive documents in S3")

    try:
        s3 = boto3.client("s3")
        print("S3 client initialized")
    except Exception as e:
        raise RuntimeError(f"S3 init failed: {e}") from e

    valid_docs = []
    for doc in documents:
        if hasattr(doc, "page_content") and doc.page_content:
            valid_docs.append(doc)
        else:
            # Tolerate objects without metadata when reporting the skip.
            skipped_meta = getattr(doc, "metadata", None) or {}
            print(f"Skipping empty document: {skipped_meta.get('page_no', 'unknown')}")

    valid_texts = [doc.page_content for doc in valid_docs]

    if not valid_texts:
        raise ValueError("No valid document text found")

    print(f"Extracted {len(valid_texts)} valid documents")

    embed_model = MistralAIEmbeddings(model="mistral-embed")

    # Probe one embedding so the index is created with the model's real dimension.
    try:
        sample_vec = embed_model.embed_query(valid_texts[0])
        dim = len(sample_vec)
        print(f"Embedding dimension: {dim}")
    except Exception as e:
        raise RuntimeError(f"Embedding test failed: {e}") from e

    bm25 = BM25Encoder.default()
    bm25.fit(valid_texts)

    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    existing_indexes = pc.list_indexes().names()

    if user_id not in existing_indexes:
        pc.create_index(
            name=user_id,
            dimension=dim,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        print(f"Created Pinecone index: {user_id}")
    else:
        print(f"Using existing Pinecone index: {user_id}")

    index = pc.Index(user_id)

    vectors = []
    batch_size = 50
    failed_docs = 0

    def _flush(last_position):
        """Upsert the queued vectors and empty the queue; raise with batch context on failure."""
        try:
            index.upsert(vectors)
        except Exception as e:
            raise RuntimeError(
                f"Upsert failed for batch of {len(vectors)} ending at {last_position}: {e}"
            ) from e
        vectors.clear()

    for i, (doc, text) in enumerate(zip(valid_docs, valid_texts)):
        try:
            doc_id = uuid.uuid4().hex[:16]

            source = doc.metadata.get("source", "unknown")
            page_no = doc.metadata.get("page_no", -1)

            # Vectors first: nothing is written to S3 unless the document can be indexed.
            dense_vec = embed_model.embed_query(text)
            sparse_vec = bm25.encode_documents([text])[0]

            s3.put_object(
                Bucket=bucket_name,
                Key=f"{user_id}/{doc_id}.json",
                Body=json.dumps({
                    "id": doc_id,
                    "text": text,
                    "source": source,
                    "page_no": page_no
                }),
                ContentType="application/json"
            )
        except Exception as e:
            # Best effort per document: report and move on.
            failed_docs += 1
            print(f"Failed doc {i}: {e}")
            continue

        vectors.append({
            "id": doc_id,
            "values": dense_vec,
            "sparse_values": sparse_vec,
            "metadata": {
                "text": text,
                "source": source,
                "page_no": page_no,
                "s3_uri": f"s3://{bucket_name}/{user_id}/{doc_id}.json"
            }
        })

        if len(vectors) >= batch_size:
            _flush(i)
            print(f"Upserted batch ending at {i}")

    if vectors:
        _flush(len(valid_docs) - 1)
        print("Final batch upserted")

    if failed_docs:
        print(f"{failed_docs} of {len(valid_docs)} documents could not be processed")

    retriever = PineconeHybridSearchRetriever(
        embeddings=embed_model,
        sparse_encoder=bm25,
        index=index
    )

    return retriever


In [None]:
# Upload all chunks under the "medical-chatbot" index / key prefix and keep the hybrid retriever.
# NOTE: the name `retriever` is rebound further down to the MMR retriever built from `vectorstore`.
retriever=upload_file(final_doc,"medical-chatbot",bucket)

In [None]:
from langchain_pinecone import PineconeVectorStore

In [None]:
# Peek at the first chunk again before building the vector store.
final_doc[0]

In [None]:
# Attach to the index that upload_file() already populated. from_documents()
# would embed and upsert every chunk a second time, leaving duplicate vectors
# in 'medical-chatbot'. text_key matches the "text" metadata field written by
# upload_file().
vectorstore = PineconeVectorStore.from_existing_index(
    index_name='medical-chatbot',
    embedding=embeddings,
    text_key="text"
)

In [None]:
# MMR retriever over the vector store: 5 results chosen from 20 candidates, lambda_mult 0.5.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=5, fetch_k=20, lambda_mult=0.5),
)

In [None]:
# Sample query; the hits are unpacked into plain text in a later cell.
docs=retriever.invoke("Acne treatment")

In [None]:
from langchain_classic.prompts import ChatPromptTemplate,PromptTemplate
from langchain_core.messages  import SystemMessage,HumanMessage,AIMessage

In [None]:
# Single-string prompt with {context} and {question} slots that restricts
# answers to the supplied documents.
# NOTE(review): the chain defined later in this notebook is built with
# `prompt1`; `prompt` is not referenced by the other cells shown here.
prompt = PromptTemplate(
    template="""
        You are a medical question answering assistant.
        You must answer the user’s question using ONLY the information provided in the given documents (context).
        Do NOT use prior knowledge, assumptions, or external medical knowledge.

        Rules you MUST follow:
        1. If the answer is clearly stated in the documents, answer accurately and concisely.
        2. If the documents do NOT contain enough information to answer the question, say:
        "The provided documents do not contain sufficient information to answer this question."
        3. Do NOT add new medical advice, diagnoses, treatments, or recommendations that are not explicitly mentioned in the documents.
        4. Do NOT speculate or generalize beyond the text.
        5. If multiple documents provide relevant information, combine them logically.
        6. Use medically precise language, but keep explanations clear and simple.
        7. Do NOT mention document IDs, embeddings, vector stores, or retrieval mechanisms.

        Context:
        {context}

        User Question:
        {question}""",
        input_variables=['context','question'],
        validate_template=True
    )

In [None]:
# System instructions for the conversational chain: tone, safety rules, and how
# to resolve ambiguous references. Used as the system message of `prompt1`.
template="""You are a medical question answering assistant.

Answer the user’s question using ONLY the information provided in the documents.
Do not use external knowledge or assumptions.

Style and tone rules:
- Be friendly, natural, and conversational.
- Answer directly. Do NOT start your response with phrases like
  "Based on the provided documents",
  "According to the documents",
  or similar meta statements.
- Write as if explaining to a patient in simple, clear language.
- Do not mention documents, context, sources, or retrieval.

Safety rules:
1. If the documents clearly contain the answer, explain it simply and accurately.
2. If the documents do NOT contain enough information, say:
   "I don’t have enough information in the provided material to answer that."
3. Do not add medical advice, diagnoses, or treatments beyond what is explicitly stated.

If a reference is ambiguous, prefer the most recently mentioned condition rather than asking for clarification.

 """


In [None]:
def extract_req_text(query):
    """Return the ``page_content`` of every document the module-level ``retriever`` yields for ``query``.

    Args:
        query: Search text passed to ``retriever.invoke``.

    Returns:
        A list with one ``page_content`` value per retrieved document, in retrieval order.
    """
    return [hit.page_content for hit in retriever.invoke(query)]

In [None]:
# Plain-text view of the documents retrieved for the sample query.
req_text = [dox.page_content for dox in docs]
req_text

In [None]:
# Chat model shared by the answering chain and the summary-buffer memory.
llm = ChatMistralAI(model="mistral-large-latest", temperature=0.7, max_retries=2)


In [None]:
# Chat prompt: system rules, then the prior conversation, then the current turn.
# The memory objects in this notebook are created with return_messages=True, so
# `history` arrives as a list of message objects. A "{history}" slot inside a
# system string would stringify that list; the placeholder entry splices the
# messages into the conversation with their original roles instead.
prompt1 = ChatPromptTemplate.from_messages([
    ("system",template),
    ("placeholder", "{history}"),
    ("human", "Context:\n{context}\n\nQuestion:\n{question}")
])

In [None]:
# Message list seeded with the system instructions.
# NOTE(review): not referenced by the other cells shown here.
chat_history = [SystemMessage(content=template)]

In [None]:
from langchain_classic.memory import ConversationSummaryBufferMemory,ConversationBufferMemory
from langchain_classic.chains import LLMChain
from langchain_community.chat_message_histories import RedisChatMessageHistory


In [None]:
# Summary-buffer memory keyed on the "question" input, with a 1500-token limit.
memory = ConversationSummaryBufferMemory(
    llm=llm, max_token_limit=1500, return_messages=True, input_key="question"
)

In [None]:
REDIS_URL=os.environ.get('REDIS_URL')
# A Redis URL may embed a password, so report only whether it is configured
# instead of echoing the value into the notebook output.
REDIS_URL is not None

In [None]:
# User number interpolated into the Redis chat session id below.
user_no=1

In [None]:
# Redis-backed chat history; the session id is derived from `user_no`.
message_history = RedisChatMessageHistory(url=REDIS_URL, session_id=f"medical_chat_user_{user_no}")


In [None]:
# Buffer memory persisted through the Redis history; this is the memory the chain uses.
memory1 = ConversationBufferMemory(chat_memory=message_history, return_messages=True, input_key='question')

In [None]:
# Question-answering chain: chat prompt + Mistral model + Redis-backed memory; the answer is returned under 'text'.
chain = LLMChain(llm=llm, prompt=prompt1, memory=memory1, output_key='text')


In [None]:
# Interactive chat loop; type "exit" to stop.
while True:
    # Strip so that " exit " still quits and whitespace-only input is ignored.
    user_input = input("Your Query: ").strip()
    if user_input.lower() == "exit":
        break
    if not user_input:
        continue

    # extract_req_text() returns a list of passages. Join them into one block of
    # text so the prompt receives readable context rather than a Python list
    # repr (brackets, quotes, escaped newlines).
    context = "\n\n".join(extract_req_text(user_input))

    response = chain.invoke({
        "context": context,
        "question": user_input
    })

    print("Ai:", response["text"])


In [None]:
import redis
import json

# Dump the raw chat history stored in Redis for the current user.
r = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# Derive the key from `user_no` so it follows the session id given to
# RedisChatMessageHistory instead of being pinned to user 1.
# NOTE(review): "message_store:" is assumed to be the history class's key prefix - confirm.
key = f"message_store:medical_chat_user_{user_no}"

messages = r.lrange(key, 0, -1)

for m in messages:
    print(json.loads(m))


In [None]:
# Inspect the memory the chain actually uses: `memory1` is the object passed to
# LLMChain, whereas `memory` is never attached to a chain and therefore holds
# no conversation.
print(memory1.buffer)