In [1]:
print("!!Hello, World!!")

!!Hello, World!!


In [3]:
import datasets
print(datasets.__version__)


4.4.1


In [4]:
# Complete Medical RAG System - GenmedAssist
# Integrates PDF documents and Hugging Face symptom_to_diagnosis dataset

import os
import re
from pathlib import Path
from typing import List
from dotenv import load_dotenv

# LangChain imports
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Pinecone imports (keep as you used; adjust if your Pinecone client differs)
from pinecone import Pinecone, ServerlessSpec

# Hugging Face datasets
from datasets import load_dataset

# ===========================
# CONFIGURATION
# ===========================

# Set working directory
WORKING_DIR = r"D:\Sai Teja Honours\GenmedAssist"
os.chdir(WORKING_DIR)
print(f'CWD set to {os.getcwd()}')

# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# fail early if env vars missing
if not PINECONE_API_KEY:
    raise EnvironmentError("PINECONE_API_KEY not set. Please add it to your .env file or environment.")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY not set. Please add it to your .env file or environment.")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Index configuration
INDEX_NAME = "genmedassist"
EMBEDDING_DIMENSION = 384
LOAD_NEW_DATA = False  # Set to True to upload new documents

# (Continue with the rest of your pipeline here: loading PDFs, creating embeddings,
# creating/connecting to Pinecone index, loading HF dataset etc.)


CWD set to D:\Sai Teja Honours\GenmedAssist


In [5]:

# ===========================
# STEP 1: LOAD PDF DOCUMENTS
# ===========================

def load_pdf_files(data_dir: str) -> List[Document]:
    """Load all PDF files from the specified directory."""
    pdf_files = Path(data_dir).glob("*.pdf")
    documents = []
    for pdf in pdf_files:
        loader = PyPDFLoader(str(pdf))
        documents.extend(loader.load())
    print(f"‚úÖ Loaded {len(documents)} pages from PDF files")
    return documents

def filter_to_minimal_docs(documents: List[Document]) -> List[Document]:
    """Filter documents to minimal metadata (only source)."""
    minimal_docs = []
    for doc in documents:
        src = doc.metadata.get("source") if isinstance(doc.metadata, dict) else None
        minimal_docs.append(Document(page_content=doc.page_content, metadata={"source": src}))
    return minimal_docs



In [6]:
# ===========================
# STEP 2: LOAD HUGGING FACE DATASET
# ===========================

def load_symptom_diagnosis_dataset() -> List[Document]:
    """Load the Gretel AI symptom_to_diagnosis dataset and convert to Documents."""
    print("\nüì• Loading Hugging Face dataset: gretelai/symptom_to_diagnosis")
    
    # Load dataset
    ds = load_dataset("gretelai/symptom_to_diagnosis")
    
    # Convert to LangChain Documents
    documents = []
    
    # Process train split
    if 'train' in ds:
        for idx, example in enumerate(ds['train']):
            # Create a comprehensive text from the dataset fields
            content = f"""
Patient Case {idx + 1}:

Symptoms: {example.get('Patient_Symptoms', 'N/A')}
Diagnosis: {example.get('Diagnosis', 'N/A')}

Additional Information:
- Age: {example.get('Age', 'N/A')}
- Gender: {example.get('Gender', 'N/A')}
- Disease: {example.get('Disease', 'N/A')}
- Fever: {example.get('Fever', 'N/A')}
- Cough: {example.get('Cough', 'N/A')}
- Fatigue: {example.get('Fatigue', 'N/A')}
- Difficulty Breathing: {example.get('Difficulty Breathing', 'N/A')}
- Blood Pressure: {example.get('Blood Pressure', 'N/A')}
- Cholesterol Level: {example.get('Cholesterol Level', 'N/A')}
""".strip()
            
            doc = Document(
                page_content=content,
                metadata={
                    "source": "huggingface_symptom_diagnosis",
                    "dataset_index": idx,
                    "diagnosis": example.get('Diagnosis', ''),
                    "disease": example.get('Disease', '')
                }
            )
            documents.append(doc)
    
    print(f"‚úÖ Loaded {len(documents)} cases from Hugging Face dataset")
    return documents



In [7]:
# ===========================
# STEP 3: TEXT PROCESSING
# ===========================

def clean_text(text: str) -> str:
    """Remove unwanted characters and normalize spacing."""
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def text_split(documents: List[Document], chunk_size: int = 500, chunk_overlap: int = 20) -> List[Document]:
    """Clean and split documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    
    # Clean documents
    cleaned_docs = []
    for doc in documents:
        doc.page_content = clean_text(doc.page_content)
        cleaned_docs.append(doc)
    
    # Split documents
    texts_chunk = text_splitter.split_documents(cleaned_docs)
    
    # Clean chunks again
    for chunk in texts_chunk:
        chunk.page_content = clean_text(chunk.page_content)
    
    print(f"‚úÖ Created {len(texts_chunk)} text chunks")
    return texts_chunk



In [8]:
# ===========================
# STEP 4: EMBEDDINGS
# ===========================

def download_embeddings():
    """Download and return the HuggingFace embeddings model."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    print(f"‚úÖ Loaded embedding model: {model_name}")
    return embeddings



In [9]:
# ===========================
# STEP 5: PINECONE SETUP
# ===========================

def setup_pinecone_index(texts_chunk: List[Document], embedding, load_new: bool = False):
    """Create or connect to Pinecone index."""
    pc = Pinecone(api_key=PINECONE_API_KEY)
    INDEX_NAME = "genmedassist"
    # Create index if it doesn't exist
    if not pc.has_index(INDEX_NAME):
        print(f"üîß Creating new index: {INDEX_NAME}")
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    else:
        print(f"‚úÖ Index '{INDEX_NAME}' already exists")
    
    # Load or upload documents
    if load_new:
        print("‚¨ÜÔ∏è Uploading new documents to Pinecone...")
        docsearch = PineconeVectorStore.from_documents(
            documents=texts_chunk,
            embedding=embedding,
            index_name=INDEX_NAME,
        )
        print("‚úÖ Uploaded new documents to Pinecone")
    else:
        print("üîí Loading existing Pinecone index (no upload)")
        docsearch = PineconeVectorStore.from_existing_index(
            index_name=INDEX_NAME,
            embedding=embedding,
        )
    
    return docsearch



In [10]:
# ===========================
# STEP 6: RAG CHAIN SETUP
# ===========================

def create_rag_chain(docsearch):
    """Create the RAG chain with retriever and LLM."""
    # Create retriever
    retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    
    # Initialize ChatGPT model
    chatModel = ChatOpenAI(model="gpt-4", temperature=0)
    
    # Define system prompt
    system_prompt = (
        "You are a Medical assistant for diagnostic tasks that gives concise and accurate information. "
        "You will give diagnoses based on the context provided. For patients, use simple terms. "
        "For doctors, provide technical terms using ICD-10 codes when available. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, say that you don't know. "
        "Use three sentences maximum and keep the answer concise.\n\n"
        "{context}"
    )
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{input}"),
    ])
    
    # Create chains
    question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
    
    print("‚úÖ RAG chain created successfully")
    return rag_chain



In [None]:
# ===========================
# MAIN EXECUTION
# ===========================

def main():
    """Main execution function."""
    print("\n" + "="*60)
    print("üè• MEDICAL RAG SYSTEM - GenmedAssist")
    print("="*60 + "\n")
    
    # Step 1: Load PDF documents
    print("üìö Step 1: Loading PDF documents...")
    extracted_data = load_pdf_files("data")
    minimal_docs = filter_to_minimal_docs(extracted_data)
    
    # Step 2: Load Hugging Face dataset
    print("\nüìä Step 2: Loading Hugging Face dataset...")
    hf_documents = load_symptom_diagnosis_dataset()
    
    # Combine all documents
    all_documents = minimal_docs + hf_documents
    print(f"\n‚úÖ Total documents: {len(all_documents)}")
    print(f"   - PDF documents: {len(minimal_docs)}")
    print(f"   - HF dataset cases: {len(hf_documents)}")
    
    # Step 3: Split documents
    print("\nüìù Step 3: Splitting documents into chunks...")
    texts_chunk = text_split(all_documents)
    
    # Step 4: Load embeddings
    print("\nüßÆ Step 4: Loading embeddings model...")
    embedding = download_embeddings()
    
    # Step 5: Setup Pinecone
    print("\nüîß Step 5: Setting up Pinecone...")
    docsearch = setup_pinecone_index(texts_chunk, embedding, load_new=LOAD_NEW_DATA)
    
    # Step 6: Create RAG chain
    print("\nü§ñ Step 6: Creating RAG chain...")
    rag_chain = create_rag_chain(docsearch)
    
    # Test queries
    print("\n" + "="*60)
    print("üß™ TESTING THE SYSTEM")
    print("="*60 + "\n")
    
    test_queries = [
        "What is Acne? Symptoms and treatment options?",
        "What are the symptoms of diabetes?",
        "How to treat high blood pressure?",
        "What causes fever and cough together?"
    ]
    
    for query in test_queries:
        print(f"\n‚ùì Query: {query}")
        print("-" * 60)
        response = rag_chain.invoke({"input": query})
        print(f"üí° Answer: {response['answer']}")
        print()
    
    print("="*60)
    print("‚úÖ System ready for use!")
    print("="*60)
    
    return rag_chain



: 

In [None]:
# ===========================
# RUN THE SYSTEM
# ===========================

if __name__ == "__main__":
    rag_chain = main()
    
    # Interactive mode
    print("\nüéØ Enter your medical questions (type 'quit' to exit):")
    while True:
        user_input = input("\n‚ùì Your question: ")
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("üëã Goodbye!")
            break
        
        response = rag_chain.invoke({"input": user_input})
        print(f"\nüí° Answer: {response['answer']}")


üè• MEDICAL RAG SYSTEM - GenmedAssist

üìö Step 1: Loading PDF documents...
‚úÖ Loaded 637 pages from PDF files

üìä Step 2: Loading Hugging Face dataset...

üì• Loading Hugging Face dataset: gretelai/symptom_to_diagnosis


Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 853/853 [00:00<00:00, 132102.84 examples/s]
Generating test split:   0%|          | 0/212 [00:00<?, ? examples/s]