In [1]:
# Install required packages
!pip install pandas numpy scikit-learn xgboost faiss-cpu sentence-transformers transformers torch



In [2]:
# --- Imports ---
import os
import json
import joblib
import faiss
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import re
from typing import Dict, List, Any

  from .autonotebook import tqdm as notebook_tqdm


# 1. Data Loading and Preprocessing

In [3]:
def load_and_split_data(data_path: str) -> tuple:
    """Load Pima dataset and split into train/test"""
    try:
        df = pd.read_csv(data_path)
        print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        
        X = df.drop("Outcome", axis=1)
        y = df["Outcome"]
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        print(f"Train set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")
        print(f"Positive class ratio - Train: {y_train.mean():.2f}, Test: {y_test.mean():.2f}")
        
        return X_train, X_test, y_train, y_test
        
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

# Load data
data_path = "../data/raw/diabetes.csv"
X_train, X_test, y_train, y_test = load_and_split_data(data_path)

Dataset loaded: 768 rows, 9 columns
Train set: 614 samples
Test set: 154 samples
Positive class ratio - Train: 0.35, Test: 0.35


# 2. Train and Evaluate Baseline Classifiers

In [4]:
def train_classifiers(X_train, y_train, X_test, y_test) -> tuple:
    """Train and evaluate baseline classifiers"""
    
    # Logistic Regression
    print("\n=== Training Logistic Regression ===")
    logreg = LogisticRegression(max_iter=1000, random_state=42)
    logreg.fit(X_train, y_train)
    
    logreg_pred = logreg.predict(X_test)
    print("Logistic Regression Report:")
    print(classification_report(y_test, logreg_pred))
    
    # XGBoost
    print("\n=== Training XGBoost ===")
    xgb = XGBClassifier(
        use_label_encoder=False, 
        eval_metric="logloss",
        random_state=42,
        n_estimators=100
    )
    xgb.fit(X_train, y_train)
    
    xgb_pred = xgb.predict(X_test)
    print("XGBoost Report:")
    print(classification_report(y_test, xgb_pred))
    
    # Save models
    os.makedirs("../models", exist_ok=True)
    joblib.dump(logreg, "../models/logreg.pkl")
    joblib.dump(xgb, "../models/xgb.pkl")
    print("\nModels saved to ../models/")
    
    return logreg, xgb

# Train models
logreg, xgb = train_classifiers(X_train, y_train, X_test, y_test)


=== Training Logistic Regression ===
Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       100
           1       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154


=== Training XGBoost ===
XGBoost Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       100
           1       0.62      0.61      0.62        54

    accuracy                           0.73       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.73      0.73      0.73       154


Models saved to ../models/


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# 3. FAISS Index and RAG Components

In [5]:
def build_knowledge_base() -> tuple:
    """Build enhanced knowledge base with diabetes guidelines"""
    
    # Document collection with metadata
    knowledge_docs = [
        {
            "title": "WHO Diabetes Definition",
            "content": "Diabetes is a chronic disease characterized by elevated blood glucose levels...",
            "source": "WHO Guidelines"
        },
        {
            "title": "Risk Factors",
            "content": "Key diabetes risk factors include BMI, glucose intolerance, family history...",
            "source": "Clinical Guidelines"
        },
        # Add more documents as needed
    ]
    
    # Extract content and metadata
    docs_content = [doc["content"] for doc in knowledge_docs]
    docs_metadata = [{k: v for k, v in doc.items() if k != "content"} 
                    for doc in knowledge_docs]
    
    # Initialize embedding model
    print("Loading embedding model...")
    embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    
    # Create embeddings
    print("Creating document embeddings...")
    doc_embeddings = embedder.encode(docs_content, convert_to_numpy=True)
    
    # Build FAISS index
    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(doc_embeddings)
    
    print(f"FAISS index built with {len(docs_content)} documents")
    
    return embedder, index, docs_content, docs_metadata

# Build knowledge base
embedder, index, docs_content, docs_metadata = build_knowledge_base()

Loading embedding model...
Creating document embeddings...
FAISS index built with 2 documents


# 4. Pipeline Integration and Testing

In [8]:
def process_query(query: str, patient_features: List[float]) -> Dict[str, Any]:
    """
    Process a query about diabetes risk and return structured JSON response
    
    Args:
        query: User's question about diabetes risk
        patient_features: List of patient measurements
        
    Returns:
        JSON response with prediction, retrieved documents, and explanation
    """
    try:
        # Get classifier prediction
        input_data = np.array(patient_features).reshape(1, -1)
        probability = float(xgb.predict_proba(input_data)[0, 1])
        prediction = int(probability > 0.5)
        risk_level = "High Risk" if probability >= 0.7 else "Moderate Risk" if probability >= 0.4 else "Low Risk"
        
        # Retrieve relevant documents
        query_vec = embedder.encode([query], convert_to_numpy=True)
        distances, indices = index.search(query_vec, k=3)
        
        retrieved_docs = []
        for i, (idx, dist) in enumerate(zip(indices[0], distances[0])):
            doc = {
                "title": docs_metadata[idx]["title"],
                "content": docs_content[idx],
                "source": docs_metadata[idx]["source"],
                "relevance_score": float(1 / (1 + dist))
            }
            retrieved_docs.append(doc)
        
        # Generate explanation
        explanation = {
            "conclusion": f"Patient shows {risk_level.lower()} for diabetes",
            "reasoning": "Based on the model prediction and retrieved medical guidelines...",
            "sources": [doc["source"] for doc in retrieved_docs]
        }
        
        # Compile response
        response = {
            "query": query,
            "prediction": {
                "probability": probability,
                "risk_level": risk_level,
                "binary_outcome": prediction
            },
            "retrieved_documents": retrieved_docs,
            "explanation": explanation
        }
        
        return response
        
    except Exception as e:
        return {
            "error": str(e),
            "query": query
        }

In [13]:
# Test the pipeline
test_query = "What is my diabetes risk given my measurements?"
#test_patient = [6, 180, 90, 35, 100, 33.0, 0.9, 55]
test_patient=[1, 85, 66, 20, 0, 22.0, 0.1, 25]
# Process query
result = process_query(test_query, test_patient)

# Pretty print the JSON response
print(json.dumps(result, indent=2))

{
  "query": "What is my diabetes risk given my measurements?",
  "prediction": {
    "probability": 9.77476520347409e-05,
    "risk_level": "Low Risk",
    "binary_outcome": 0
  },
  "retrieved_documents": [
    {
      "title": "Risk Factors",
      "content": "Key diabetes risk factors include BMI, glucose intolerance, family history...",
      "source": "Clinical Guidelines",
      "relevance_score": 0.5293553471565247
    },
    {
      "title": "WHO Diabetes Definition",
      "content": "Diabetes is a chronic disease characterized by elevated blood glucose levels...",
      "source": "WHO Guidelines",
      "relevance_score": 0.4831613004207611
    },
    {
      "title": "Risk Factors",
      "content": "Key diabetes risk factors include BMI, glucose intolerance, family history...",
      "source": "Clinical Guidelines",
      "relevance_score": 2.938735877055719e-39
    }
  ],
  "explanation": {
    "conclusion": "Patient shows low risk for diabetes",
    "reasoning": "Based o