# Audit Sprint 4: Functional Prototype Verification

This notebook audits the current state of the project (Sprint 4). It verifies:
1. **Environment**: Correct installation of key libraries.
2. **Data Ingestion**: Processing of raw SAS/XPT files.
3. **Scratch Implementation**: Functionality of the custom XGBoost class.
4. **Productive Pipeline**: Baseline training with PyCaret (note: this step is skipped when PyCaret cannot be imported, e.g. on Python 3.12).
5. **Application**: UI component readiness.

## 1. Environment Verification

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import sys
import os

# Probe for the optional PyCaret dependency. The flag starts out False and is
# flipped only after the import has actually succeeded.
pycaret_available = False
try:
    import pycaret
except (ImportError, RuntimeError) as e:
    # The package is missing or refused to load in this interpreter.
    print(f"⚠️ PyCaret not available (likely Python 3.12 issue): {e}")
else:
    pycaret_available = True
    print(f"PyCaret Version: {pycaret.__version__}")

# Report interpreter and library versions: one line per component, emitted in
# the same order by a single print call.
print(
    f"Python Version: {sys.version}",
    f"Pandas Version: {pd.__version__}",
    f"Numpy Version: {np.__version__}",
    f"XGBoost Version: {xgb.__version__}",
    f"Streamlit Version: {st.__version__}",
    sep="\n",
)

# Make the parent of the current working directory importable, so that
# `from src...` imports resolve when the notebook runs from a subdirectory.
# abspath() resolves the relative '..' against os.getcwd().
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to sys.path")

## 2. Data Ingestion Audit
Verifying `src/data_ingestion.py`. We will use the sample file `LLCP2022_10rows.xpt`.

In [None]:
from src.data_ingestion import load_and_process_data, split_data
import os

# Paths, relative to the notebook's working directory.
raw_path = "../data/01_raw/LLCP2022_10rows.xpt"
output_dir = "../data/02_intermediate"
# Location where this audit expects the ingestion step to leave its Parquet output.
processed_path = os.path.join(output_dir, "processed_data.parquet")

if not os.path.exists(raw_path):
    # Nothing in this cell generates substitute data, so report the skip
    # honestly instead of claiming that mock data is being used.
    print(f"⚠️ Warning: Raw file {raw_path} not found. Skipping data ingestion audit.")
else:
    print(f"Found raw file at {raw_path}")

    try:
        # Run ingestion; a None result is reported as a failure below.
        df = load_and_process_data(raw_path, output_dir)

        if df is not None:
            print("✅ Data loaded successfully.")
            print(f"Shape: {df.shape}")

            # Either candidate column is accepted as the target.
            if 'CVDINFR4' in df.columns or 'CVDCRHD4' in df.columns:
                print("✅ Target variable found (CVDINFR4 or CVDCRHD4).")
            else:
                print("❌ Target variable NOT found.")

            # Check SEQNO index
            if df.index.name == 'SEQNO':
                print("✅ Index is correctly set to SEQNO.")
            else:
                print(f"⚠️ Index is {df.index.name}, expected SEQNO.")

            # Verify Parquet creation
            if os.path.exists(processed_path):
                print(f"✅ Parquet file created at {processed_path}")
            else:
                print("❌ Parquet file NOT created.")
        else:
            print("❌ Data ingestion failed (returned None).")
    except Exception as e:
        # Keep the notebook running, but show where the failure came from,
        # matching the error reporting of the model audit cells.
        print(f"❌ Data ingestion raised error: {e}")
        import traceback
        traceback.print_exc()

## 3. Scratch Model Implementation Audit
Verifying `src/model.py` (XGBoost from scratch) using synthetic data.

In [None]:
from src.model import XGBoostScratch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Build a small, seeded synthetic classification problem and hold out 20% of
# it for evaluation.
X, y = make_classification(n_samples=100, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Scratch Model on {len(X_train)} samples...")

try:
    # A tiny ensemble (5 estimators, depth 2) is enough to exercise the API.
    model_scratch = XGBoostScratch(n_estimators=5, max_depth=2, learning_rate=0.1)
    model_scratch.fit(X_train, y_train)
    print("✅ Model fitted successfully.")

    # Exercise both inference entry points on the held-out split.
    y_pred = model_scratch.predict(X_test)
    y_proba = model_scratch.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Prediction successful. Accuracy: {acc:.2f}")
    print(f"Probabilities sample: {y_proba[:5]}")

except Exception as e:
    # Report the failure together with its traceback without stopping the notebook.
    import traceback
    print(f"❌ Scratch Model verification failed: {e}")
    traceback.print_exc()

## 4. Productive Pipeline (PyCaret) Audit
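Smoke-testing the PyCaret workflow associated with `src/train_pycaret.py` by running a minimal `setup` / `compare_models` pass directly in this notebook (the script itself is not imported).
**Note:** Errors here are caught so they don't stop the whole notebook.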

In [None]:
import pandas as pd

# Load data
data_path = "../data/02_intermediate/processed_data.parquet"

# Only run when the processed Parquet file exists and PyCaret could be imported.
if os.path.exists(data_path) and pycaret_available:
    try:
        from pycaret.classification import setup, compare_models, pull
        df = pd.read_parquet(data_path)

        # Prefer CVDINFR4 and fall back to CVDCRHD4. Fail with a clear message
        # when neither is present, rather than announcing a target that does
        # not exist and letting PyCaret fail later with an unrelated error.
        target_col = next(
            (name for name in ('CVDINFR4', 'CVDCRHD4') if name in df.columns),
            None,
        )
        if target_col is None:
            raise ValueError(
                "Neither CVDINFR4 nor CVDCRHD4 is present in the processed data."
            )

        print(f"Using target: {target_col}")

        # Minimal PyCaret Setup
        print("Initializing PyCaret Setup...")
        # Use a small sample to speed up the audit; head() already returns the
        # whole frame when it holds fewer than 50 rows.
        sample_df = df.head(50)

        exp = setup(data=sample_df, target=target_col, session_id=123, verbose=False, html=False)
        print("✅ PyCaret Setup initialized.")

        # Compare Models (restricted to two estimators to keep the run short)
        print("Running compare_models (budget mode)...")
        best = compare_models(include=['lr', 'dt'], n_select=1)

        print("✅ compare_models execution finished.")
        print(pull())

    except Exception as e:
        # Errors are reported, not raised, so the rest of the notebook still runs.
        print(f"❌ PyCaret execution failed: {e}")
        import traceback
        traceback.print_exc()
else:
    print("⚠️ Skipping PyCaret audit (Data missing or Library unavailable).")

## 5. UI Application Check
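Verifying `PyCaretAdapter` from `src/adapters.py` against a mock model. `src/app.py` itself is not executed here; the command to launch it is printed at the end.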

In [None]:
from src.adapters import PyCaretAdapter
import pickle

# Check Adapter
try:
    # Stand-in for a trained model: returns the same probability pair for
    # every input row.
    class MockModel:
        def predict_proba(self, X):
            return np.array([[0.1, 0.9]] * len(X))

    # Seeded generator so the mock input is reproducible, in line with the
    # fixed seeds used by the other audit cells.
    X_mock = pd.DataFrame(np.random.default_rng(42).random((5, 5)))

    adapter = PyCaretAdapter(MockModel())
    probs = adapter.predict_proba(X_mock)
    preds = adapter.predict(X_mock, threshold=0.5)

    print("✅ PyCaretAdapter works with mock model.")
    print(f"Probs shape: {probs.shape}, Preds shape: {preds.shape}")

except Exception as e:
    # Report the failure with its traceback, matching the model audit cells.
    print(f"❌ Adapter verification failed: {e}")
    import traceback
    traceback.print_exc()

print("\nTo run the UI, execute command in terminal:")
print("streamlit run src/app.py")