In [135]:
def get_namespace_from_service_account():
    """
    Reads the Kubernetes namespace from the service account mount point.
    Returns 'default' if not running in a Kubernetes pod or if the file doesn't exist.
    """
    namespace_file = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
    try:
        with open(namespace_file, 'r') as f:
            return f.read().strip()
    except IOError:
        return 'default'

In [136]:
%update_token

Token successfully refreshed.


## Agent Configuration  

This section covers the configuration of the agent, including:  
* Defining the data context that the agent will interact with  
* Setting up the routine the agent will follow as a system prompt (embedding the data context)  
* Establishing the list of tools available for the agent to complete its tasks  

In [137]:
## Agent Configuration
import os
from pathlib import Path
from llama_index.core import Settings
from llama_index.llms.nvidia import NVIDIA
from llama_index.embeddings.nvidia import NVIDIAEmbedding
import json
import inspect
import pandas as pd
import psycopg2
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from typing import Dict, List, Any, Optional
from datetime import datetime
import numpy as np
import kserve
from kserve import ModelServer
from kserve.model import Model
from kserve.utils.utils import get_predict_input, get_predict_response
from kubernetes import client, config
from kubernetes.client import ApiClient
import yaml
from kserve import KServeClient
from kserve import KServeClient
from kserve import constants
from kserve import utils
from kserve import V1beta1InferenceService
from kserve import V1beta1InferenceServiceSpec
from kserve import V1beta1PredictorSpec
from kserve import V1beta1ModelSpec
from kserve import V1beta1ModelFormat
from kubernetes.client import V1ResourceRequirements
from kubernetes.client.models import V1ObjectMeta  
import hashlib
import re
import mlflow
from mlflow.tracking import MlflowClient
from urllib.parse import urlparse

In [138]:
# 1. Kubernetes and Auth Setup (Same as other agents)
def get_namespace_from_service_account():
    namespace_file = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
    try:
        with open(namespace_file, 'r') as f:
            return f.read().strip()
    except IOError:
        return 'default'

def get_nvidia_auth_token():
    token_path = Path("/etc/secrets/ezua/.auth_token")
    if token_path.exists():
        with open(token_path, "r") as f:
            return f.read().strip()
    raise ValueError("NVIDIA auth token not found")

NAMESPACE = get_namespace_from_service_account()
nvidia_api_key = get_nvidia_auth_token()

In [139]:
# 2. LLM Configuration (Same as other agents)
llm = NVIDIA(
    base_url="https://llama-3-1-8b-6efc4543-predictor-ezai-services.hpepcai-ingress.pcai.hpecic.net/v1",
    model="meta/llama-3.1-8b-instruct",
    api_key=nvidia_api_key,
    temperature=0.1,
    max_tokens=1024
)
Settings.llm = llm

HTTP Request: GET https://llama-3-1-8b-6efc4543-predictor-ezai-services.hpepcai-ingress.pcai.hpecic.net/v1/models "HTTP/1.1 200 OK"


In [140]:
# 3. PostgreSQL Connection (Same as other agents)
def get_db_connection():
    return psycopg2.connect(
        host=f"vince-retail-postgres.{NAMESPACE}.svc.cluster.local",
        database="vince-retail",
        user="postgres",
        password="postgres",
        port="5432"
    )

In [141]:
## Enhanced MLflow Setup with Error Handling
def setup_mlflow():
    mlflow_uri = "http://mlflow.mlflow.svc.cluster.local:5000"
    s3_uri = 'http://local-s3-service.ezdata-system.svc.cluster.local:30000'
    experiment_name = 'vince-retail'
    
    mlflow.set_tracking_uri(mlflow_uri)
    
    # Test connection with retries
    max_retries = 3
    retry_delay = 2  # seconds
    
    for attempt in range(max_retries):
        # Create experiment if not exists
        try:
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if experiment is None:
                print("Creating new MLflow experiment...")
                mlflow.create_experiment(experiment_name)
        except Exception as e:
            print(f"Warning: Could not verify experiment - {str(e)}")
        
        mlflow.set_experiment(experiment_name)
        return

In [142]:
# 5. System Prompt
DATA_SCIENTIST_SYSTEM_PROMPT = f"""
You are a senior data scientist working with retail data. Your responsibilities include:

1. Predictive Modeling:
   - Analyze business requirements and identify prediction opportunities
   - Select appropriate algorithms based on data characteristics
   - Train, evaluate, and deploy models using MLflow
   - Serve models via KServe for production use

2. Data Preparation:
   - Work with Data Engineer agent to ensure clean data
   - Feature engineering for predictive tasks
   - Handle temporal splits for time-series data

3. Model Operations:
   - Version all models with MLflow
   - Monitor model performance
   - Retrain models when data drift detected

Available Prediction Tasks:
- Demand forecasting
- Customer lifetime value
- Churn prediction
- Recommendation systems
- Price optimization

Always:
- Explain your modeling approach
- Validate data suitability
- Track all experiments
- Document feature importance
"""

In [143]:
# 6. Core Prediction Tools
class PredictiveTools:
    def __init__(self):
        self.conn = get_db_connection()
        try:
            setup_mlflow()
        except Exception as e:
            print(f"Warning: MLflow not available - {str(e)}")
            self.mlflow_enabled = False
        else:
            self.mlflow_enabled = True

    def get_data_for_prediction(self, query: str) -> pd.DataFrame:
        """Execute SQL query and return as DataFrame"""
        with self.conn.cursor() as cursor:
            cursor.execute(query)
            columns = [desc[0] for desc in cursor.description]
            data = cursor.fetchall()
            return pd.DataFrame(data, columns=columns)

    def _log_to_mlflow(self, model, metrics, artifact_path):
        """Safe logging with MLflow availability check"""
        if not self.mlflow_enabled:
            print("MLflow not available - skipping logging")
            return None
            
        try:
            with mlflow.start_run():
                for name, value in metrics.items():
                    mlflow.log_metric(name, value)
                model_info = mlflow.sklearn.log_model(model, artifact_path)
                return model_info.model_uri
        except Exception as e:
            print(f"MLflow logging failed: {str(e)}")
            return None

    def train_demand_forecast_model(self, product_id: str = None):
        """Train demand forecast model using correct schema"""
        # Build query with proper column names
        query = """
        SELECT 
            o.order_date as date,
            op.product_id,
            op.product_quantity as quantity,
            c.price_cents/100.0 as price  -- Convert cents to dollars
        FROM source_order_products op
        JOIN source_orders o ON op.order_id = o.order_id
        JOIN source_catalog c ON op.product_id = c.product_id
        """
        
        if product_id:
            query += f" WHERE op.product_id = '{product_id}'"
        
        try:
            df = self.get_data_for_prediction(query)
            
            if df.empty:
                return {"status": "error", "message": "No data found for training"}
            
            # Feature engineering
            df['date'] = pd.to_datetime(df['date'])
            df = df.set_index('date').sort_index()
            df['day_of_week'] = df.index.dayofweek
            df['month'] = df.index.month
            df['year'] = df.index.year
            
            # Model training
            from sklearn.ensemble import RandomForestRegressor
            from sklearn.metrics import mean_absolute_error
            
            X = df[['day_of_week', 'month', 'year', 'price']]
            y = df['quantity']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
            
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            
            # Evaluation
            predictions = model.predict(X_test)
            mae = mean_absolute_error(y_test, predictions)
            
            # MLflow logging
            model_uri = self._log_to_mlflow(
                model=model,
                metrics={"mae": mae},
                artifact_path="demand-forecast-model"
            )
            
            return {
                "status": "success",
                "metrics": {"mae": mae},
                "model_uri": model_uri,
                "features_used": list(X.columns),
                "training_samples": len(X_train)
            }
            
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_production_model_uri(self, model_name: str) -> str:
        """Get the proper S3 URI for a production model"""
        try:
            client = MlflowClient()
            
            # 1. Get production model
            latest_versions = client.get_latest_versions(
                name=model_name, 
                stages=["Production"]
            )
            
            if not latest_versions:
                raise ValueError(f"No production model found for {model_name}")
            
            # 2. Convert to proper S3 URI
            source_uri = latest_versions[0].source
            
        except Exception as e:
            raise ValueError(f"Failed to get model URI: {str(e)}")

    def deploy_model_to_kserve(self, model_name: str) -> Dict:
        """Deploy with proper S3 URI handling"""
        try:
            # 1. Get production model URI
            model_uri = self.get_production_model_uri(model_name)
            print(f"Resolved model URI: {model_uri}")
            
            # 2. Generate compliant name
            deploy_name = self._generate_compliant_name(model_name)
            
            # 3. Create KServe client and specs
            kserve_client = KServeClient()
            model_spec = V1beta1ModelSpec(
                model_format=V1beta1ModelFormat(name="mlflow"),
                storage_uri=model_uri,
                runtime="kserve-mlserver",
                resources=V1ResourceRequirements(
                    requests={"cpu": "1", "memory": "2Gi"},
                    limits={"cpu": "1", "memory": "2Gi"}
                )
            )
            
            # 4. Deploy
            isvc = V1beta1InferenceService(
                api_version="serving.kserve.io/v1beta1",
                kind="InferenceService",
                metadata=V1ObjectMeta(name=deploy_name, namespace=NAMESPACE),
                spec=V1beta1InferenceServiceSpec(
                    predictor=V1beta1PredictorSpec(model=model_spec)
                )
            )
            kserve_client.create(isvc)
            
            # 5. Wait for deployment
            kserve_client.wait_isvc_ready(deploy_name, namespace=NAMESPACE, timeout_seconds=300)
            
            return {
                "status": "deployed",
                "deployment_name": deploy_name,
                "model_name": model_name,
                "model_uri": model_uri,
                "message": f"Deployed {model_name} from {model_uri}"
            }
            
        except Exception as e:
            return {"status": "error", "error": str(e)}

In [144]:
# 7. KServe Model Wrapper
class RetailerModel(Model):
    def __init__(self, name: str):
        super().__init__(name)
        self.model = None
        self.ready = False

    def load(self):
        # Load model from MLflow
        model_uri = f"models:/{self.name}/latest"
        self.model = mlflow.pyfunc.load_model(model_uri)
        self.ready = True
        return self.ready

    def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
        input_data = get_predict_input(payload)
        prediction = self.model.predict(input_data)
        return get_predict_response(payload, prediction, self.name)

In [145]:
class DataScientistAgent:
    def __init__(self):
        self.tools = PredictiveTools()
        self.deployed_models = {}
    
    def run_prediction_pipeline(self, task: str, params: Dict):
        """End-to-end prediction workflow with proper error handling"""
        try:
            print(f"\nStarting {task} prediction pipeline...")
            
            if task == "demand_forecast":
                # 1. Train model
                train_result = self.tools.train_demand_forecast_model(
                    product_id=params.get("product_id")
                )
                
                if train_result.get("status") != "success":
                    return train_result
                
                # 2. Deploy model
                model_name = f"demand-forecast-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
                deploy_result = self.tools.deploy_model_to_kserve(
                    train_result["model_uri"],
                    model_name
                )

                # 3. Verify and store deployment
                if deploy_result.get("status") == "deployed":
                    ready = self._wait_for_deployment(model_name)
                    if not ready:
                        return {
                            "status": "error",
                            "message": "Deployment timeout",
                            "model_name": model_name
                        }
                    
                    # Get the real endpoint URL
                    deploy_result["endpoint"] = self._get_kserve_url(model_name)
                    self.deployed_models[model_name] = {
                        "train_metrics": train_result["metrics"],
                        "deploy_info": deploy_result,
                        "deploy_time": datetime.now().isoformat()
                    }
                
                return deploy_result
            
            return {"status": "error", "message": "Unsupported task"}
        
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def _get_kserve_url(self, model_name: str) -> str:
        """Get the actual KServe endpoint URL"""
        config.load_incluster_config()
        api = client.CustomObjectsApi()
        
        isvc = api.get_namespaced_custom_object(
            group="serving.kserve.io",
            version="v1beta1",
            namespace=NAMESPACE,
            plural="inferenceservices",
            name=model_name
        )
        
        if 'status' in isvc and 'url' in isvc['status']:
            return isvc['status']['url']
        return f"http://{model_name}-predictor-default.{NAMESPACE}.svc.cluster.local"

    def _wait_for_deployment(self, model_name: str, timeout_sec: int = 300) -> bool:
        """Wait for KServe deployment to be ready"""
        config.load_incluster_config()
        api = client.CustomObjectsApi()
        
        start_time = time.time()
        while time.time() - start_time < timeout_sec:
            try:
                isvc = api.get_namespaced_custom_object(
                    group="serving.kserve.io",
                    version="v1beta1",
                    namespace=NAMESPACE,
                    plural="inferenceservices",
                    name=model_name
                )
                
                if 'status' in isvc and 'conditions' in isvc['status']:
                    for condition in isvc['status']['conditions']:
                        if condition['type'] == 'Ready' and condition['status'] == 'True':
                            return True
                
                time.sleep(5)
            except Exception:
                time.sleep(5)
        
        return False

In [146]:
# 9. Interactive Interface
def run_data_scientist_agent():
    agent = DataScientistAgent()
    
    print("""
    Data Scientist Agent (KServe Integrated)
    --------------------------------------
    Available commands:
    - forecast <product_id>: Run demand forecast pipeline
    - list_models: Show deployed models
    - delete <model_name>: Delete a deployed model
    - exit: Quit
    """)
    
    while True:
        try:
            command = input("\nCommand: ").strip().lower()
            
            if command.startswith("forecast"):
                product_id = command.split()[1] if len(command.split()) > 1 else None
                result = agent.run_prediction_pipeline(
                    "demand_forecast",
                    {"product_id": product_id}
                )
                
                if result.get("status") == "deployed":
                    print(f"\n🎉 Model deployed successfully!")
                    print(f"Name: {result['model_name']}")
                    print(f"Endpoint: {result['endpoint']}")
                    print(f"Test with: curl -X POST {result['endpoint']}/v1/models/{result['model_name']}:predict")
                else:
                    print(f"\n❌ Deployment failed: {result.get('error', 'Unknown error')}")
                
            elif command == "list_models":
                if not agent.deployed_models:
                    print("No models currently deployed")
                else:
                    print("\nDeployed Models:")
                    for name, info in agent.deployed_models.items():
                        print(f"\n🔹 {name}")
                        print(f"Deployed: {info['deploy_time']}")
                        print(f"Endpoint: {info['deploy_info']['endpoint']}")
                        print(f"Metrics: {info['train_metrics']}")
                        
            elif command.startswith("delete "):
                model_name = command.split()[1]
                # Add delete functionality here
                print(f"Delete functionality for {model_name} would be implemented here")
                
            elif command in ["exit", "quit"]:
                break
                
            else:
                print("Invalid command. Try 'forecast <product_id>', 'list_models', or 'exit'")
                
        except Exception as e:
            print(f"Error: {str(e)}")

if __name__ == "__main__":
    run_data_scientist_agent()


    Data Scientist Agent (KServe Integrated)
    --------------------------------------
    Available commands:
    - forecast <product_id>: Run demand forecast pipeline
    - list_models: Show deployed models
    - delete <model_name>: Delete a deployed model
    - exit: Quit
    



Command:  forecast 1



Starting demand_forecast prediction pipeline...


2025/04/02 11:47:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run classy-gnu-247 at: http://mlflow.mlflow.svc.cluster.local:5000/#/experiments/22/runs/de312a1b81ea4b8abffd77d09041efeb.
2025/04/02 11:47:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow.mlflow.svc.cluster.local:5000/#/experiments/22.



❌ Deployment failed: Unknown error



Command:  exit


In [147]:
%update_token

Token successfully refreshed.


##### Agent Runtime
This section covers the code executed while the agent is in action, including:
* Preparing the tools for use by the agent
* The agent's runtime function

## Running the Agent
### Sample Questions:
1. What are our top-selling products by revenue and quantity sold?
2. Who are our top 10 customers by total spend and order frequency?
3. Which products have the lowest stock levels relative to their sales velocity?
4. Which product categories generate the highest profit margins?
5. What is our order fulfillment rate and average time to fulfill orders?
6. How has our customer base grown over time?
7. What are the seasonal trends in our product categories?
8. What products are frequently purchased together?
9. What percentage of customers make repeat purchases?
10. Which customer segments are most profitable when considering acquisition cost and lifetime value?

## Cleanup

In [2]:
!kubectl get isvc

No resources found in vincent-charbon-ed7eea62 namespace.
