In [None]:
from utils import inference_validator, feature_engineering
import pandas as pd
# Lift pandas' display limits so frames are rendered in full.
# For each option below, None means "no limit" (or auto-detect for width).
pd.options.display.max_rows = None      # unlimited rows
pd.options.display.max_columns = None   # unlimited columns
pd.options.display.width = None         # auto-detect width
pd.options.display.max_colwidth = None  # unlimited column width

In [None]:
import os

import groq

# Initialize the Groq client.
# The API key is read from the GROQ_API_KEY environment variable so that the
# secret never lives in the notebook source or its history. Any key that was
# previously committed in this cell must be treated as leaked and revoked.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "export it before running this cell."
    )
client = groq.Client(api_key=_groq_api_key)

import numpy as np

def get_risk_score_reasoning(user_df, prediction):
    """
    Ask the LLM behind the module-level ``client`` to explain a risk score.

    Args:
        user_df (pd.DataFrame): User data after feature engineering; it is
            rendered with ``to_string()`` and embedded verbatim in the prompt.
        prediction (np.ndarray): Predicted risk score; only the first element
            is used.

    Returns:
        str: Message content of the first choice in the chat completion.
    """
    score = prediction[0]  # scalar risk score
    features_text = user_df.to_string()

    # The prompt is assembled from three parts: context, instructions, format.
    context_part = (
        f"You are a financial risk analyst. A user has applied for a loan, and the machine learning model has assigned them a risk score of {score:.2f}.\n\n"
        "Below are the user's details after feature engineering:\n"
        f"{features_text}\n\n"
    )
    instructions_part = (
        "### Instructions:\n"
        "- Provide a detailed explanation of why this person received the given risk score.\n"
        "- Highlight key features that contributed to the risk score (e.g., high debt-to-income ratio, low credit score, late payments, etc.).\n"
        "- Explain how these features impact the user's financial risk.\n"
        "- Mention any factors that could improve or worsen their risk score.\n"
        "- Provide actionable recommendations for the user to reduce their risk score if applicable.\n\n"
        "- Remember risk score between 27-40 kind of low risk, 40-60 kind of moderate risk and above 60 kind of high risk. more than 75 extremely high. \n\n"
    )
    format_part = (
        "### Output Format:\n"
        "- Output ONLY a plain text explanation without any additional formatting or labels.\n"
    )
    prompt = context_part + instructions_part + format_part

    chat_messages = [
        {"role": "system", "content": "You are a financial risk analyst."},
        {"role": "user", "content": prompt},
    ]

    # temperature=0 and a 250-token cap are passed through to the completion call.
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=chat_messages,
        max_tokens=250,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content



In [None]:
import logging
import joblib
import os
import xgboost
import json

# Module-level logger used by the serving functions defined in this notebook.
# Its threshold is lowered to DEBUG; no handler is attached here.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

def model_fn(model_dir):
    """
    Load model artifacts from the model_dir for SageMaker model serving.

    Args:
        model_dir (str): Directory where model artifacts are stored.

    Returns:
        dict: Loaded artifacts under the keys 'nimputer', 'cimputer',
            'ordinalencoder', 'targetencoder', 'kmean', 'before_columns'
            and 'xgb' (the CatBoost regressor; see note below).

    Raises:
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    # Artifact key -> joblib file name, loaded in this order.
    # Note the key 'kmean' is backed by the file 'kmeans.joblib'.
    joblib_files = {
        'nimputer': 'nimputer.joblib',
        'cimputer': 'cimputer.joblib',
        'ordinalencoder': 'ordinalencoder.joblib',
        'targetencoder': 'targetencoder.joblib',
        'kmean': 'kmeans.joblib',
    }

    try:
        logger.info(f"Loading model from: {model_dir}")

        artifacts = {}

        # NOTE(security): joblib.load unpickles arbitrary Python objects, so
        # model_dir must only ever contain trusted artifacts.
        for key, filename in joblib_files.items():
            artifacts[key] = joblib.load(os.path.join(model_dir, filename))
            logger.info(f"Loaded {key}")

        # Column list the input frame is reindexed to before preprocessing.
        with open(os.path.join(model_dir, 'before_feature.json'), 'r') as f:
            artifacts['before_columns'] = json.load(f)
        logger.info("Loaded before_columns")

        # Imported here so catboost is only required when a model is loaded.
        import catboost

        model = catboost.CatBoostRegressor()
        model.load_model(os.path.join(model_dir, 'cat_model.cbm'))
        # The regressor is a CatBoost model; it is kept under the 'xgb' key
        # because predict_fn looks it up by that name.
        artifacts['xgb'] = model
        logger.info("Loaded CatBoost model (stored under the 'xgb' key)")

        logger.info("All model artifacts loaded successfully")
        return artifacts

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise



In [None]:
def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input.

    Only JSON is accepted. The content type is matched on its media type
    alone, case-insensitively, so headers carrying parameters such as
    ``application/json; charset=utf-8`` are accepted as well.

    Args:
        request_body: The request body (JSON text accepted by ``json.loads``).
        request_content_type: The request content type header value.

    Returns:
        The parsed JSON payload (a dict for the payloads sent in this notebook).

    Raises:
        ValueError: If the content type is not JSON, or the body cannot be
            parsed; in the latter case the original error is chained.
    """
    logger.info(f"Received request with content type: {request_content_type}")

    # Strip any "; param=value" suffix and normalise case before comparing.
    media_type = str(request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}. Only application/json is supported.")

    try:
        input_data = json.loads(request_body)
    except Exception as e:
        logger.error(f"Error parsing JSON input: {str(e)}")
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Error parsing JSON input: {str(e)}") from e

    # Only the first 100 characters are logged to keep the log line short.
    logger.info(f"Parsed input data: {str(input_data)[:100]}...")
    return input_data

In [None]:
from encoders import CustomTargetEncoder, CustomOrdinalEncoder

def _run_prediction(input_data, model_artifacts):
    """Validate, impute, feature-engineer, encode and score a single request."""
    logger.info("Starting prediction process")

    # Accept either an already-parsed payload or a raw JSON string.
    input_dict = json.loads(input_data) if isinstance(input_data, str) else input_data
    logger.info(f"Deserialized input data: {input_dict}")

    # Convert dictionary to a one-row DataFrame
    user_df = pd.DataFrame([input_dict])
    logger.info(f"Initial DataFrame:\n{user_df}")

    # Extract model artifacts
    nimputer = model_artifacts['nimputer']
    cimputer = model_artifacts['cimputer']
    targetencoder = model_artifacts['targetencoder']
    ordinalencoder = model_artifacts['ordinalencoder']
    kmean = model_artifacts['kmean']
    before_columns = model_artifacts['before_columns']
    model = model_artifacts['xgb']  # key name is historical; model_fn stores a CatBoost regressor here

    # Validate the input (kept in its own name so the parameter is not rebound)
    validated = inference_validator(user_df)
    logger.info(f"input data:\n{validated}")
    logger.info("Input data validated")

    # Align the frame to the stored column list; columns missing from the request are added empty.
    user_df = pd.DataFrame(validated)
    user_df = user_df.reindex(columns=before_columns, fill_value=None)
    logger.info(f"user_df data:\n{user_df}")
    logger.info(f"Reindexed user_df columns: {user_df.columns.tolist()}")

    # Apply imputers, restricted to the features each imputer was fitted on
    c_features = [f for f in cimputer.feature_names_in_ if f in user_df.columns]
    n_features = [f for f in nimputer.feature_names_in_ if f in user_df.columns]
    logger.info(f"c_features: {c_features}")
    logger.info(f"n_features: {n_features}")
    if c_features:
        user_df[c_features] = cimputer.transform(user_df[c_features])
        logger.info("Applied categorical imputation")
    if n_features:
        user_df[n_features] = nimputer.transform(user_df[n_features])
        logger.info("Applied numerical imputation")

    logger.info("Applied imputation")
    logger.info(f"user_df data:\n{user_df}")

    # Cluster assignment is computed before feature engineering, on the imputed values.
    user_df['FinancialCluster'] = kmean.predict(user_df[['CreditCardUtilizationRate', 'MonthlyIncome']])
    user_df = feature_engineering(user_df)
    logger.info("Applied feature engineering")
    user_df = targetencoder.transform(user_df)
    user_df = ordinalencoder.transform(user_df)

    # Remove the target column before scoring; tolerate it being absent already.
    user_df = user_df.drop(columns=['RiskScore'], errors='ignore')

    # Make prediction
    prediction = model.predict(user_df)
    logger.info(f"Generated prediction: {prediction[0]}")

    # .tolist() turns the NumPy value into a plain Python value for json.dumps.
    # LLM reasoning (get_risk_score_reasoning) is currently not part of the result.
    return {
        "prediction": (prediction[0]).tolist(),
    }


def predict_fn(input_data, model_artifacts):
    """
    Apply model to the input data.

    Args:
        input_data: Input data (from input_fn); a dict, or a JSON string that
            decodes to one.
        model_artifacts: Model artifacts (from model_fn).

    Returns:
        dict: ``{"prediction": <score>}`` with the first predicted value.

    Raises:
        Exception: Any error raised during prediction is logged and re-raised
            unchanged.
    """
    # The DataFrames logged during prediction should be printed in full, but
    # the display settings are scoped to this call instead of being changed
    # globally on every request.
    with pd.option_context(
        "display.max_rows", None,      # Show all rows
        "display.max_columns", None,   # Show all columns
        "display.width", 1000,         # Wide output to avoid line wrapping
        "display.max_colwidth", None,  # Do not truncate cell contents
    ):
        try:
            return _run_prediction(input_data, model_artifacts)
        except Exception as e:
            logger.error(f"Error during prediction: {str(e)}")
            raise


In [None]:
def output_fn(prediction_output, accept):
    """
    Turn the result of predict_fn into a JSON string.

    Args:
        prediction_output: The prediction output from predict_fn.
        accept: The accept content type; 'application/json' and '*/*' are
            the only values handled.

    Returns:
        str: ``prediction_output`` serialized with ``json.dumps``.

    Raises:
        ValueError: If ``accept`` is any other value.
        Exception: Serialization errors are logged and re-raised unchanged.
    """
    logger.info(f"Formatting output with accept type: {accept}")
    logger.info(f"Type of prediction_output: {type(prediction_output)}")
    logger.info(f"Contents of prediction_output: {prediction_output}")

    # Guard clause: reject anything that is not JSON or the wildcard.
    if accept not in ('application/json', '*/*'):
        raise ValueError(f"Unsupported accept type: {accept}. Only application/json is supported.")

    try:
        serialized = json.dumps(prediction_output)
    except Exception as e:
        logger.error(f"Error serializing to JSON: {str(e)}")
        raise
    else:
        logger.info("Successfully serialized prediction to JSON")
        return serialized

In [None]:
# Sample applicant payload. The trailing notes give the min/max (or allowed
# categories) recorded for each field; CreditScore, MonthlyDebtPayments and
# TotalLiabilities below lie outside the ranges noted beside them.
ui = dict(
    ApplicationDate="2024-01-01",
    Age=50,                              # min: 18.00, max: 80.00
    CreditScore=100,                     # min: 343.00, max: 712.00
    EmploymentStatus="Unemployed",       # ['Employed', 'Self-Employed', 'Unemployed']
    EducationLevel="High School",        # ['Master', 'Associate', 'Bachelor', 'High School', 'Doctorate']
    LoanAmount=120000,                   # min: 3674.00, max: 184732.00
    LoanDuration=60,                     # min: 12.00, max: 120.00
    MaritalStatus="Widowed",             # ['Married', 'Single', 'Divorced', 'Widowed']
    NumberOfDependents=1,                # min: 0.00, max: 5.00
    HomeOwnershipStatus="Own",           # ['Own', 'Mortgage', 'Rent', 'Other']
    MonthlyDebtPayments=7000,            # min: 50.00, max: 2919.00
    CreditCardUtilizationRate=0.6,       # min: 0.000974, max: 0.917380
    NumberOfOpenCreditLines=2,           # min: 0.00, max: 13.00
    NumberOfCreditInquiries=1,           # min: 0.00, max: 7.00
    DebtToIncomeRatio=0.1,               # min: 0.001720, max: 0.902253
    BankruptcyHistory=0,
    LoanPurpose="Home",                  # ['Home', 'Debt Consolidation', 'Education', 'Other', 'Auto']
    PreviousLoanDefaults=0,
    PaymentHistory=24,                   # min: 8.00, max: 45.00
    LengthOfCreditHistory=18,            # min: 1.00, max: 29.00
    SavingsAccountBalance=20000,         # min: 73.00, max: 200089.00
    CheckingAccountBalance=15000,        # min: 24.00, max: 52572.00
    TotalAssets=250000,                  # min: 2098.00, max: 1,198,472.00
    TotalLiabilities=20000000,           # min: 372.00, max: 1,417,302.00
    MonthlyIncome=11000,                 # min: 1250.00, max: 25000.00
    UtilityBillsPaymentHistory=0.5,      # min: 0.259203, max: 0.999433
    JobTenure=12,                        # min: 0.00, max: 16.00
)

In [None]:
# Sample applicant payload used for the manual end-to-end run.
# It carries no "DebtToIncomeRatio" entry.
user_input = dict(
    ApplicationDate="2024-01-21",
    Age=20,
    CreditScore=550,
    EmploymentStatus="Unemployed",
    EducationLevel="High School",
    LoanAmount=80000,
    LoanDuration=72,
    MaritalStatus="Single",
    NumberOfDependents=3,
    HomeOwnershipStatus="Rent",
    MonthlyDebtPayments=2000,
    CreditCardUtilizationRate=0.7,
    NumberOfOpenCreditLines=8,
    NumberOfCreditInquiries=5,
    BankruptcyHistory=1,
    LoanPurpose="Debt Consolidation",
    PreviousLoanDefaults=1,
    PaymentHistory=12,
    LengthOfCreditHistory=6,
    SavingsAccountBalance=2000,
    CheckingAccountBalance=1000,
    TotalAssets=50000,
    TotalLiabilities=120000,
    MonthlyIncome=4000,
    UtilityBillsPaymentHistory=0.6,
    JobTenure=2,
)

# Manual end-to-end run of the serving functions on the `user_input` payload.
import json
# Artifacts are read from the relative "saved" directory.
artifacts = model_fn("saved")
# The payload is serialized first because input_fn parses a JSON request body.
input_data = input_fn(json.dumps(user_input), "application/json")
result = predict_fn(input_data, artifacts)
print(f'RiskScore = {result["prediction"]}')
# Disabled: the dict returned by predict_fn has no "reasoning" entry at present.
# print(result["reasoning"])
