In [1]:
import boto3
import sagemaker
from sagemaker import Model
import os
import json

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Create a SageMaker session object.
# NOTE(review): `sagemaker_session` is not referenced by any other cell visible here.
sagemaker_session = sagemaker.Session()

# Resolve the execution role for this environment and print its ARN.
# NOTE(review): the printed ARN contains the AWS account id; consider clearing
# this cell's output before sharing the notebook.
role = sagemaker.get_execution_role()
print(f"SageMaker Role ARN: {role}")

SageMaker Role ARN: arn:aws:iam::796932308591:role/service-role/SageMaker-ExecutionRole-20250214T145019


In [3]:
from utils import inference_validator, reasoning, feature_engineering, additional_feature_engineering
import pickle, json

In [4]:
# import boto3
# import pickle
# import json
# import io
# # Initialize S3 client
# s3 = boto3.client('s3')

# # Define S3 bucket and file path
# bucket = 'mitrailabs-personaclassification'

# # Function to load pickle files from S3
# def load_pickle_from_s3(bucket, key):
#     response = s3.get_object(Bucket=bucket, Key=key)
#     return pickle.load(io.BytesIO(response['Body'].read()))

# # Function to load JSON file from S3
# def load_json_from_s3(bucket, key):
#     response = s3.get_object(Bucket=bucket, Key=key)
#     # response_json
#     # io.BytesIO(file_obj['Body'].read())
#     return json.load(response['Body']) 

# # Load all objects from S3
# model_prefix = 'risk_prediction/Intermediate_states/'

# nimputer = load_pickle_from_s3(bucket, f"{model_prefix}nimputer.pkl")
# cimputer = load_pickle_from_s3(bucket, f"{model_prefix}cimputer.pkl")
# label_encoders = load_pickle_from_s3(bucket, f"{model_prefix}label_encoders.pkl")
# inference_preprocessor = load_pickle_from_s3(bucket, f"{model_prefix}inference_preprocessor.pkl")

# before_columns = load_json_from_s3(bucket, f"{model_prefix}before_feature.json")


# model_prefix = 'risk_prediction/models/'

# xgb = load_pickle_from_s3(bucket, f"{model_prefix}xgb_model.pkl")

# # ✅ Now all objects are loaded from S3 into memory
# print("All files loaded successfully from S3!")


In [5]:
# s3 = boto3.client('s3')

# bucket = 'mitrailabs-personaclassification'


# model_prefix = 'risk_prediction/Intermediate_states/'
# nimputer_response = s3.get_object(
#     Bucket=bucket,
#     Key=f"{model_prefix}nimputer.pkl"
# )

# nimputer = pickle.load(nimputer_response['Body'])


# with open('saved/nimputer.pkl', 'rb') as f:
#     nimputer = pickle.load(f)

# with open('saved/cimputer.pkl', 'rb') as f:
#     cimputer = pickle.load(f)

# with open('saved/label_encoders.pkl', 'rb') as f:
#     label_encoders = pickle.load(f)

    
# with open('saved/inference_preprocessor.pkl', 'rb') as f:
#     inference_preprocessor = pickle.load(f)

# with open('saved/xgb_model.pkl', 'rb') as f:
#     xgb = pickle.load(f)

# with open('saved/before_feature.json', 'r') as f:
#     before_columns = json.load(f)

In [6]:
import pandas as pd
import numpy as np

class ManualTargetEncoder:
    """Smoothed target (mean) encoder for categorical columns.

    Every category value is replaced by a blend of that category's mean target
    and the global target mean::

        (category_mean * n + global_mean * smoothing) / (n + smoothing)

    where ``n`` is the number of non-null target values seen for the category.
    Values that have no learned encoding (unseen categories, missing values)
    are encoded as the global target mean.

    NOTE(review): presumably this class is defined in the notebook so that
    serialized artifacts referencing it can be deserialized here - confirm
    before renaming or moving it.
    """

    def __init__(self, smoothing=1.0):
        """
        Initialize the encoder.
        :param smoothing: Smoothing parameter to balance between category mean and global mean.
        """
        self.smoothing = smoothing
        self.encodings = {}  # Column name -> Series mapping category value to its encoding
        self.global_mean = None  # Global mean of the target, set by fit()

    @staticmethod
    def _plain_keys(column):
        """Return ``column`` with a categorical dtype cast to plain objects.

        Mapping a categorical Series can yield another categorical, on which
        ``fillna`` with a value outside the categories raises; grouping by a
        categorical also emits rows for unobserved categories. Working on
        plain objects avoids both.
        """
        if isinstance(column.dtype, pd.CategoricalDtype):
            return column.astype(object)
        return column

    def fit(self, X, y):
        """
        Fit the encoder on the training data.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable (Series aligned with X).
        :return: The fitted encoder itself, so calls can be chained.
        """
        self.global_mean = y.mean()
        # Start from a clean slate so a re-fit never mixes encodings that were
        # computed against a different global mean.
        self.encodings = {}

        for col in X.columns:
            grouped = y.groupby(self._plain_keys(X[col]))
            # Mean target and number of target values per category
            category_means = grouped.mean()
            category_counts = grouped.count()
            # Shrink each category mean towards the global mean
            self.encodings[col] = (
                category_means * category_counts + self.global_mean * self.smoothing
            ) / (category_counts + self.smoothing)
        return self

    def transform(self, X):
        """
        Transform the categorical columns using the learned encodings.
        :param X: DataFrame containing categorical columns.
        :return: Transformed copy of X with float-encoded columns.
        :raises KeyError: If X has a column the encoder holds no encoding for.
        """
        missing = [col for col in X.columns if col not in self.encodings]
        if missing:
            raise KeyError(f"No fitted encoding for column(s): {missing}")

        X_transformed = X.copy()
        for col in X.columns:
            encoded = self._plain_keys(X[col]).map(self.encodings[col])
            # Values without an encoding fall back to the global mean
            X_transformed[col] = encoded.fillna(self.global_mean).astype(float)
        return X_transformed

    def fit_transform(self, X, y):
        """
        Fit the encoder and transform the data in one step.
        :param X: DataFrame containing categorical columns.
        :param y: Target variable.
        :return: Transformed DataFrame.
        """
        return self.fit(X, y).transform(X)

In [7]:
import boto3
import joblib
import json
import io
import xgboost

# S3 client shared by the loader functions defined in this cell
s3 = boto3.client('s3')

# Bucket that the artifact keys used below are read from
bucket = 'mitrailabs-personaclassification'

# Fetch a joblib-serialized object from S3
def load_joblib_from_s3(bucket, key):
    """Download ``key`` from ``bucket`` and deserialize it with joblib.

    The object body is read fully into memory and handed to ``joblib.load``
    as an in-memory binary stream.

    NOTE: joblib deserialization is pickle-based and can execute arbitrary
    code, so only point this at objects from a trusted bucket.
    """
    payload = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
    buffer = io.BytesIO(payload)
    return joblib.load(buffer)

# Fetch a JSON document from S3
def load_json_from_s3(bucket, key):
    """Download ``key`` from ``bucket`` and return the parsed JSON value."""
    body = s3.get_object(Bucket=bucket, Key=key)['Body']
    raw_document = body.read()
    return json.loads(raw_document)

def load_xgboost_from_s3(bucket, key):
    """Download an XGBoost model file from S3 and load it into a Booster.

    The object is written to a file inside a private temporary directory,
    loaded from there, and the directory is removed again. Nothing is
    written to the working directory, so the function does not depend on a
    pre-existing ``saved/`` folder.

    :param bucket: Name of the S3 bucket.
    :param key: Key of the serialized model within the bucket.
    :return: ``xgboost.Booster`` with the model loaded.
    """
    import os
    import tempfile

    response = s3.get_object(Bucket=bucket, Key=key)
    model_data = response['Body'].read()

    # Keep the key's extension on the temp file (defaulting to the previously
    # hard-coded ".json") in case xgboost uses it to pick the model format.
    suffix = os.path.splitext(key)[1] or '.json'

    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_model_path = os.path.join(tmp_dir, f"xgb_model{suffix}")
        with open(temp_model_path, 'wb') as f:
            f.write(model_data)

        # Load while the temporary file still exists
        model = xgboost.Booster()
        model.load_model(temp_model_path)
    return model

# Load the preprocessing artifacts stored under the intermediate-state prefix.
# nimputer / cimputer expose `feature_names_in_` and `transform`, and
# label_encoders exposes `transform` (see the inference cell that uses them).
model_prefix = 'risk_prediction/Intermediate_states/'
nimputer = load_joblib_from_s3(bucket, f"{model_prefix}nimputer.joblib")
cimputer = load_joblib_from_s3(bucket, f"{model_prefix}cimputer.joblib")
label_encoders = load_joblib_from_s3(bucket, f"{model_prefix}label_encoders.joblib")
# Currently disabled: the inference preprocessor artifact is not loaded.
# inference_preprocessor = load_joblib_from_s3(bucket, f"{model_prefix}inference_preprocessor.joblib")
# Column list used later to reindex the single-row input frame
before_columns = load_json_from_s3(bucket, f"{model_prefix}before_feature.json")

# NOTE: model_prefix is rebound here; from this point on it refers to the models prefix.
model_prefix = 'risk_prediction/models/'
xgb = load_xgboost_from_s3(bucket, f"{model_prefix}xgb_model.json")

# All artifacts above are now held in memory
print("All files loaded successfully from S3!")

All files loaded successfully from S3!


In [8]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# End-to-end inference for one hand-written example record:
# validate -> frame -> impute -> encode -> feature engineering -> XGBoost predict.
#
# Example user input. The Min/Max notes are the ranges recorded by the author
# (presumably observed in the training data - confirm).
user_input = {
    "score_3": 500,                     # Min: 0, Max: 990
    "score_4": 80,                    # Min: 86.191572, Max: 113.978234 (NOTE: 80 is below the listed min)
    "score_5": 0.5,                    # Min: 0.000035, Max: 0.999973
    "score_6": 100,                      # Min: 60.663039, Max: 142.192400
    "risk_rate": 0,                  # Min: 0.000000, Max: 0.900000
    "last_amount_borrowed": 10000000,        # Min: 0.000000, Max: 35059.600000 (NOTE: value exceeds the listed max)
    "last_borrowed_in_months": 31,      # Min: 0.000000, Max: 60.000000
    "credit_limit": 100000,                 # Min: 0.000000, Max: 448269.000000
    "income": 1200000,                   # Min: 4821.180000, Max: 5000028.000000
    "ok_since": 53,                     # Min: 0.000000, Max: 141.000000
    "n_accounts": 1,                 # Min: 0.000000, Max: 49.000000
    "n_issues": 10,                    # Min: 0.000000, Max: 49.000000
    "external_data_provider_credit_checks_last_year": 1, # Min: 0.000000, Max: 1.000000
    "external_data_provider_email_seen_before": 50,        # Min: 0.000000, Max: 59.000000
    "reported_income": 120000,          # Min: 403.000000, Max: 6355500000000000.000000
    "application_time_in_funnel": 300,  # Min: 0.000000, Max: 500.000000
    "external_data_provider_credit_checks_last_month": 0.0, # Min: 0.000000, Max: 3.000000
    "external_data_provider_fraud_score": 0, # Min: 0.000000, Max: 1000.000000
    "shipping_state": np.nan,             # No min/max provided (categorical)
    "facebook_profile": False,          # No min/max provided (boolean)
    "state": "BR-MS",                      # No min/max provided (categorical)
    "score_1": "1Rk8w4Ucd5yR3KcqZzLdow==", # No min/max provided (encoded string)
    "real_state": np.nan                  # No min/max provided (missing here: np.nan)
}

# inference_validator comes from the local `utils` module; its result replaces
# the raw dict. NOTE(review): its exact checks are not visible in this notebook.
user_input = inference_validator(user_input)
# Single-row frame built from the validated record
user_df = pd.DataFrame([user_input])
# Align to the stored column list; columns absent from the input become NaN
user_df = user_df.reindex(columns=before_columns, fill_value=np.nan)

# Features each imputer was fitted on, restricted to columns present in the frame
c_features = [f for f in cimputer.feature_names_in_ if f in user_df.columns]
n_features = [f for f in nimputer.feature_names_in_ if f in user_df.columns]
# print(c_features)
# print(n_features)

# Impute only when the corresponding feature list is non-empty
if c_features:
    user_df[c_features] = cimputer.transform(user_df[c_features])
if n_features:
    user_df[n_features] = nimputer.transform(user_df[n_features])

# Remove the target column before encoding and prediction.
# NOTE: with pandas' default errors='raise' this fails with KeyError if
# "target_default" is not among before_columns.
user_df = user_df.drop(columns=["target_default"])
# print("target_default" in user_df.columns)
# Encode the c_features columns (minus the target) with the loaded encoder object
user_df[[i for i in c_features if i != "target_default"]] = label_encoders.transform(user_df[[i for i in c_features if i != "target_default"]])

# NOTE(review): yo_features is only referenced by the commented-out reindex below.
yo_features = [i for i in c_features + n_features if i != "target_default"]
# user_df = user_df.reindex(columns=yo_features)
# user_df = user_df.drop(columns=["target_default"])

# Feature-engineering helpers from the local `utils` module, applied in this order
user_df = feature_engineering(user_df)
user_df = additional_feature_engineering(user_df)
# print(user_df)
# user_processed = inference_preprocessor.transform(user_df)
# user_df['fraud_score_bin'] = user_df['fraud_score_bin'].astype(int)
# NOTE: from here on user_df is an xgboost.DMatrix, no longer a DataFrame
user_df = xgboost.DMatrix(user_df)
# Make prediction
prediction = xgb.predict(user_df)
# prediction_proba = xgb.predict_proba(user_df)

print("Prediction:", prediction)
# Look up the explanation text keyed by the index (as a string) of the largest
# value in the first prediction row
print(reasoning[str(np.argmax((prediction[0])))])
# print("Prediction Probability:", prediction_proba)
# print(user_processed)
# NOTE(review): the meaning of the ordering below is not documented - confirm or remove.
# 6 → 3 → 7 → 1 → 2 → 0 → 5 → 4

Prediction: [[0.1533128  0.3737888  0.16072789 0.15892747 0.153243  ]]
High Risk 
 Borrowers in this cluster have a high default rate (0.238) and show risky borrowing behavior. They have taken moderate-sized loans (last_amount_borrowed: 0.130) and have a relatively high debt-to-income ratio (0.100). Their credit utilization (0.014) is lower than Cluster 0, but their risk scores (score_1, score_2) are slightly lower, indicating higher risk. Their frequent borrowing and moderate risk scores suggest a higher chance of future payment issues.


In [9]:
# Print the scikit-learn version installed in this kernel.
# NOTE(review): presumably recorded to match the version used to serialize the
# imputers - confirm.
import sklearn
print(sklearn.__version__)

1.5.2
