In [41]:
%pip install sagemaker --upgrade --quiet 
%pip install -q  xgboost==1.3.1 pandas==1.0.5 scikit-learn

Note: you may need to restart the kernel to use updated packages.
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[90 lines of output][0m
  [31m   [0m Ignoring numpy: markers 'python_version == "3.6" and platform_system != "AIX"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.7" and platform_system != "AIX"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.6" and platform_system == "AIX"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.7" and platform_system == "AIX"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version >= "3.8" and platform_system == "AIX"' don't match your environment
  [31m   [0m Collecting setuptools
  [31m   [0m   Using cached setuptools-79.0.0

In [42]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
import json
import joblib
import os
from sklearn.model_selection import train_test_split
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Setting SageMaker variables
sess = sagemaker.Session()
write_bucket = sess.default_bucket()
write_prefix = "fraud-detect-demo"

region = sess.boto_region_name
s3_client = boto3.client("s3", region_name=region)

sagemaker_role = sagemaker.get_execution_role()
sagemaker_client = boto3.client("sagemaker")

# Local data processing
# Load the combined dataset
data_path = "training.csv"
df = pd.read_csv(data_path).drop("ACCT_NBR", axis=1)
target_col = 'IS_WARN'  # update if needed

# Split data into train, validation, and test sets (70%, 15%, 15%)
# 1) First split: 70% train, 30% “temp”
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df[target_col],       # preserve class balance
    random_state=42
)

# Calculate scale_pos_weight for handling class imbalance
neg_count = train_df[target_col].value_counts()[0]
pos_count = train_df[target_col].value_counts()[1]
scale_pos_weight = neg_count / pos_count
print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")

# 2) Second split: split that 30% into half validation, half test
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,             # 0.50 × 30% = 15%
    stratify=temp_df[target_col],  # again preserve balance
    random_state=42
)

# Quick sanity check
print(f"Train pos%: {train_df[target_col].mean()*100:.2f}%")
print(f" Val  pos%: {val_df[target_col].mean()*100:.2f}%")
print(f" Test pos%: {test_df[target_col].mean()*100:.2f}%")

# Create a local directory to store the split datasets
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Save the split datasets to local files
train_path = os.path.join(data_dir, "train.csv")
val_path = os.path.join(data_dir, "validation.csv")
test_path = os.path.join(data_dir, "test.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

# Setting output locations for model and other artifacts
model_key = f"{write_prefix}/model"
output_key = f"{write_prefix}/output"

model_uri = f"s3://{write_bucket}/{model_key}"
output_uri = f"s3://{write_bucket}/{output_key}"
estimator_output_uri = f"s3://{write_bucket}/{write_prefix}/training_jobs"
bias_report_output_uri = f"s3://{write_bucket}/{write_prefix}/clarify-output/bias"
explainability_report_output_uri = f"s3://{write_bucket}/{write_prefix}/clarify-output/explainability"
tuning_job_name_prefix = "xgbtune" 
training_job_name_prefix = "xgbtrain"

xgb_model_name = "fraud-detect-xgb-model"
endpoint_name_prefix = "xgb-fraud-model-dev"
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
predictor_instance_count = 1
predictor_instance_type = "ml.m4.xlarge"
clarify_instance_count = 1
clarify_instance_type = "ml.m4.xlarge"


Calculated scale_pos_weight: 61.42
Train pos%: 1.60%
 Val  pos%: 1.60%
 Test pos%: 1.60%


In [43]:
%%writefile xgboost_train.py

import argparse
import os
import joblib
import json
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, f1_score

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters and algorithm parameters are described here
    parser.add_argument("--num_round", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=3)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--subsample", type=float, default=0.9)
    parser.add_argument("--colsample_bytree", type=float, default=0.8)
    parser.add_argument("--objective", type=str, default="binary:logistic")
    parser.add_argument("--eval_metric", type=str, default="auc")
    parser.add_argument("--scale_pos_weight", type=float, default=1.0) # Add scale_pos_weight argument
    parser.add_argument("--nfold", type=int, default=3)
    parser.add_argument("--early_stopping_rounds", type=int, default=3)
    

    # SageMaker specific arguments. Defaults are set in the environment variables
    # Location of input training data
    parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    # Location of input validation data
    parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    # Location where trained model will be stored. Default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # Location where model artifacts will be stored. Default set by SageMaker, /opt/ml/output/data
    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
    
    args = parser.parse_args()

    data_train = pd.read_csv(f"{args.train_data_dir}/train.csv")
    train = data_train.drop("IS_WARN", axis=1)
    label_train = pd.DataFrame(data_train["IS_WARN"])
    dtrain = xgb.DMatrix(train, label=label_train)
    
    
    data_validation = pd.read_csv(f"{args.validation_data_dir}/validation.csv")
    validation = data_validation.drop("IS_WARN", axis=1)
    label_validation = pd.DataFrame(data_validation["IS_WARN"])
    dvalidation = xgb.DMatrix(validation, label=label_validation)

    params = {"max_depth": args.max_depth,
              "eta": args.eta,
              "objective": args.objective,
              "subsample" : args.subsample,
              "colsample_bytree":args.colsample_bytree,
              "eval_metric": args.eval_metric,  
              "scale_pos_weight": args.scale_pos_weight # Use scale_pos_weight
             }
    
    num_boost_round = args.num_round
    nfold = args.nfold
    early_stopping_rounds = args.early_stopping_rounds
    eval_metric = args.eval_metric
    
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds,
        metrics=[eval_metric],
        seed=42,
    )
    
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(cv_results))
    
    train_pred = model.predict(dtrain)
    validation_pred = model.predict(dvalidation)
    
    # Calculate AUC for reference
    train_auc = roc_auc_score(label_train, train_pred)
    validation_auc = roc_auc_score(label_validation, validation_pred)
    
    # Calculate F1-score (we need binary predictions)
    train_pred_binary = (train_pred > 0.5).astype(int)
    validation_pred_binary = (validation_pred > 0.5).astype(int)
    train_f1 = f1_score(label_train, train_pred_binary)
    validation_f1 = f1_score(label_validation, validation_pred_binary)
    
    print(f"[0]#011train-auc:{train_auc:.2f}")
    print(f"[0]#011validation-auc:{validation_auc:.2f}")
    print(f"[0]#011train-f1:{train_f1:.2f}")
    print(f"[0]#011validation-f1:{validation_f1:.2f}")

    metrics_data = {"hyperparameters" : params,
                    "binary_classification_metrics": {
                        "validation:auc": {"value": validation_auc},
                        "train:auc": {"value": train_auc},
                        "validation:f1": {"value": validation_f1},
                        "train:f1": {"value": train_f1}
                    }
                   }
              
    # Save the evaluation metrics to the location specified by output_data_dir
    metrics_location = args.output_data_dir + "/metrics.json"
    
    # Save the model to the location specified by model_dir
    model_location = args.model_dir + "/xgboost-model"

    with open(metrics_location, "w") as f:
        json.dump(metrics_data, f)

    with open(model_location, "wb") as f:
        joblib.dump(model, f)

Overwriting xgboost_train.py


In [44]:
# Set static hyperparameters that will not be tuned

static_hyperparams = {
    "eval_metric" : "logloss",
    "objective": "binary:logistic",
    "scale_pos_weight": str(scale_pos_weight) # Add calculated scale_pos_weight
}

xgb_estimator = XGBoost(
                        entry_point="xgboost_train.py",
                        output_path=estimator_output_uri,
                        code_location=estimator_output_uri,
                        hyperparameters=static_hyperparams,
                        role=sagemaker_role,
                        instance_count=train_instance_count,
                        instance_type=train_instance_type,
                        framework_version="1.3-1",
                        base_job_name=training_job_name_prefix
                    )
# Setting ranges of hyperparameters to be tuned
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.1, 0.7),
    "subsample": ContinuousParameter(0.75, 0.95),
    "colsample_bytree": ContinuousParameter(0.7, 0.9),
    "max_depth": IntegerParameter(3, 15),
    "num_round": IntegerParameter(100, 500)
}
objective_metric_name = "validation:f1"

# Setting up tuner object
tuner_config_dict = {
                     "estimator" : xgb_estimator,
                     "max_jobs" : 50,
                     "max_parallel_jobs" : 10,
                     "objective_metric_name" : objective_metric_name,
                     "hyperparameter_ranges" : hyperparameter_ranges,
                     "base_tuning_job_name" : tuning_job_name_prefix,
                     "strategy" : "Bayesian",
                     "objective_type" : "Maximize"  # For F1-score, higher is better
                    }
tuner = HyperparameterTuner(**tuner_config_dict)

In [45]:
# Setting the input channels for tuning job using local files
s3_input_train = TrainingInput(s3_data=sess.upload_data(path=train_path, key_prefix=f"{write_prefix}/train"), content_type="csv")
s3_input_validation = TrainingInput(s3_data=sess.upload_data(path=val_path, key_prefix=f"{write_prefix}/validation"), content_type="csv")

tuner.fit(inputs={"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False)
tuner.wait()

................................................................................................!
!


In [46]:
# Summary of tuning results ordered in descending order of performance
df_tuner = sagemaker.HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.job_name).dataframe()
df_tuner = df_tuner[df_tuner["FinalObjectiveValue"]>-float('inf')].sort_values("FinalObjectiveValue", ascending=False)
df_tuner

Unnamed: 0,colsample_bytree,eta,max_depth,num_round,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
48,0.739445,0.235703,15.0,259.0,0.815427,xgbtune-250423-1116-002-3f8685a7,Completed,0.56,2025-04-23 11:17:17+00:00,2025-04-23 11:19:06+00:00,109.0
1,0.845371,0.1,15.0,179.0,0.821369,xgbtune-250423-1116-049-aa83230b,Completed,0.55,2025-04-23 11:23:20+00:00,2025-04-23 11:24:04+00:00,44.0
33,0.772187,0.114695,15.0,253.0,0.87606,xgbtune-250423-1116-017-d1fb43a1,Completed,0.55,2025-04-23 11:19:45+00:00,2025-04-23 11:20:30+00:00,45.0
26,0.865163,0.162239,15.0,264.0,0.875803,xgbtune-250423-1116-024-2a4eb9df,Completed,0.55,2025-04-23 11:20:33+00:00,2025-04-23 11:21:12+00:00,39.0
9,0.709205,0.222702,15.0,258.0,0.791475,xgbtune-250423-1116-041-469d70ee,Completed,0.54,2025-04-23 11:22:35+00:00,2025-04-23 11:23:14+00:00,39.0
7,0.7,0.154979,15.0,226.0,0.772751,xgbtune-250423-1116-043-7479f782,Completed,0.53,2025-04-23 11:22:46+00:00,2025-04-23 11:23:26+00:00,40.0
18,0.753894,0.20825,14.0,254.0,0.852097,xgbtune-250423-1116-032-cecab2e7,Completed,0.53,2025-04-23 11:21:34+00:00,2025-04-23 11:22:13+00:00,39.0
5,0.782563,0.157369,13.0,252.0,0.856652,xgbtune-250423-1116-045-c5668bb0,Completed,0.53,2025-04-23 11:22:49+00:00,2025-04-23 11:23:28+00:00,39.0
13,0.7,0.246824,15.0,246.0,0.803203,xgbtune-250423-1116-037-e0d38297,Completed,0.53,2025-04-23 11:22:10+00:00,2025-04-23 11:22:50+00:00,40.0
35,0.712673,0.212199,15.0,110.0,0.808433,xgbtune-250423-1116-015-840341a8,Completed,0.53,2025-04-23 11:19:42+00:00,2025-04-23 11:20:21+00:00,39.0


In [60]:
best_train_job_name = tuner.best_training_job()

model_path = estimator_output_uri + '/' + best_train_job_name + '/output/model.tar.gz'
training_image = retrieve(framework="xgboost", region=region, version="1.3-1")
create_model_config = {"model_data":model_path,
                       "role":sagemaker_role,
                       "image_uri":training_image,
                       "name":endpoint_name_prefix,
                       "predictor_cls":sagemaker.predictor.Predictor
                       }
# Create a SageMaker model
model = sagemaker.model.Model(**create_model_config)
# Deploy the best model and get access to a SageMaker Predictor
predictor = model.deploy(initial_instance_count=predictor_instance_count, 
                         instance_type=predictor_instance_type,
                         serializer=CSVSerializer(),
                         deserializer=CSVDeserializer())
print(f"\nModel deployed at endpoint : {model.endpoint_name}")

----------!
Model deployed at endpoint : xgb-fraud-model-dev-2025-04-23-11-46-27-046


In [64]:
# Find the indices of all rows where the target label is 1
pos_indices = test_df.index[test_df[target_col] == 1]

# Randomly select one of those indices
idx = np.random.choice(pos_indices)

# Build the model input payload by dropping the target column and converting the selected row to a list
payload = test_df.drop([target_col], axis=1).loc[idx].tolist()

# Use the model to predict the score for the selected payload
pred_score = float(predictor.predict(payload)[0][0])

# Get the true label for the selected test sample
true_label = test_df.loc[idx, target_col]

# Print out the model's predicted score and the true label for comparison
print(f"(random) Model predicted score : {pred_score:.3f}, True label : {true_label}")

(random) Model predicted score : 0.002, True label : 1


In [63]:
# Delete model

try:
 sess.delete_model(xgb_model_name)
except:
 pass
sess.delete_model(model.name)

# Delete inference endpoint config
sess.delete_endpoint_config(endpoint_config_name=predictor._get_endpoint_config_name())

# Delete inference endpoint
sess.delete_endpoint(endpoint_name=model.endpoint_name)
