In [1]:
# Imports only
import os, io, time, getpass, glob
import numpy as np
import pandas as pd
import boto3, sagemaker
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score, classification_report, confusion_matrix
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/studio-lab-user/.config/sagemaker/config.yaml


In [2]:
# Credentials, region, role, sessions 
REGION = "us-east-2"                           # <-- your region
BUCKET = "loandefault-stutteringemo"           # <-- your S3 bucket (must be in REGION)
ROLE   = "arn:aws:iam::167819473785:role/SageMakerExecutionRole"  # <-- your SageMaker exec role

# Set env + read keys without printing them
os.environ["AWS_DEFAULT_REGION"] = REGION
os.environ["AWS_ACCESS_KEY_ID"] = getpass.getpass("AWS_ACCESS_KEY_ID: ")
os.environ["AWS_SECRET_ACCESS_KEY"] = getpass.getpass("AWS_SECRET_ACCESS_KEY: ")
tok = getpass.getpass("AWS_SESSION_TOKEN (press Enter if not using temp creds): ")
if tok:
    os.environ["AWS_SESSION_TOKEN"] = tok

# Build sessions bound to your region/creds
boto_sess = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
    region_name=REGION,
)
s3 = boto_sess.client("s3")
sm_sess = sagemaker.Session(boto_session=boto_sess)

# Sanity check (no secrets printed)
who = boto_sess.client("sts").get_caller_identity()
print("Account:", who["Account"])
print("Region :", sm_sess.boto_region_name)
print("Role   :", ROLE)

AWS_ACCESS_KEY_ID:  ········
AWS_SECRET_ACCESS_KEY:  ········
AWS_SESSION_TOKEN (press Enter if not using temp creds):  ········


Account: 167819473785
Region : us-east-2
Role   : arn:aws:iam::167819473785:role/SageMakerExecutionRole


In [3]:
# Container image + general config
PREFIX = "loan-default-xgb"
IMAGE_URI = image_uris.retrieve(framework="xgboost", region=REGION, version="1.7-1")
print("Training/Hosting container:", IMAGE_URI)

Training/Hosting container: 257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1


In [4]:
# Load Loan_default* from local, detect/clean target to 0/1

# Show where we are + visible files (handy for path issues)
print("cwd:", os.getcwd())
print("files:", [f for f in os.listdir(".") if not f.startswith(".")][:30])

# Locate and load the data file
cands = [f for f in os.listdir(".") if f.lower().startswith("loan_default")]
assert cands, "No file starting with 'Loan_default' found in this folder. If needed, run: !ls -al"
PATH = cands[0]
print("Loading:", PATH)

if PATH.lower().endswith((".xlsx", ".xls")):
    df = pd.read_excel(PATH)
else:
    # auto-detect delimiter (comma/semicolon/tab)
    df = pd.read_csv(PATH, sep=None, engine="python")

print("Shape:", df.shape)

# Choose/Detect the target column
TARGET = "Default"  # change this if you already know the exact name

if TARGET not in df.columns:
    lower_map = {c.lower(): c for c in df.columns}
    for guess in ["default", "is_default", "loan_default", "defaulter", "loan_status", "label", "target"]:
        if guess in lower_map:
            TARGET = lower_map[guess]
            print("Detected target column:", TARGET)
            break

assert TARGET in df.columns, f"Target column `{TARGET}` not found. First columns: {list(df.columns)[:12]}"

# Normalize target to clean 0/1
col = df[TARGET]

if pd.api.types.is_numeric_dtype(col):
    # drop NaN/inf then cast
    before = len(df)
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=[TARGET])
    df[TARGET] = df[TARGET].astype(float).round().astype(int)
    if before - len(df):
        print(f"Dropped {before - len(df)} rows with NaN/inf in target.")
else:
    # string normalize + mapping
    s = col.astype(str).str.strip().str.lower()
    mapping = {
        "1":1, "0":0,
        "yes":1, "no":0, "y":1, "n":0,
        "true":1, "false":0, "t":1, "f":0,
        "default":1, "defaulted":1,
        "non-default":0, "not default":0,
        "charged off":1, "charge-off":1, "charge off":1, "late":1, "delinquent":1,
        "paid":0, "paid off":0, "fully paid":0, "current":0,
    }
    def to01(x):
        if x in mapping: 
            return mapping[x]
        try:
            f = float(x)
            if f in (0.0, 1.0):
                return int(f)
        except Exception:
            pass
        return np.nan

    df[TARGET] = s.map(to01)
    unmapped = df.loc[df[TARGET].isna(), col.name].astype(str).str.lower().unique().tolist()
    before = len(df)
    df = df.dropna(subset=[TARGET])
    df[TARGET] = df[TARGET].astype(int)
    dropped = before - len(df)
    if dropped:
        print(f"Dropped {dropped} rows with unmapped target labels. Examples: {unmapped[:8]}")

# Final sanity print
print("Target counts:\n", df[TARGET].value_counts())
df.head(3)

cwd: /home/studio-lab-user/sagemaker-studiolab-notebooks
files: ['Getting Started.ipynb', 'images', 'winequality_combined.csv', 'train_noheader.csv', 'validation_noheader.csv', 'features_noheader.csv', 'predictions.csv', 'WineQuality_Container.ipynb', 'train.csv', 'val.csv', 'model.tar.gz', 'model_artifacts', 'loan-default-project', 'sm_model', 'LoanDefault_SageM.ipynb', 'Loan_default.csv', 'data']
Loading: Loan_default.csv
Shape: (10780, 18)
Dropped 1 rows with NaN/inf in target.
Target counts:
 Default
0    9505
1    1274
Name: count, dtype: int64


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56.0,85994.0,50587.0,520.0,80.0,4.0,15.23,36.0,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69.0,50432.0,124440.0,458.0,15.0,1.0,4.81,60.0,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46.0,84208.0,129188.0,451.0,26.0,3.0,21.17,24.0,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1


In [5]:
# One-hot encode; put label first (XGBoost CSV format requirement)
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])
X_enc = pd.get_dummies(X, drop_first=True)

# label first column
X_enc.insert(0, TARGET, y.values)

train_df, val_df = train_test_split(X_enc, test_size=0.2, random_state=42, stratify=y)
print("train:", train_df.shape, "val:", val_df.shape)

train: (8623, 10803) val: (2156, 10803)


In [6]:
# Write CSVs (no header/index) and upload to S3
os.makedirs("data", exist_ok=True)
train_path = "data/train.csv"
val_path   = "data/val.csv"
train_df.to_csv(train_path, header=False, index=False)
val_df.to_csv(val_path,   header=False, index=False)

train_s3 = f"s3://{BUCKET}/{PREFIX}/train/train.csv"
val_s3   = f"s3://{BUCKET}/{PREFIX}/val/val.csv"
s3.upload_file(train_path, BUCKET, f"{PREFIX}/train/train.csv")
s3.upload_file(val_path,   BUCKET, f"{PREFIX}/val/val.csv")

print("Uploaded:")
print("  ", train_s3)
print("  ", val_s3)

Uploaded:
   s3://loandefault-stutteringemo/loan-default-xgb/train/train.csv
   s3://loandefault-stutteringemo/loan-default-xgb/val/val.csv


In [7]:
# Configure Estimator (built-in XGBoost container) and train
xgb = Estimator(
    image_uri=IMAGE_URI,
    role=ROLE,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{BUCKET}/{PREFIX}/output",
    sagemaker_session=sm_sess,
)

xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=200,
    max_depth=5,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
)

xgb.fit({
    "train": TrainingInput(train_s3, content_type="text/csv"),
    "validation": TrainingInput(val_s3, content_type="text/csv"),
})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-09-01-11-29-12-683


2025-09-01 11:29:13 Starting - Starting the training job...
2025-09-01 11:29:47 Downloading - Downloading input data......
2025-09-01 11:30:47 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m[2025-09-01 11:31:47.828 ip-10-0-149-52.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-09-01 11:31:47.910 ip-10-0-149-52.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-09-01:11:31:48:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-09-01:11:31:48:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-01:11:31:48:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-09-01:11:31:48:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-09-01:11:31:48:INFO] Running XGBoost Sage

In [8]:
# Deploy endpoint (uses the same AWS-managed container)
ENDPOINT_NAME = f"loan-default-xgb-{int(time.time())}"
predictor: Predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=ENDPOINT_NAME,
)
predictor.serializer = CSVSerializer()
print("Endpoint InService:", ENDPOINT_NAME)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-09-01-11-40-56-077
INFO:sagemaker:Creating endpoint-config with name loan-default-xgb-1756726856
INFO:sagemaker:Creating endpoint with name loan-default-xgb-1756726856


------!Endpoint InService: loan-default-xgb-1756726856


In [9]:
# QUCIK TEST (numeric-only payload)
row = val_df.drop(columns=[TARGET]).iloc[[0]].copy()

bool_cols = row.select_dtypes(include=["bool"]).columns
if len(bool_cols):
    row[bool_cols] = row[bool_cols].astype(int)
row = row.astype(float)

resp = predictor.predict(row.to_csv(header=False, index=False))
prob = float(resp.decode("utf-8").strip())
print("Predicted default probability:", prob)

Predicted default probability: 0.4727606475353241


In [10]:
# BATCH SCORE (ensure numeric features)
def predict_in_byte_chunks(predictor, X_df, max_bytes=900_000, start_rows=256):
    """
    Send CSV in chunks that stay under max_bytes per request (default ~0.9MB).
    Adapts rows per chunk downward if the payload is too big.
    """
    # ensure numeric features
    X = X_df.copy()
    bool_cols = X.select_dtypes(include=["bool"]).columns
    if len(bool_cols):
        X[bool_cols] = X[bool_cols].astype(int)
    X = X.astype(float)

    scores = []
    n = len(X)
    i = 0
    rows = start_rows

    while i < n:
        # adaptively shrink rows until payload fits
        while True:
            j = min(i + rows, n)
            csv_payload = X.iloc[i:j].to_csv(header=False, index=False)
            size = len(csv_payload.encode("utf-8"))
            if size <= max_bytes or rows == 1:
                break
            rows = max(1, rows // 2)  # shrink and try again

        resp = predictor.predict(csv_payload)
        text = resp.decode("utf-8") if isinstance(resp, (bytes, bytearray)) else str(resp)
        chunk_scores = [float(s) for s in text.strip().split("\n") if s.strip()]
        scores.extend(chunk_scores)
        i = j  # advance
        # modestly grow rows again for next chunk
        rows = min(start_rows, rows * 2)

    return np.array(scores)

# use it
X_only = val_df.drop(columns=[TARGET])
scores = predict_in_byte_chunks(predictor, X_only, max_bytes=900_000, start_rows=256)
y_true = val_df[TARGET].values[:len(scores)]
auc = roc_auc_score(y_true, scores)
print("Validation AUC:", round(auc, 4), "| scored rows:", len(scores))

Validation AUC: 0.7037 | scored rows: 2156


In [11]:
# Save the feature order used at training time (label first in train/val; features are the rest)
feature_cols = [c for c in train_df.columns if c != TARGET]
pd.Series(feature_cols).to_csv("features_noheader.csv", index=False, header=False)
print("Saved features_noheader.csv with", len(feature_cols), "columns")

Saved features_noheader.csv with 10802 columns


In [12]:
# Example: invoke the endpoint from any script
smr = boto3.client("sagemaker-runtime", region_name=REGION)

def predict_proba_from_dict(row_dict):
    # row_dict: {feature_name: value, ...} covering ALL feature_cols
    # Order the values to match training
    vals = [row_dict[c] for c in feature_cols]
    # Booleans -> int, everything -> float
    vals = [int(v) if isinstance(v, bool) else float(v) for v in vals]
    payload = ",".join(str(v) for v in vals)
    resp = smr.invoke_endpoint(
        EndpointName=ENDPOINT_NAME,
        ContentType="text/csv",
        Body=payload,
    )
    prob = float(resp["Body"].read().decode("utf-8").strip())
    return prob

# example:
example = dict(val_df.drop(columns=[TARGET]).iloc[0])
print("Example prob:", predict_proba_from_dict(example))

Example prob: 0.4727606475353241


In [13]:
# Choose a threshold (report both default 0.5 and a tuned one)
y_true = val_df[TARGET].values[:len(scores)]
# find F1-optimal threshold
prec, rec, thr = precision_recall_curve(y_true, scores)
f1 = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-9)
best_idx = int(np.argmax(f1))
best_thr = float(thr[best_idx])
print("Best F1 threshold:", round(best_thr, 4))

def report_at(threshold):
    y_pred = (scores >= threshold).astype(int)
    print("\n=== Threshold:", threshold, "===")
    print(pd.DataFrame(confusion_matrix(y_true, y_pred),
                       index=["Actual 0","Actual 1"], columns=["Pred 0","Pred 1"]))
    print(classification_report(y_true, y_pred, digits=4))

report_at(0.5)       # default
report_at(best_thr)  # tuned

Best F1 threshold: 0.2351

=== Threshold: 0.5 ===
          Pred 0  Pred 1
Actual 0    1865      36
Actual 1     227      28
              precision    recall  f1-score   support

           0     0.8915    0.9811    0.9341      1901
           1     0.4375    0.1098    0.1755       255

    accuracy                         0.8780      2156
   macro avg     0.6645    0.5454    0.5548      2156
weighted avg     0.8378    0.8780    0.8444      2156


=== Threshold: 0.23505891859531403 ===
          Pred 0  Pred 1
Actual 0    1612     289
Actual 1     148     107
              precision    recall  f1-score   support

           0     0.9159    0.8480    0.8806      1901
           1     0.2702    0.4196    0.3287       255

    accuracy                         0.7973      2156
   macro avg     0.5931    0.6338    0.6047      2156
weighted avg     0.8395    0.7973    0.8154      2156



In [14]:
# Save predictions (local + S3)
pred_df = pd.DataFrame({"y_true": y_true, "score": scores})
pred_df.to_csv("predictions.csv", index=False)
s3.upload_file("predictions.csv", BUCKET, f"{PREFIX}/predictions.csv")
print("Saved predictions.csv and uploaded to s3://{}/{}".format(BUCKET, f"{PREFIX}/predictions.csv"))

Saved predictions.csv and uploaded to s3://loandefault-stutteringemo/loan-default-xgb/predictions.csv


In [15]:
# delete endpoint to stop billing
predictor.delete_endpoint()
# (optional) also delete endpoint config (same name)
sm_sess.sagemaker_client.delete_endpoint_config(EndpointConfigName=ENDPOINT_NAME)
print("Deleted endpoint & config:", ENDPOINT_NAME)

INFO:sagemaker:Deleting endpoint configuration with name: loan-default-xgb-1756726856
INFO:sagemaker:Deleting endpoint with name: loan-default-xgb-1756726856
