In [5]:
# Cell 1: Import Libraries and Set Up SageMaker Session
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker.sklearn.estimator import SKLearn
import io
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import json

# Initialize SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = 'sagemaker-us-east-2-209479276378'  # Replace with your S3 bucket name
prefix = 'credit-limit-prediction'

# S3 client for direct operations
s3_client = boto3.client('s3')

In [6]:
# Cell 2: Load and Prepare the Dataset
cards_df = pd.read_csv(f's3://{bucket}/data/cards_data.csv')
users_df = pd.read_csv(f's3://{bucket}/data/users_data.csv')

# Merge datasets
merged_df = pd.merge(cards_df, users_df, left_on='client_id', right_on='id', suffixes=('_card', '_user'))

# Data Cleaning
drop_cols = ['id_card', 'client_id', 'card_number', 'cvv', 'address', 
             'id_user', 'latitude', 'longitude', 'card_on_dark_web']
cleaned_df = merged_df.drop(columns=drop_cols)
cleaned_df = cleaned_df.dropna()

# Convert monetary columns to float
money_cols = ['credit_limit', 'per_capita_income', 'yearly_income', 'total_debt']
for col in money_cols:
    cleaned_df[col] = cleaned_df[col].replace('[\$,]', '', regex=True).astype(float)

# Handle dates
cleaned_df['expires_year'] = pd.to_datetime(cleaned_df['expires'], format='%m/%Y').dt.year
cleaned_df['acct_open_year'] = pd.to_datetime(cleaned_df['acct_open_date'], format='%m/%Y').dt.year
cleaned_df = cleaned_df.drop(columns=['expires', 'acct_open_date'])

# Feature Engineering
cleaned_df['debt_to_income'] = cleaned_df['total_debt'] / cleaned_df['yearly_income'].replace(0, np.nan)
cleaned_df['account_age'] = 2025 - cleaned_df['acct_open_year']
cleaned_df['income_per_card'] = cleaned_df['yearly_income'] / cleaned_df['num_credit_cards'].replace(0, np.nan)

# Encode categorical variables
cat_columns = ['card_brand', 'card_type', 'has_chip', 'gender']
cleaned_df = pd.get_dummies(cleaned_df, columns=cat_columns, drop_first=True)

# Features and target
features = [col for col in cleaned_df.columns if col != 'credit_limit']
X = cleaned_df[features]
y = cleaned_df['credit_limit']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Cell 3: Upload Data to S3
# Save scaled data to CSV strings without headers
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
train_df['credit_limit'] = y_train.values
test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
test_df['credit_limit'] = y_test.values

train_csv_buffer = io.StringIO()
test_csv_buffer = io.StringIO()
train_df.to_csv(train_csv_buffer, index=False, header=False)
test_df.to_csv(test_csv_buffer, index=False, header=False)

# Upload to S3
s3_client.put_object(
    Bucket=bucket,
    Key=f'{prefix}/train/train.csv',
    Body=train_csv_buffer.getvalue()
)
s3_client.put_object(
    Bucket=bucket,
    Key=f'{prefix}/test/test.csv',
    Body=test_csv_buffer.getvalue()
)

s3_train_path = f's3://{bucket}/{prefix}/train/train.csv'
s3_test_path = f's3://{bucket}/{prefix}/test/test.csv'
print(f"Training data uploaded to: {s3_train_path}")
print(f"Test data uploaded to: {s3_test_path}")

# Save scaler
import joblib
scaler_filename = 'scaler.joblib'
joblib.dump(scaler, scaler_filename)
s3_client.upload_file(scaler_filename, bucket, f'{prefix}/scaler/scaler.joblib')

Training data uploaded to: s3://sagemaker-us-east-2-209479276378/credit-limit-prediction/train/train.csv
Test data uploaded to: s3://sagemaker-us-east-2-209479276378/credit-limit-prediction/test/test.csv


In [8]:
# Cell 4: Define the Training Script
with open('train.py', 'w') as f:
    f.write('''
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import json

def model_fn(model_dir):
    """Load the trained model from the model directory."""
    model = joblib.load(os.path.join(model_dir, 'model.joblib'))
    return model

def input_fn(request_body, request_content_type):
    """Parse the input data for prediction."""
    if request_content_type == 'application/json':
        input_data = json.loads(request_body)
        return np.array(input_data['instances'])
    raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model):
    """Run prediction using the loaded model."""
    return model.predict(input_data)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args, _ = parser.parse_known_args()

    # Load training data
    train_files = [os.path.join(args.train, f) for f in os.listdir(args.train) if f.endswith('.csv')]
    train_data = pd.concat([pd.read_csv(f, header=None) for f in train_files])
    X_train = train_data.iloc[:, :-1].values
    y_train = train_data.iloc[:, -1].values

    # Train the model
    model = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2,
        max_features='sqrt',
        random_state=42
    )
    model.fit(X_train, y_train)

    # Save the model
    joblib.dump(model, os.path.join(args.model_dir, 'model.joblib'))
    ''')

In [9]:
# Cell 5: Train the Model on SageMaker
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='1.0-1',
    py_version='py3',
    sagemaker_session=sagemaker_session,
    dependencies=[scaler_filename]
)

sklearn_estimator.fit({'train': s3_train_path})

2025-04-15 19:46:34 Starting - Starting the training job...
2025-04-15 19:46:56 Starting - Preparing the instances for training...
2025-04-15 19:47:21 Downloading - Downloading input data...
2025-04-15 19:47:46 Downloading - Downloading the training image......
2025-04-15 19:48:57 Training - Training image download completed. Training in progress.
2025-04-15 19:48:57 Uploading - Uploading generated training model[34m2025-04-15 19:48:51,552 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-04-15 19:48:51,556 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-15 19:48:51,560 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-04-15 19:48:51,578 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-04-15 19:48:51,867 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m


In [10]:
# Cell 6: Deploy the Model
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='credit-limit-endpoint'
)

------!

In [16]:
# Cell 7: Test the Endpoint
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

# Prepare test data
test_input = {"instances": X_test_scaled.tolist()}
predictions = predictor.predict(test_input)
predicted_credit_limits = [float(pred) for pred in predictions]

# Example Prediction
example_input = {
    'current_age': 22,
    'retirement_age': 66,
    'birth_year': 1966,
    'birth_month': 11,
    'per_capita_income': 29278,
    'yearly_income': 59696,
    'total_debt': 127613,
    'credit_score': 787,
    'num_credit_cards': 1,
    'num_cards_issued': 2,
    'year_pin_last_changed': 2008,
    'expires_year': 2022,
    'acct_open_year': 2002,
    'debt_to_income': 127613/59696,
    'account_age': 2025-2002,
    'income_per_card': 59696/5,
    'card_brand_Visa': 1,
    'card_type_Debit': 0,
    'card_type_Credit': 1,
    'has_chip_YES': 1,
    'gender_Female': 1
}

# Ensure all features are present
input_df = pd.DataFrame([example_input])
for col in features:
    if col not in input_df.columns:
        input_df[col] = 0
input_df = input_df[features]
input_scaled = scaler.transform(input_df)
input_json = {"instances": input_scaled.tolist()}
prediction = predictor.predict(input_json)
print(f"\nPredicted Credit Limit: ${float(prediction[0]):.2f}")



Predicted Credit Limit: $14622.12


In [17]:
# Cell 8: Clean Up
predictor.delete_endpoint()