# Heart Disease Prediction with SageMaker

This notebook demonstrates a complete pipeline for training and deploying a machine learning model for heart disease prediction using AWS SageMaker.

## Objectives:
1. Prepare heart disease data
2. Train an AdaBoostClassifier model in SageMaker
3. Deploy the model as an endpoint
4. Test the model on new data


## 1. Install Dependencies and Import Libraries


In [None]:
# Install required libraries
%pip install scikit-learn pandas numpy sagemaker boto3


In [None]:
import pandas as pd
import numpy as np
import json
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sagemaker
import boto3
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer


## 2. Data Preparation

Load and prepare heart disease data.


In [None]:
# Locate the heart disease CSV; fall back to generated data when no file exists.
# Each candidate path is paired with the message printed when that file is missing.
candidate_sources = (
    ('data/heart.csv', "data/heart.csv file not found. Trying alternative paths..."),
    ('../data/heart.csv', "Heart data not found in data folder. Creating synthetic data..."),
)

for csv_path, missing_message in candidate_sources:
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(missing_message)
    else:
        print(f"Data loaded from {csv_path}")
        break
else:
    # Neither file exists: build a random stand-in with the expected column names.
    # The global NumPy seed is fixed and columns are drawn in declaration order,
    # so the generated frame is the same on every run.
    np.random.seed(42)
    n_samples = 1000

    def _draw_ints(low, high):
        """Draw n_samples integers from the half-open range [low, high)."""
        return np.random.randint(low, high, n_samples)

    df = pd.DataFrame({
        'age': _draw_ints(29, 80),
        'sex': _draw_ints(0, 2),
        'cp': _draw_ints(0, 4),
        'trestbps': _draw_ints(94, 201),
        'chol': _draw_ints(126, 565),
        'fbs': _draw_ints(0, 2),
        'restecg': _draw_ints(0, 3),
        'thalach': _draw_ints(71, 203),
        'exang': _draw_ints(0, 2),
        'oldpeak': np.random.uniform(0, 6.2, n_samples),
        'slope': _draw_ints(0, 3),
        'ca': _draw_ints(0, 4),
        'thal': _draw_ints(0, 3),
        'target': _draw_ints(0, 2),
    })

    # Keep a copy of the generated data next to the notebook
    df.to_csv('heart.csv', index=False)
    print("Synthetic data saved to heart.csv")

print(f"Dataset size: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()


In [None]:
# Dataset information
print("Dataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nStatistics:")
print(df.describe())
print("\nTarget variable distribution:")
print(df['target'].value_counts())


In [None]:
# Separate the label column from the predictors
y = df['target']
X = df.drop(columns='target')

# Hold out 30% of the rows, then halve that hold-out into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

for split_label, split_frame in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_label} set size: {split_frame.shape}")

# Standardize the features; the scaler is fitted on the training split only and
# then applied unchanged to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

print("\nNormalization completed successfully")


In [None]:
# Build one table per split with the label in column 0 followed by the scaled
# features; the training script reads column 0 back as the target.
def _label_first(labels, scaled_features):
    """Return a frame whose first column is the label and the rest are features."""
    return pd.concat(
        [labels.reset_index(drop=True), pd.DataFrame(scaled_features)],
        axis=1,
    )

train_data = _label_first(y_train, X_train_scaled)
val_data = _label_first(y_val, X_val_scaled)
test_data = _label_first(y_test, X_test_scaled)

# Write every split as a bare CSV: no header row, no index column
exports = {
    'train.csv': train_data,
    'validation.csv': val_data,
    'test.csv': test_data,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name, index=False, header=False)

print("Data saved for SageMaker:")
for csv_name, frame in exports.items():
    print(f"- {csv_name}: {frame.shape}")

# Preview the training table
print("\nFirst rows of training data:")
print(train_data.head())


## 3. SageMaker Setup

Configure SageMaker session and upload data to S3.


In [None]:
# SageMaker session, its default S3 bucket, the key prefix used for this project,
# and the execution role handed to the training job
prefix = "heart-disease-prediction"
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

for setting_name, setting_value in (
    ("S3 Bucket", bucket),
    ("Prefix", prefix),
    ("Role", role),
):
    print(f"{setting_name}: {setting_value}")


In [None]:
# Upload data to S3
train_path = session.upload_data(path='train.csv', bucket=bucket, key_prefix=f'{prefix}/data')
val_path = session.upload_data(path='validation.csv', bucket=bucket, key_prefix=f'{prefix}/data')
test_path = session.upload_data(path='test.csv', bucket=bucket, key_prefix=f'{prefix}/data')

print(f"Training data uploaded to: {train_path}")
print(f"Validation data uploaded to: {val_path}")
print(f"Test data uploaded to: {test_path}")

# Check upload
!aws s3 ls {bucket}/{prefix}/data --recursive


## 4. Create Training Script

Create a script for training the model in SageMaker.


In [None]:
# Create training script
# The literal below is written verbatim to train_script.py. The file serves two roles:
#   * run as __main__, it trains an AdaBoost classifier and saves model.joblib;
#   * imported, it exposes model_fn / input_fn / predict_fn / output_fn for inference.
# The literal starts directly with the shebang so that it is the first line of the file.
training_script = '''#!/usr/bin/env python3

import argparse
import joblib
import os
import pandas as pd
import numpy as np
import io
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import json

def model_fn(model_dir):
    """Load the fitted model saved as model.joblib inside model_dir."""
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return model

def input_fn(request_body, request_content_type):
    """Parse a request payload into a 2-D array of feature rows.

    Supports 'text/csv' (one sample per line, no header row) and
    'application/json' (a list of samples, or a single flat sample).
    Parameters after ';' in the content type, such as a charset, are ignored.
    Raises ValueError for any other content type.
    """
    # Payloads may arrive as raw bytes depending on the caller; normalize to text
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode('utf-8')

    # Compare only the media type, e.g. "text/csv; charset=utf-8" -> "text/csv"
    media_type = (request_content_type or '').split(';')[0].strip().lower()

    if media_type == 'text/csv':
        # For CSV data
        data = pd.read_csv(io.StringIO(request_body), header=None)
        return data.values
    elif media_type == 'application/json':
        # For JSON data; a single flat sample becomes one row so that
        # predict() always receives 2-D input
        data = json.loads(request_body)
        return np.atleast_2d(np.array(data))
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model):
    """Return the model's class predictions for the parsed input rows."""
    predictions = model.predict(input_data)
    return predictions

def output_fn(prediction, content_type):
    """Serialize predictions as a JSON list or a comma-separated line.

    Raises ValueError for any other requested content type.
    """
    if content_type == 'application/json':
        return json.dumps(prediction.tolist())
    elif content_type == 'text/csv':
        return ','.join(map(str, prediction))
    else:
        raise ValueError(f"Unsupported content type: {content_type}")

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--learning-rate', type=float, default=1.0)
    parser.add_argument('--max-depth', type=int, default=1)
    parser.add_argument('--random-state', type=int, default=42)
    
    # SageMaker arguments (defaults come from the SM_* environment variables)
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    
    args = parser.parse_args()
    
    # Load data
    print("Loading data...")
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'), header=None)
    val_data = pd.read_csv(os.path.join(args.validation, 'validation.csv'), header=None)
    
    # Split into features and target variable (column 0 is the target)
    X_train = train_data.iloc[:, 1:].values
    y_train = train_data.iloc[:, 0].values
    X_val = val_data.iloc[:, 1:].values
    y_val = val_data.iloc[:, 0].values
    
    print(f"Training set size: {X_train.shape}")
    print(f"Validation set size: {X_val.shape}")
    
    # Create and train model
    print("Training model...")
    base_estimator = DecisionTreeClassifier(max_depth=args.max_depth, random_state=args.random_state)
    model = AdaBoostClassifier(
        base_estimator=base_estimator,
        n_estimators=args.n_estimators,
        learning_rate=args.learning_rate,
        random_state=args.random_state
    )
    
    model.fit(X_train, y_train)
    
    # Evaluate model
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)
    
    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")
    
    # Save model
    print("Saving model...")
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)
    
    print("Training completed successfully!")
'''

# Save script (explicit encoding so the result does not depend on the platform default)
with open('train_script.py', 'w', encoding='utf-8') as f:
    f.write(training_script)

print("Training script created: train_script.py")


## 5. Train Model in SageMaker

Create and run a training job in SageMaker.


In [None]:
# Create SageMaker Estimator
# The training instance settings are defined once and reused for both the estimator
# and the printed summary, so the two cannot drift apart.
TRAINING_INSTANCE_TYPE = 'ml.m4.xlarge'
TRAINING_INSTANCE_COUNT = 1

# Instance types to try if the training job fails with ResourceLimitExceeded
ALTERNATIVE_TRAINING_INSTANCE_TYPES = {
    'ml.t3.medium': 'cheapest, good for small datasets',
    'ml.t3.large': 'slightly more powerful',
    'ml.m5.large': 'if you have quota for it',
    'ml.c5.large': 'compute optimized',
}

sklearn_estimator = SKLearn(
    entry_point='train_script.py',
    framework_version='1.0-1',
    py_version='py3',
    instance_type=TRAINING_INSTANCE_TYPE,
    instance_count=TRAINING_INSTANCE_COUNT,
    role=role,
    sagemaker_session=session,
    # Keys match the argparse options defined in train_script.py
    hyperparameters={
        'n-estimators': 100,
        'learning-rate': 1.0,
        'max-depth': 1,
        'random-state': 42
    }
)

print("SageMaker Estimator created")
print(f"Instance type: {TRAINING_INSTANCE_TYPE}")
print(f"Instance count: {TRAINING_INSTANCE_COUNT}")
print("\nIf you get ResourceLimitExceeded error, try changing instance_type to:")
# Print the alternatives that the message above announces
for alternative_type, note in ALTERNATIVE_TRAINING_INSTANCE_TYPES.items():
    print(f"  - {alternative_type} ({note})")

In [None]:
# Launch the training job and block until it finishes.
# Each channel name maps to the S3 location of the matching CSV file.
training_channels = {
    'train': train_path,
    'validation': val_path,
}

print("Starting model training...")
sklearn_estimator.fit(training_channels, wait=True)

print("Training completed!")
print(f"Model saved to: {sklearn_estimator.model_data}")


## 6. Deploy Model

Deploy the trained model as an endpoint for predictions.


In [None]:
# Stand up an endpoint for the trained model.
# Requests are serialized as CSV and responses are parsed from JSON.
print("Deploying model...")
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type='ml.t3.medium',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)
predictor = sklearn_estimator.deploy(**endpoint_settings)

print("Model deployed successfully!")
print(f"Endpoint name: {predictor.endpoint_name}")


## 7. Test Model

Test the deployed model on test data.


In [None]:
# Quick check of the endpoint on a handful of held-out rows
print("Testing model...")

# First few scaled feature rows and their true labels (features only are sent)
preview_count = 10
test_features = X_test_scaled[:preview_count]
test_labels = y_test[:preview_count].values

print(f"Testing on {len(test_features)} samples")
print(f"True labels: {test_labels}")

# Ask the endpoint for predictions
predictions = predictor.predict(test_features)
print(f"Model predictions: {predictions}")

# Share of the previewed rows that were classified correctly
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy on test data: {accuracy:.4f}")


In [None]:
# Score the endpoint on the complete test split
print("Detailed model evaluation...")

# One request carrying every scaled test row
all_predictions = predictor.predict(X_test_scaled)

accuracy = accuracy_score(y_test, all_predictions)
print(f"Overall accuracy: {accuracy:.4f}")

# Each remaining metric is printed under its own heading, in order
for heading, metric_fn in (
    ("\nConfusion matrix:", confusion_matrix),
    ("\nClassification report:", classification_report),
):
    print(heading)
    print(metric_fn(y_test, all_predictions))


## 8. Prediction Examples

Show prediction examples for individual patients.


In [None]:
# Walk through the first five test patients one request at a time
def _diagnosis_text(label):
    """Map a 0/1 label to the wording used in the printed summary."""
    return 'Heart disease present' if label == 1 else 'No heart disease'

print("Prediction examples:")
print("=" * 50)

for patient_number in range(1, 6):
    row = patient_number - 1
    true_label = y_test.iloc[row]
    # A one-row slice keeps the 2-D shape the endpoint expects
    prediction = predictor.predict(X_test_scaled[row:row + 1])[0]
    verdict = '✓' if true_label == prediction else '✗'

    print(f"Patient {patient_number}:")
    print(f"  True diagnosis: {_diagnosis_text(true_label)}")
    print(f"  Model prediction: {_diagnosis_text(prediction)}")
    print(f"  Correct: {verdict}")
    print()


## 9. Cleanup Resources

⚠️ **IMPORTANT**: Don't forget to delete the endpoint after finishing work to avoid additional costs!


In [None]:
# Delete endpoint (uncomment to execute)
# NOTE(review): presumably left commented out so that running all cells does not
# remove the endpoint before the test cells have been used — confirm this is intended.
# predictor.delete_endpoint()
# print("Endpoint deleted")