In [None]:
import sagemaker
# Look up the SageMaker execution role for this environment and print it.
# NOTE(review): `role` is not read again in the visible cells; the next cell
# fetches the same value as `role_arn`, which is what `pipeline.upsert` uses.
role = sagemaker.get_execution_role()
print(role)

In [None]:
import sagemaker

# Execution role returned by SageMaker for this environment. It is passed to
# `pipeline.upsert(role_arn=...)` further down. The bare expression on the
# last line displays the value as the cell output.
role_arn = sagemaker.get_execution_role()
role_arn

In [None]:
import pandas as pd
from sagemaker.workflow.function_step import step
from sagemaker.workflow.step_outputs import get_step
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.parameters import ParameterInteger
from sagemaker.workflow.parameters import ParameterFloat

In [None]:
# Pipeline parameters holding the S3 URIs of the input CSVs. They are passed to
# the ingest steps in `create_pipeline` and listed in the Pipeline's
# `parameters`; the `default_value` below is what each parameter is declared with.
train_data_path = ParameterString(
    name="TrainDataPath",
    default_value="s3://srushanth-baride/binary-classification-with-a-bank-dataset/train.csv"
)

test_data_path = ParameterString(
    name="TestDataPath",
    default_value="s3://srushanth-baride/binary-classification-with-a-bank-dataset/test.csv"
)

In [None]:
# Compute settings shared by every @step-decorated function in this notebook:
# each decorator receives these two parameters as its instance_type/instance_count.
instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
instance_count = ParameterInteger(name="InstanceCount", default_value=1)

In [None]:
# NOTE(review): in the visible cells this parameter is neither consumed by a
# step nor included in the Pipeline's `parameters` list - confirm whether a
# split step is still to be added.
train_test_split_ratio = ParameterFloat(name="TrainTestSplitRatio", default_value=0.8)

---

In [None]:
@step(
    name="IngestTrainingData",
    display_name="Ingest Training Data",
    instance_type=instance_type,
    instance_count=instance_count,
)
def ingest_train_data(s3_path: ParameterString) -> pd.DataFrame:
    """Read the training CSV located at ``s3_path`` into a DataFrame.

    Args:
        s3_path (ParameterString): Location of the training CSV. The value is
            handed directly to ``pd.read_csv``.
            NOTE(review): presumably this arrives as a plain string when the
            step actually executes - confirm.

    Returns:
        pd.DataFrame: The parsed training data.
    """
    train_df = pd.read_csv(s3_path)
    # Log the (rows, columns) shape so the step output shows what was loaded.
    print(f"Ingested data with shape: {train_df.shape}")
    return train_df

@step(
    name="IngestTestingData",
    display_name="Ingest Testing Data",
    instance_type=instance_type,
    instance_count=instance_count,
)
def ingest_test_data(s3_path: ParameterString) -> pd.DataFrame:
    """Read the testing CSV located at ``s3_path`` into a DataFrame.

    Args:
        s3_path (ParameterString): Location of the testing CSV. The value is
            handed directly to ``pd.read_csv``.
            NOTE(review): presumably this arrives as a plain string when the
            step actually executes - confirm.

    Returns:
        pd.DataFrame: The parsed testing data.
    """
    test_df = pd.read_csv(s3_path)
    # Log the (rows, columns) shape so the step output shows what was loaded.
    print(f"Ingested data with shape: {test_df.shape}")
    return test_df

In [None]:
@step(
    name="ExtractFeatures",
    display_name="Extract Feature Columns",
    instance_type=instance_type,
    instance_count=instance_count,
)
def extract_features_column(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``"y"`` column removed.

    Args:
        df (pd.DataFrame): Input frame; it must contain a ``"y"`` column,
            otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns:
        pd.DataFrame: Every column of ``df`` except ``"y"``.
    """
    feature_frame = df.drop(columns=["y"])
    # The logged value is the full (rows, columns) shape tuple.
    print(f"Extracted Features columns with {feature_frame.shape} samples")
    return feature_frame

@step(
    name="ExtractTarget",
    display_name="Extract Target Column",
    instance_type=instance_type,
    instance_count=instance_count,
)
def extract_target_column(df: pd.DataFrame) -> pd.Series:
    """Return the ``"y"`` column of ``df``.

    Args:
        df (pd.DataFrame): Input frame; it must contain a ``"y"`` column.

    Returns:
        pd.Series: The ``"y"`` column selected from ``df``.
    """
    target_series = df["y"]
    print(f"Extracted target column with {len(target_series)} samples")
    return target_series

---

In [None]:
def create_pipeline():
    """Assemble the ``BankMarketingPipeline`` from the ingest and extract steps.

    Reads the module-level parameters ``train_data_path``, ``test_data_path``,
    ``instance_type`` and ``instance_count`` and wires four steps together:
    both ingest steps, plus feature and target extraction fed by the training
    ingest output.

    Returns:
        Pipeline: The pipeline object, bound to a new ``sagemaker.Session()``.
    """
    # Calling the @step-decorated functions yields step outputs that are then
    # passed on as inputs to the downstream steps.
    train_output = ingest_train_data(train_data_path)
    test_output = ingest_test_data(test_data_path)

    features_output = extract_features_column(train_output)
    target_output = extract_target_column(train_output)

    # Both extract steps are explicitly ordered after the training ingest step.
    # The test ingest step has no downstream consumers, so nothing is attached
    # to it.
    # NOTE(review): passing `train_output` as an argument probably already
    # implies this ordering - confirm against the SageMaker docs before
    # dropping these calls.
    ingest_train_step = get_step(train_output)
    for downstream_output in (features_output, target_output):
        get_step(downstream_output).add_depends_on([ingest_train_step])

    return Pipeline(
        name="BankMarketingPipeline",
        parameters=[train_data_path, test_data_path, instance_type, instance_count],
        steps=[train_output, test_output, features_output, target_output],
        sagemaker_session=sagemaker.Session(),
    )

In [None]:
# Build the Pipeline object. This cell only constructs it; registration
# (`upsert`) and execution (`start`) happen in the following cells.
pipeline = create_pipeline()

In [None]:
# Register the pipeline definition with SageMaker under the execution role:
# upsert creates the pipeline, or updates it if one with this name exists.
pipeline.upsert(role_arn=role_arn)

In [None]:
# Start a run of the registered pipeline. No parameter overrides are passed
# here, and the returned execution handle is kept in `execution`.
execution = pipeline.start()

---