In [None]:
import sagemaker
# Fetch the SageMaker execution role and echo it for a quick sanity check.
# NOTE(review): `role` is not referenced again in the visible cells; the next
# cell performs the same lookup into `role_arn` — consider keeping only one.
role = sagemaker.get_execution_role()
print(role)

In [None]:
import sagemaker

# Get the default SageMaker execution role; `role_arn` is the value handed to
# `pipeline.upsert(...)` near the end of the notebook.
role_arn = sagemaker.get_execution_role()
role_arn

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sagemaker.workflow.function_step import step
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.parameters import ParameterInteger
from sagemaker.workflow.parameters import ParameterFloat

In [3]:
# Pipeline parameter "TrainDataPath": location of the training CSV,
# defaulting to the copy in the project bucket.
train_data_path = ParameterString(
    name="TrainDataPath",
    default_value="s3://srushanth-baride/binary-classification-with-a-bank-dataset/train.csv"
)

# Pipeline parameter "TestDataPath": location of the test CSV,
# defaulting to the copy in the project bucket.
test_data_path = ParameterString(
    name="TestDataPath",
    default_value="s3://srushanth-baride/binary-classification-with-a-bank-dataset/test.csv"
)

In [4]:
# Compute settings exposed as pipeline parameters; the `ingest_data` step
# decorator is configured with both of them.
instance_type = ParameterString(
    name="InstanceType",
    default_value="ml.m5.xlarge"
)

instance_count = ParameterInteger(
    name="InstanceCount",
    default_value=1
)

In [5]:
# Pipeline parameter "TrainTestSplitRatio": fraction of rows intended for the
# training split (default 0.8).
# NOTE(review): nothing in the visible cells passes this parameter to a step yet.
train_test_split_ratio = ParameterFloat(
    name="TrainTestSplitRatio",
    default_value=0.8
)

---

In [8]:
@step(
    instance_count=instance_count,
    instance_type=instance_type,
    display_name="Ingest Training Data",
    name="IngestTrainingData",
)
def ingest_data(s3_path: ParameterString) -> pd.DataFrame:
    """
    Load the CSV found at ``s3_path`` into a DataFrame and log its shape.

    Args:
        s3_path (ParameterString): Location of the CSV file; it is handed
            unchanged to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed contents of the CSV.
    """
    df = pd.read_csv(s3_path)
    # Emit the row/column count so the step's log shows what was loaded.
    print(f"Ingested data with shape: {df.shape}")
    return df

---

In [None]:
# Ad-hoc read of the training CSV in the notebook kernel (outside any pipeline step).
# NOTE(review): the module-level `train_df` is not used by any later visible
# cell — confirm whether this is leftover scratch work.
train_df = pd.read_csv("s3://srushanth-baride/binary-classification-with-a-bank-dataset/train.csv")

In [None]:
# @step(instance_type="ml.m5.xlarge", instance_count=1)
# def ingest_data(s3_path: str) -> pd.DataFrame:
#     train_df = pd.read_csv(s3_path)
#     return train_df

In [None]:
@step(instance_type="ml.m5.xlarge", instance_count=1)
def get_feature_column(train_df: pd.DataFrame) -> pd.Series:
    """
    Pull the ``y`` column out of ``train_df``.

    Despite the function's name, ``y`` is the column that ``preprocess_data``
    treats as the target.

    Args:
        train_df (pd.DataFrame): Frame that contains a ``y`` column.

    Returns:
        pd.Series: The ``y`` column.
    """
    y_column = train_df["y"]
    return y_column

In [None]:
# Feed the output of `ingest_data` into `get_feature_column` using a literal S3 path.
# NOTE(review): neither result is referenced by create_pipeline(); presumably
# exploratory wiring — confirm before keeping.
train_data_file_path: str = "s3://srushanth-baride/binary-classification-with-a-bank-dataset/train.csv"
step_ingest_data_result = ingest_data(train_data_file_path)
step_get_feature_column_result = get_feature_column(step_ingest_data_result)

In [None]:
@step(instance_type="ml.m5.xlarge", instance_count=1)
def preprocess_data(df: pd.DataFrame) -> tuple:
    """
    Label-encode the categorical columns and separate features from the target.

    The input frame is not modified; encoding is applied to a copy.

    Args:
        df (pd.DataFrame): Raw data containing every column listed in
            ``object_labels`` as well as ``id`` and ``y``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the encoded frame without the ``id``
        and ``y`` columns and ``y`` is the ``y`` column.

    Raises:
        KeyError: If a column in ``object_labels``, ``id`` or ``y`` is missing.
    """
    # Candidate columns for label encoding
    object_labels = [
        "job",
        "marital",
        "education",
        "default",
        "balance",
        "housing",
        "loan",
        "contact",
        "month",
        "poutcome"
    ]

    # Work on a copy so the caller's DataFrame is left untouched.
    encoded = df.copy()

    for column_name in object_labels:
        # A column that is already numeric (e.g. `balance`, if it holds numbers)
        # needs no encoding; label-encoding it would replace real values with
        # ranks that only exist for values seen at fit time.
        if pd.api.types.is_numeric_dtype(encoded[column_name]):
            continue
        encoded[column_name] = LabelEncoder().fit_transform(encoded[column_name])

    # Split features and target
    y = encoded["y"]
    X = encoded.drop(columns=["id", "y"])

    return X, y

In [None]:
@step(instance_type="ml.m5.xlarge", instance_count=1)
def split_data(
    X: pd.DataFrame, y: pd.Series, train_size: float = 0.8, random_state: int = 42
) -> tuple:
    """
    Split features and target into train and hold-out partitions.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target values aligned with ``X``.
        train_size (float): Fraction of rows assigned to the training
            partition. Defaults to 0.8, the value that was previously
            hard-coded.
        random_state (int): Seed for the shuffle, so the split is repeatable.
            Defaults to 42.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [None]:
import lightgbm as lgb

In [None]:
@step(instance_type="ml.m5.xlarge", instance_count=1)
def train_model(
    x_train: pd.DataFrame, x_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame
) -> lgb.LGBMClassifier:
    """
    Fit a LightGBM classifier with default settings.

    Args:
        x_train (pd.DataFrame): Training features.
        x_test (pd.DataFrame): Hold-out features, used only as the evaluation set.
        y_train (pd.DataFrame): Training target.
        y_test (pd.DataFrame): Hold-out target, used only as the evaluation set.

    Returns:
        lgb.LGBMClassifier: The fitted classifier.
    """
    # The hold-out split is supplied as the evaluation set during fitting.
    evaluation_sets = [(x_test, y_test)]

    classifier = lgb.LGBMClassifier()
    classifier.fit(x_train, y_train, eval_set=evaluation_sets)

    return classifier

In [None]:
@step(instance_type="ml.m5.xlarge", instance_count=1)
def make_predictions(model: lgb.LGBMClassifier, test_data: pd.DataFrame) -> np.ndarray:
    """
    Score ``test_data`` with a fitted classifier.

    Args:
        model (lgb.LGBMClassifier): Fitted classifier.
        test_data (pd.DataFrame): Rows to score; must contain an ``id`` column,
            which is removed before scoring.

    Returns:
        np.ndarray: For each row, the last column of ``model.predict_proba``.
    """
    features = test_data.drop(columns=["id"])
    class_probabilities = model.predict_proba(features)
    # Keep only the final probability column.
    return class_probabilities[:, -1]

In [None]:
from sagemaker.workflow.step_outputs import get_step

In [None]:
def create_pipeline():
    """
    Assemble the "BankMarketingPipeline" from the notebook's @step functions.

    Data locations and the ingest steps' compute settings come from the
    module-level pipeline parameters (``train_data_path``, ``test_data_path``,
    ``instance_type``, ``instance_count``). They are registered on the
    pipeline so they can be overridden when an execution is started.

    Returns:
        Pipeline: The assembled pipeline (not yet upserted or started).
    """

    # The test set gets its own ingest step so it is read inside the pipeline
    # from the TestDataPath parameter, instead of being downloaded into the
    # notebook kernel at definition time. `ingest_data` cannot be reused for
    # this because its step name is fixed and step names must be unique.
    @step(
        name="IngestTestData",
        display_name="Ingest Test Data",
        instance_type=instance_type,
        instance_count=instance_count,
    )
    def ingest_test_data(s3_path: str) -> pd.DataFrame:
        """Read the test CSV at ``s3_path`` into a DataFrame."""
        return pd.read_csv(s3_path)

    # Ingest data
    step_ingest_data = ingest_data(train_data_path)

    # Preprocess data
    step_preprocess = preprocess_data(step_ingest_data)

    # Split data
    step_split = split_data(
        step_preprocess[0],
        step_preprocess[1]
    )

    # Train model
    step_train = train_model(
        step_split[0],
        step_split[1],
        step_split[2],
        step_split[3]
    )

    # Make predictions
    # NOTE(review): the test frame reaches make_predictions without the label
    # encoding applied to the training data. Confirm the model can score it,
    # or add a preprocessing step for the test set that reuses the encoders
    # fitted on the training data.
    step_ingest_test = ingest_test_data(test_data_path)
    step_predict = make_predictions(
        step_train,
        step_ingest_test
    )

    # No explicit add_depends_on calls are needed: each step consumes the
    # output of the previous one, which already defines the execution order.

    # Create and return pipeline
    pipeline = Pipeline(
        name="BankMarketingPipeline",
        # Every pipeline parameter referenced by a step or a step decorator
        # has to be declared here.
        parameters=[train_data_path, test_data_path, instance_type, instance_count],
        steps=[
            step_ingest_data,
            step_preprocess,
            step_split,
            step_train,
            step_ingest_test,
            step_predict,
        ],
        sagemaker_session=sagemaker.Session()
    )

    return pipeline

In [None]:
# Build the pipeline object; it is registered and started in the cells below.
pipeline = create_pipeline()

In [None]:
# Create or update the pipeline definition, using the execution role fetched
# at the top of the notebook.
pipeline.upsert(role_arn=role_arn)

In [None]:
# Start a run of the pipeline and keep the returned handle in `execution`.
execution = pipeline.start()

In [None]:
# Print the pipeline definition for inspection.
print(pipeline.definition())