## Initial setup

In [3]:
import pandas as pd

# Path to the penguins dataset, given relative to the current working directory.
DATA_FILEPATH = "../penguins.csv"

# Load the CSV into a DataFrame and print its first five rows as plain text.
df = pd.read_csv(filepath_or_buffer=DATA_FILEPATH)
print(df.head(n=5))

  species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen              39.1             18.7              181.0   
1  Adelie  Torgersen              39.5             17.4              186.0   
2  Adelie  Torgersen              40.3             18.0              195.0   
3  Adelie  Torgersen               NaN              NaN                NaN   
4  Adelie  Torgersen              36.7             19.3              193.0   

   body_mass_g     sex  
0       3750.0    MALE  
1       3800.0  FEMALE  
2       3250.0  FEMALE  
3          NaN     NaN  
4       3450.0  FEMALE  


We can run this notebook in [Local Mode](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines-local-mode.html) to test the pipeline in your local environment before using SageMaker. To run the code in Local Mode, set the `LOCAL_MODE` constant to `True`.

In [4]:
# Set to True to run the pipeline in SageMaker Local Mode; set to False to run it on SageMaker.
LOCAL_MODE = True

Let's load the S3 bucket name from the environment variables:

In [5]:
import os

# Name of the S3 bucket, read from the BUCKET environment variable.
# Indexing os.environ raises KeyError if the variable is not set.
bucket = os.environ["BUCKET"]

# S3 URI built from the bucket name followed by a fixed "penguins" prefix.
S3_LOCATION = f"s3://{bucket}/penguins"

Let's create a configuration dictionary with different settings depending on whether we are running the pipeline in Local Mode or not:

In [8]:
import logging
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

# A regular SageMaker pipeline session is only created when the pipeline is
# not running in Local Mode; otherwise this stays `None`. The same truthiness
# test on LOCAL_MODE is used everywhere below so the session and the rest of
# the configuration can never disagree about the mode.
pipeline_session = None if LOCAL_MODE else PipelineSession(default_bucket=bucket)

# Settings used throughout the notebook. Only the session and the instance
# type depend on the mode; the remaining entries are the same in both modes.
config = {
    "session": (
        LocalPipelineSession(default_bucket=bucket) if LOCAL_MODE else pipeline_session
    ),
    "instance_type": "local" if LOCAL_MODE else "ml.m5.xlarge",
    "image": None,
    "framework_version": "2.11",
    "py_version": "py39",
}

# By default, the SageMaker SDK logs events related to the default
# configuration at the INFO level. To keep these messages from cluttering
# the output of this notebook's cells, we raise the logging level of the
# "sagemaker.config" logger to ERROR.
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)

Let's now initialize a few variables that we'll need throughout the notebook:

In [11]:
import boto3
import sagemaker

# High-level SageMaker SDK session, built with the SDK's default settings.
sagemaker_session = sagemaker.session.Session()

# Region name reported by a default-constructed boto3 session.
region = boto3.Session().region_name

# Low-level boto3 clients for the SageMaker and IAM services.
sagemaker_client = boto3.client(service_name="sagemaker")
iam_client = boto3.client(service_name="iam")

print(region)

eu-north-1
