## 1. Convert text to BERT Features

In [36]:
import boto3
import sagemaker
import pandas as pd

# Project bucket that holds the raw review data read by the cells below.
bucket = "sagemaker-team4-bucket"

# SageMaker SDK session and the execution role used to launch jobs.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Region comes from the default boto3 configuration; the account id from STS.
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Low-level SageMaker client, passed to the Experiment/Trial APIs further down.
sm = boto3.Session().client("sagemaker", region_name=region)

2024-04-20 23:35:11.607091: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2024-04-20 23:35:11.608431: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# List the top level of the project bucket. Interpolating `bucket` (defined in
# the setup cell) keeps this listing in sync if the bucket name is changed.
!aws s3 ls s3://{bucket}/

                           PRE athena/
                           PRE ryanair-data/


In [4]:
# List the raw-data prefix of the project bucket. Uses the `bucket` variable
# rather than repeating the literal bucket name.
!aws s3 ls s3://{bucket}/ryanair-data/

                           PRE parquet/
2024-04-20 22:48:14    1908135 ryanair_reviews.csv


In [5]:
# S3 prefix handed to the processing job as its raw input.
raw_input_data_s3_uri = f"s3://{bucket}/ryanair-data/"
print(raw_input_data_s3_uri)

s3://sagemaker-team4-bucket/ryanair-data/


In [6]:
# Athena staging prefix inside the project bucket.
raw_input_data_athena_uri = f"s3://{bucket}/athena/staging"
print(raw_input_data_athena_uri)

s3://sagemaker-team4-bucket/athena/staging


In [7]:
# Print the processing script with syntax highlighting; this is the same file
# that is passed as `code=` to the processing job later in the notebook.
!pygmentize preprocess-scikit-text-to-bert-feature-store.py

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmodel_selection[39;49;00m [34mimport[39;49;00m train_test_split
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mutils[39;49;00m [34mimport[39;49;00m resample
[34mimport[39;49;00m [04m[36mfunctools[39;49;00m
[34mimport[39;49;00m [04m[36mmultiprocessing[39;49;00m

[34mfrom[39;49;00m [04m[36mdatetime[39;49;00m [34mimport[39;49;00m datetime
[34mfrom[39;49;00m [04m[36mtime[39;49;00m [34mimport[39;49;00m gmtime, strftime, sleep

[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mre[39;49;00m
[34mimport[39;49;00m [04m[36mcollections[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mcsv[39;49;00m
[34mimport[39;49;00m [04m[36mglob[39;49;00m
[34mfrom[39;49;00m [04m[36mp

### Create the Experiment

In [8]:
import time
from smexperiments.experiment import Experiment

# Epoch seconds are appended so each run creates a distinctly named experiment.
timestamp = int(time.time())

experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=f"Ryanair-Reviews-BERT-Experiment-{timestamp}",
    description="Ryanair Reviews BERT Experiment",
)

# Keep the generated name; the trial and the experiment config reference it.
experiment_name = experiment.experiment_name
print(f"Experiment name: {experiment_name}")

Experiment name: Ryanair-Reviews-BERT-Experiment-1713653373


### Create the Trial

In [9]:
import time
from smexperiments.trial import Trial

# Fresh epoch-second suffix for the trial name.
timestamp = int(time.time())

trial = Trial.create(
    sagemaker_boto_client=sm,
    experiment_name=experiment_name,
    trial_name=f"trial-{timestamp}",
)

# Keep the generated name; the experiment config references it.
trial_name = trial.trial_name
print(f"Trial name: {trial_name}")

Trial name: trial-1713653373


### Create the Experiment Config

In [10]:
# Passed to processor.run() so the processing job is recorded under the
# experiment/trial created above with the display name "prepare".
experiment_config = dict(
    ExperimentName=experiment_name,
    TrialName=trial_name,
    TrialComponentDisplayName="prepare",
)

### Create Feature Store and Feature Group

In [11]:
# boto3 client for the Feature Store runtime API in the notebook's region.
featurestore_runtime = boto3.Session().client(
    "sagemaker-featurestore-runtime",
    region_name=region,
)

In [12]:
# New epoch-second suffix shared by the offline-store prefix and the feature
# group name defined in the next cell.
timestamp = int(time.time())

feature_store_offline_prefix = f"reviews-feature-store-{timestamp}"

print(feature_store_offline_prefix)

reviews-feature-store-1713653373


In [13]:
# Feature group name reuses the timestamp taken for the offline-store prefix.
feature_group_name = f"reviews-feature-group-{timestamp}"

print(feature_group_name)

reviews-feature-group-1713653373


In [14]:
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)

# Feature schema as (name, type) pairs, expanded into FeatureDefinition
# objects in the same order as listed.
feature_definitions = [
    FeatureDefinition(feature_name=name, feature_type=kind)
    for name, kind in (
        ("input_ids", FeatureTypeEnum.STRING),
        ("input_mask", FeatureTypeEnum.STRING),
        ("segment_ids", FeatureTypeEnum.STRING),
        ("label_id", FeatureTypeEnum.INTEGRAL),
        ("record_id", FeatureTypeEnum.STRING),
        ("date", FeatureTypeEnum.STRING),
        ("label", FeatureTypeEnum.INTEGRAL),
    )
]

In [15]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Local FeatureGroup handle built from the name and schema defined above.
feature_group = FeatureGroup(
    sagemaker_session=sess,
    name=feature_group_name,
    feature_definitions=feature_definitions,
)

print(feature_group)

FeatureGroup(name='reviews-feature-group-1713653373', sagemaker_session=<sagemaker.session.Session object at 0x7f8b88c2d050>, feature_definitions=[FeatureDefinition(feature_name='input_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='input_mask', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='segment_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>), FeatureDefinition(feature_name='record_id', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>)])


In [16]:
# Compute resources for the processing job.
processing_instance_type, processing_instance_count = "ml.m5.large", 2

# Train / validation / test fractions forwarded to the processing script.
(
    train_split_percentage,
    validation_split_percentage,
    test_split_percentage,
) = (0.50, 0.25, 0.25)

# Remaining script options (--balance-dataset and --max-seq-length).
balance_dataset = True
max_seq_length = 183

In [17]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn processing container sized by the parameters cell above;
# the job is capped at 7200 seconds of runtime.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    max_runtime_in_seconds=7200,
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [18]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Start the processing job without blocking (wait=False); a later cell
# re-attaches to it and waits for completion.
processor.run(
    code="preprocess-scikit-text-to-bert-feature-store.py",
    # NOTE(review): ShardedByS3Key distributes the objects under the input
    # prefix across instances, and the prefix listing shows a single CSV next
    # to a parquet/ folder -- confirm that two instances are actually useful.
    inputs=[
        ProcessingInput(
            input_name="raw-input-data",
            source=raw_input_data_s3_uri,
            destination="/opt/ml/processing/input/data/",
            s3_data_distribution_type="ShardedByS3Key",
        )
    ],
    # One output channel per dataset split, uploaded when the job ends.
    outputs=[
        ProcessingOutput(
            output_name="bert-{}".format(split),
            s3_upload_mode="EndOfJob",
            source="/opt/ml/processing/output/bert/{}".format(split),
        )
        for split in ("train", "validation", "test")
    ],
    # Flatten (flag, value) pairs into the flat string list given to the script.
    arguments=[
        token
        for flag, value in (
            ("--train-split-percentage", train_split_percentage),
            ("--validation-split-percentage", validation_split_percentage),
            ("--test-split-percentage", test_split_percentage),
            ("--max-seq-length", max_seq_length),
            ("--balance-dataset", balance_dataset),
            ("--feature-store-offline-prefix", feature_store_offline_prefix),
            ("--feature-group-name", feature_group_name),
        )
        for token in (flag, str(value))
    ],
    experiment_config=experiment_config,
    logs=True,
    wait=False,
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-04-20-22-49-33-640



Job Name:  sagemaker-scikit-learn-2024-04-20-22-49-33-640
Inputs:  [{'InputName': 'raw-input-data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-team4-bucket/ryanair-data/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/input/code/preprocess-scikit-text-to-bert-feature-store.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'bert-train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-train', 'LocalPath': '/opt/ml/processing/output/bert/train', 'S3UploadMode': '

In [19]:
# Name of the last job launched on this processor; the cells below use it to
# build console links and to re-attach to the running job.
scikit_processing_job_name = processor.jobs[-1].describe()["ProcessingJobName"]
print(scikit_processing_job_name)

sagemaker-scikit-learn-2024-04-20-22-49-33-640


In [20]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Render a link to this processing job in the SageMaker console.
# target="_blank" opens a new tab; the previous value "blank" named a window
# that every such link in the notebook would have reused.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(
            region, scikit_processing_job_name
        )
    )
)

In [21]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Render a link to the CloudWatch log streams whose prefix is this job's name.
# target="_blank" opens a new tab; the previous value "blank" named a window
# that every such link in the notebook would have reused.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, scikit_processing_job_name
        )
    )
)

In [22]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Render a link to the job's output folder in the S3 console.
# The ProcessingOutputs are declared without an explicit destination, and the
# job description reports their S3Uri under the SageMaker default bucket, not
# under the project `bucket` that holds the raw data. The link therefore uses
# sess.default_bucket(); with `bucket` it pointed at a non-existent prefix.
# target="_blank" opens a new tab ("blank" was a named window).
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Processing Job Has Completed</b>'.format(
            sess.default_bucket(), scikit_processing_job_name, region
        )
    )
)

### Monitor the processing job

In [23]:
# Re-attach to the job by name so it can be described and waited on from
# this notebook session.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sess,
    processing_job_name=scikit_processing_job_name,
)

# Snapshot of the job description at this point in time.
processing_job_description = running_processor.describe()

print(processing_job_description)

{'ProcessingInputs': [{'InputName': 'raw-input-data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-team4-bucket/ryanair-data/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/input/code/preprocess-scikit-text-to-bert-feature-store.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'bert-train', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-train', 'LocalPath': '/opt/ml/processing/output/bert/train', 'S3UploadMode': 'EndOfJob'}, 'AppManaged': False}, {'Out

In [24]:
# Block until the processing job ends; logs=True asks for the job logs to be
# shown in the cell output while waiting.
running_processor.wait(logs=True)

.

............................Collecting tensorflow==2.3.1
  Downloading tensorflow-2.3.1-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
Collecting tensorflow==2.3.1
  Downloading tensorflow-2.3.1-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 320.4/320.4 MB 2.7 MB/s eta 0:00:00
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 459.0/459.0 kB 42.7 MB/s eta 0:00:00
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorboard<3,>=2.3.0
  Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 89.2 MB/s eta 0:00:00
Collecting google-pasta>=0.1.8
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.5/57.5 kB 11.0 MB/s eta 0:00:00
Collecting h5py<2.11.0,>=2.10.0
  Downloading h5py-2.10.0-cp

In [25]:
# Refresh the description now that the job has finished.
processing_job_description = running_processor.describe()

output_config = processing_job_description["ProcessingOutputConfig"]

# Index the outputs by name. A missing output now raises KeyError on the
# lookup below, instead of leaving a variable undefined or silently keeping a
# stale value from an earlier run of this cell.
output_s3_uris = {
    output["OutputName"]: output["S3Output"]["S3Uri"] for output in output_config["Outputs"]
}

processed_train_data_s3_uri = output_s3_uris["bert-train"]
processed_validation_data_s3_uri = output_s3_uris["bert-validation"]
processed_test_data_s3_uri = output_s3_uris["bert-test"]

print(processed_train_data_s3_uri)
print(processed_validation_data_s3_uri)
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-train
s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-validation
s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-test


In [26]:
# List the files the job wrote for the training split.
!aws s3 ls {processed_train_data_s3_uri}/

2024-04-20 23:03:06     225645 part-algo-2-ryanair_reviews.tfrecord


In [27]:
# List the files the job wrote for the validation split.
!aws s3 ls {processed_validation_data_s3_uri}/

2024-04-20 23:03:06     112058 part-algo-2-ryanair_reviews.tfrecord


In [28]:
# List the files the job wrote for the test split.
!aws s3 ls {processed_test_data_s3_uri}/

2024-04-20 23:03:06     112106 part-algo-2-ryanair_reviews.tfrecord


In [29]:
from sagemaker.analytics import ExperimentAnalytics

import pandas as pd

# Use the full option key. The short form "max_colwidth" only works through
# pandas' partial-name matching, which raises once another option name matches
# the same pattern.
pd.set_option("display.max_colwidth", 500)

# Trial components of this experiment, newest first.
experiment_analytics = ExperimentAnalytics(
    sagemaker_session=sess, experiment_name=experiment_name, sort_by="CreationTime", sort_order="Descending"
)

experiment_analytics_df = experiment_analytics.dataframe()
experiment_analytics_df

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,SageMaker.InstanceCount,SageMaker.InstanceType,SageMaker.VolumeSizeInGB,SageMaker.ImageUri - MediaType,SageMaker.ImageUri - Value,code - MediaType,code - Value,raw-input-data - MediaType,raw-input-data - Value,bert-test - MediaType,bert-test - Value,bert-train - MediaType,bert-train - Value,bert-validation - MediaType,bert-validation - Value,Trials,Experiments
0,sagemaker-scikit-learn-2024-04-20-22-49-33-640-aws-processing-job,prepare,arn:aws:sagemaker:us-east-1:211125778552:processing-job/sagemaker-scikit-learn-2024-04-20-22-49-33-640,2.0,ml.m5.large,30.0,,683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3,,s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/input/code/preprocess-scikit-text-to-bert-feature-store.py,,s3://sagemaker-team4-bucket/ryanair-data/,,s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-test,,s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-train,,s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-20-22-49-33-640/output/bert-validation,[trial-1713653373],[Ryanair-Reviews-BERT-Experiment-1713653373]


In [30]:
# First row of the analytics frame, which was requested sorted by
# CreationTime in descending order.
trial_component_name = experiment_analytics_df["TrialComponentName"][0]
print(trial_component_name)

sagemaker-scikit-learn-2024-04-20-22-49-33-640-aws-processing-job


In [31]:
# Look up the trial component by name through the SageMaker client; the bare
# last expression shows the response as the cell output.
trial_component_description = sm.describe_trial_component(TrialComponentName=trial_component_name)
trial_component_description

{'TrialComponentName': 'sagemaker-scikit-learn-2024-04-20-22-49-33-640-aws-processing-job',
 'TrialComponentArn': 'arn:aws:sagemaker:us-east-1:211125778552:experiment-trial-component/sagemaker-scikit-learn-2024-04-20-22-49-33-640-aws-processing-job',
 'DisplayName': 'prepare',
 'Source': {'SourceArn': 'arn:aws:sagemaker:us-east-1:211125778552:processing-job/sagemaker-scikit-learn-2024-04-20-22-49-33-640',
  'SourceType': 'SageMakerProcessingJob'},
 'Status': {'PrimaryStatus': 'Completed',
  'Message': 'Status: Completed, exit message: null, failure reason: null'},
 'StartTime': datetime.datetime(2024, 4, 20, 22, 54, 6, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2024, 4, 20, 23, 3, 11, tzinfo=tzlocal()),
 'CreationTime': datetime.datetime(2024, 4, 20, 22, 49, 35, 58000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:211125778552:user-profile/d-k0wihxzgpgsi/default-user',
  'UserProfileName': 'default-user',
  'DomainId': 'd-k0wihxzgpgsi'},
 'LastM

In [32]:
from sagemaker.lineage.visualizer import LineageTableVisualizer

# Build the lineage table for the processing job and show it as the cell
# output (bare last expression).
lineage_table_viz = LineageTableVisualizer(sess)
lineage_table_viz_df = lineage_table_viz.show(processing_job_name=scikit_processing_job_name)
lineage_table_viz_df

Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...ess-scikit-text-to-bert-feature-store.py,Input,DataSet,ContributedTo,artifact
1,s3://sagemaker-team4-bucket/ryanair-data/,Input,DataSet,ContributedTo,artifact
2,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...2024-04-20-22-49-33-640/output/bert-test,Output,DataSet,Produced,artifact
4,s3://...4-20-22-49-33-640/output/bert-validation,Output,DataSet,Produced,artifact
5,s3://...024-04-20-22-49-33-640/output/bert-train,Output,DataSet,Produced,artifact


In [33]:
# Persist the values that other notebooks need, using IPython's %store magic
# (they can be restored in another kernel with `%store -r`).
%store processed_train_data_s3_uri
%store processed_validation_data_s3_uri
%store processed_test_data_s3_uri
%store max_seq_length
%store experiment_name
%store trial_name

Stored 'processed_train_data_s3_uri' (str)
Stored 'processed_validation_data_s3_uri' (str)
Stored 'processed_test_data_s3_uri' (str)
Stored 'max_seq_length' (int)
Stored 'experiment_name' (str)
Stored 'trial_name' (str)
