## 1. Convert text to BERT Features

In [25]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session and the IAM role this notebook executes under.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Project bucket that holds the raw review data.
bucket = "team-4-project-data"

# Region and account id are resolved from the ambient AWS configuration.
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Low-level SageMaker client, handed to the Experiment/Trial helpers later on.
sm = boto3.Session().client("sagemaker", region_name=region)

In [26]:
# List every S3 bucket visible to the current credentials.
# NOTE(review): the saved output exposes account-specific bucket names; consider clearing it before sharing.
!aws s3 ls

2024-02-18 20:23:31 211125778552personalizepocvod
2024-02-18 01:27:18 aws-athena-query-results-211125778552-us-east-1
2024-03-27 00:35:44 aws-glue-assets-211125778552-us-east-1
2024-03-27 00:28:12 aws-glue-assets-211125778552-us-east-2
2024-02-15 21:19:44 sagemaker-studio-12jvao34qlkn
2024-02-15 22:38:05 sagemaker-studio-211125778552-3pjkfc2ijfr
2024-02-17 02:02:09 sagemaker-studio-211125778552-4dcj21sopi3
2024-02-19 03:02:29 sagemaker-studio-211125778552-4rfwbx1bibn
2024-02-15 20:23:46 sagemaker-studio-211125778552-4yhhjbuzjdq
2024-02-17 02:02:35 sagemaker-studio-211125778552-8xxlet4bnrv
2024-02-17 02:02:08 sagemaker-studio-211125778552-rfcwvtinree
2024-02-20 00:38:45 sagemaker-studio-211125778552-yu1t8p5304s
2024-03-21 19:20:47 sagemaker-studio-uyd2sz3oy3
2024-03-08 03:11:27 sagemaker-team11-stanford-dogs
2024-03-18 01:49:41 sagemaker-team6-distracted-drivers
2024-02-15 20:23:48 sagemaker-us-east-1-211125778552
2024-03-03 20:33:04 team-3-project-data
2024-04-12 00:29:36 team-4-projec

In [27]:
# List the top-level prefixes of the project bucket.
!aws s3 ls team-4-project-data

                           PRE athena/
                           PRE ryanair-data/


In [28]:
# List the raw-data prefix; this same prefix is used as the processing job input further down.
!aws s3 ls s3://team-4-project-data/ryanair-data/

                           PRE parquet/
2024-04-12 00:29:37    1908135 ryanair_reviews.csv


In [29]:
# S3 prefix holding the raw review data inside the project bucket.
raw_input_data_s3_uri = f"s3://{bucket}/ryanair-data/"
print(raw_input_data_s3_uri)

s3://team-4-project-data/ryanair-data/


In [30]:
# Athena staging location inside the project bucket.
raw_input_data_athena_uri = f"s3://{bucket}/athena/staging"
print(raw_input_data_athena_uri)

s3://team-4-project-data/athena/staging


In [31]:
# Print the preprocessing script with syntax highlighting; the same file is submitted
# as the processing job's `code` further down.
!pygmentize preprocess-scikit-text-to-bert-feature-store.py

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmodel_selection[39;49;00m [34mimport[39;49;00m train_test_split
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mutils[39;49;00m [34mimport[39;49;00m resample
[34mimport[39;49;00m [04m[36mfunctools[39;49;00m
[34mimport[39;49;00m [04m[36mmultiprocessing[39;49;00m

[34mfrom[39;49;00m [04m[36mdatetime[39;49;00m [34mimport[39;49;00m datetime
[34mfrom[39;49;00m [04m[36mtime[39;49;00m [34mimport[39;49;00m gmtime, strftime, sleep

[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mre[39;49;00m
[34mimport[39;49;00m [04m[36mcollections[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mcsv[39;49;00m
[34mimport[39;49;00m [04m[36mglob[39;49;00m
[34mfrom[39;49;00m [04m[36mp

### Create the Experiment

In [32]:
import time
from smexperiments.experiment import Experiment

# Epoch-second suffix, presumably to keep the experiment name unique across runs.
timestamp = int(time.time())

# Register a new experiment through the low-level SageMaker client.
experiment = Experiment.create(
    experiment_name=f"Ryanair-Reviews-BERT-Experiment-{timestamp}",
    description="Ryanair Reviews BERT Experiment",
    sagemaker_boto_client=sm,
)

experiment_name = experiment.experiment_name
print(f"Experiment name: {experiment_name}")

Experiment name: Ryanair-Reviews-BERT-Experiment-1712883165


### Create the Trial

In [33]:
import time
from smexperiments.trial import Trial

# Epoch-second suffix, presumably to keep the trial name unique across runs.
timestamp = int(time.time())

# Create a trial attached to the experiment created earlier.
trial = Trial.create(
    trial_name=f"trial-{timestamp}",
    experiment_name=experiment_name,
    sagemaker_boto_client=sm,
)

trial_name = trial.trial_name
print(f"Trial name: {trial_name}")

Trial name: trial-1712883165


### Create the Experiment Config

In [34]:
# Ties the processing job to the experiment and trial; the job is shown as the
# "prepare" trial component.
experiment_config = dict(
    ExperimentName=experiment_name,
    TrialName=trial_name,
    TrialComponentDisplayName="prepare",
)

### Create Feature Store and Feature Group

In [35]:
# Runtime client for the SageMaker Feature Store, in the same region as the rest of the notebook.
featurestore_runtime = boto3.Session().client(
    "sagemaker-featurestore-runtime",
    region_name=region,
)

In [36]:
# Fresh epoch-second stamp; the feature group name in the next cell reuses it.
timestamp = int(time.time())

# Prefix for the offline feature store, suffixed with the stamp.
feature_store_offline_prefix = f"reviews-feature-store-{timestamp}"

print(feature_store_offline_prefix)

reviews-feature-store-1712883165


In [37]:
# Feature group name, sharing the timestamp used for the offline store prefix.
feature_group_name = f"reviews-feature-group-{timestamp}"

print(feature_group_name)

reviews-feature-group-1712883165


In [38]:
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)

# (feature name, feature type) pairs, in the order they are registered.
# The raw 'review' text feature is currently left out of the feature group.
_feature_schema = [
    ("input_ids", FeatureTypeEnum.STRING),
    ("input_mask", FeatureTypeEnum.STRING),
    ("segment_ids", FeatureTypeEnum.STRING),
    ("label_id", FeatureTypeEnum.INTEGRAL),
    ("record_id", FeatureTypeEnum.STRING),
    ("date", FeatureTypeEnum.STRING),
    ("label", FeatureTypeEnum.INTEGRAL),
]

feature_definitions = [
    FeatureDefinition(feature_name=name, feature_type=kind) for name, kind in _feature_schema
]

In [39]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Local handle for the feature group, built from the name and definitions prepared earlier.
feature_group = FeatureGroup(
    name=feature_group_name,
    feature_definitions=feature_definitions,
    sagemaker_session=sess,
)

print(feature_group)

FeatureGroup(name='reviews-feature-group-1712883165', sagemaker_session=<sagemaker.session.Session object at 0x7f88c1b79b10>, feature_definitions=[FeatureDefinition(feature_name='input_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='input_mask', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='segment_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>), FeatureDefinition(feature_name='record_id', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='label', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>)])


In [40]:
# Compute resources for the processing job.
processing_instance_type = "ml.m5.large"
processing_instance_count = 2

# Fractions of the dataset assigned to each split; they must add up to 1.
train_split_percentage = 0.50
validation_split_percentage = 0.25
test_split_percentage = 0.25

# Forwarded to the preprocessing script as --balance-dataset and --max-seq-length.
balance_dataset = True
max_seq_length = 64

# Catch a mistyped split here, before it is shipped to a remote job.
_split_total = train_split_percentage + validation_split_percentage + test_split_percentage
if abs(_split_total - 1.0) > 1e-9:
    raise ValueError(
        "train/validation/test split percentages must sum to 1.0, got {}".format(_split_total)
    )

In [41]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Scikit-learn processor that will run the preprocessing script.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    max_runtime_in_seconds=7200,  # two-hour cap
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [42]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Raw review files from S3, made available to the container under the given local path.
# NOTE(review): with "ShardedByS3Key" and more than one instance, the objects under the
# prefix are presumably divided between instances rather than copied to each one;
# confirm this is intended for this input prefix.
raw_input = ProcessingInput(
    input_name="raw-input-data",
    source=raw_input_data_s3_uri,
    destination="/opt/ml/processing/input/data/",
    s3_data_distribution_type="ShardedByS3Key",
)

# One output per dataset split, each uploaded when the job ends.
split_outputs = [
    ProcessingOutput(
        output_name=f"bert-{split}",
        s3_upload_mode="EndOfJob",
        source=f"/opt/ml/processing/output/bert/{split}",
    )
    for split in ("train", "validation", "test")
]

# Command-line options for the script; every value is passed as a string.
script_options = {
    "--train-split-percentage": train_split_percentage,
    "--validation-split-percentage": validation_split_percentage,
    "--test-split-percentage": test_split_percentage,
    "--max-seq-length": max_seq_length,
    "--balance-dataset": balance_dataset,
    "--feature-store-offline-prefix": feature_store_offline_prefix,
    "--feature-group-name": feature_group_name,
}
script_arguments = []
for flag, value in script_options.items():
    script_arguments += [flag, str(value)]

# Submit the job without blocking (wait=False); a later cell attaches to it and waits.
processor.run(
    code="preprocess-scikit-text-to-bert-feature-store.py",
    inputs=[raw_input],
    outputs=split_outputs,
    arguments=script_arguments,
    experiment_config=experiment_config,
    logs=True,
    wait=False,
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-04-12-00-52-45-830



Job Name:  sagemaker-scikit-learn-2024-04-12-00-52-45-830
Inputs:  [{'InputName': 'raw-input-data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://team-4-project-data/ryanair-data/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/input/code/preprocess-scikit-text-to-bert-feature-store.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'bert-train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/output/bert-train', 'LocalPath': '/opt/ml/processing/output/bert/train', 'S3UploadMode': 'End

In [43]:
# The most recently submitted job already carries its name, so no
# DescribeProcessingJob round-trip is needed just to read it back.
scikit_processing_job_name = processor.jobs[-1].job_name
print(scikit_processing_job_name)

sagemaker-scikit-learn-2024-04-12-00-52-45-830


In [44]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to this processing job's page in the SageMaker console.
# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(
            region, scikit_processing_job_name
        )
    )
)

In [45]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Link to the CloudWatch log streams whose names start with this job's name.
# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, scikit_processing_job_name
        )
    )
)

In [46]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

# The job's outputs were declared without an explicit destination, so they are
# uploaded under the SageMaker session's default bucket, not the project `bucket`.
# The link has to point there to resolve.
output_bucket = sess.default_bucket()

# target="_blank" opens the link in a new tab.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Processing Job Has Completed</b>'.format(
            output_bucket, scikit_processing_job_name, region
        )
    )
)

### Monitor the processing job

In [47]:
# Re-attach to the submitted job by name so it can be described and waited on.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sess,
    processing_job_name=scikit_processing_job_name,
)

# Snapshot of the job's current description.
processing_job_description = running_processor.describe()
print(processing_job_description)

{'ProcessingInputs': [{'InputName': 'raw-input-data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://team-4-project-data/ryanair-data/', 'LocalPath': '/opt/ml/processing/input/data/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/input/code/preprocess-scikit-text-to-bert-feature-store.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'bert-train', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/output/bert-train', 'LocalPath': '/opt/ml/processing/output/bert/train', 'S3UploadMode': 'EndOfJob'}, 'AppManaged': False}, {'Output

In [48]:
# Block until the processing job finishes; logs=True streams the job's logs into the
# cell output (this can be very long — consider clearing the output before sharing).
running_processor.wait(logs=True)

.............................Collecting tensorflow==2.3.1
  Downloading tensorflow-2.3.1-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
Collecting tensorflow==2.3.1
  Downloading tensorflow-2.3.1-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 320.4/320.4 MB 2.7 MB/s eta 0:00:00
Collecting h5py<2.11.0,>=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 70.4 MB/s eta 0:00:00
Collecting astunparse==1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting absl-py>=0.7.0
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.7/133.7 kB 18.3 MB/s eta 0:00:00
Collecting tensorboard<3,>=2.3.0
  Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 80.5 MB/s eta 0:00:00
Collecting grpcio>=1.8.6
  Downloading grpcio-1.62.1-cp37-cp37m-manylinux_2_

In [49]:
# Refresh the description now that the job has been waited on.
processing_job_description = running_processor.describe()

# Index the job's outputs by name so each split's S3 location is a direct lookup.
output_config = processing_job_description["ProcessingOutputConfig"]
output_s3_uris = {
    output["OutputName"]: output["S3Output"]["S3Uri"] for output in output_config["Outputs"]
}

# A missing output raises a KeyError naming it right here, instead of leaving the
# variable undefined (or stale from an earlier run in the same kernel).
processed_train_data_s3_uri = output_s3_uris["bert-train"]
processed_validation_data_s3_uri = output_s3_uris["bert-validation"]
processed_test_data_s3_uri = output_s3_uris["bert-test"]

print(processed_train_data_s3_uri)
print(processed_validation_data_s3_uri)
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/output/bert-train
s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/output/bert-validation
s3://sagemaker-us-east-1-211125778552/sagemaker-scikit-learn-2024-04-12-00-52-45-830/output/bert-test


In [50]:
# List the objects written under the train output location.
!aws s3 ls $processed_train_data_s3_uri/

2024-04-12 01:06:26     101848 part-algo-2-ryanair_reviews.tfrecord


In [51]:
# List the objects written under the validation output location.
!aws s3 ls $processed_validation_data_s3_uri/

2024-04-12 01:06:26      50608 part-algo-2-ryanair_reviews.tfrecord


In [52]:
# List the objects written under the test output location.
!aws s3 ls $processed_test_data_s3_uri/

2024-04-12 01:06:26      51071 part-algo-2-ryanair_reviews.tfrecord
