In [1]:
import sagemaker
from sagemaker.local import LocalSession
from sagemaker import get_execution_role
from sagemaker.session import Session
import boto3
from sagemaker.sklearn.estimator import SKLearn

In [2]:
# Sessions: the regular SageMaker session plus a LocalSession instance.
sagemaker_session = sagemaker.Session()
local_session = LocalSession()

# S3 location (the session's default bucket + a fixed key prefix) and the
# execution role used by the jobs launched later in the notebook.
prefix = 'chapter9/data'
bucket = sagemaker_session.default_bucket()
role = get_execution_role()

# Echo the resolved configuration so it is visible in the cell output.
print(
    'Training input/output will be stored in {}/{}'.format(bucket, prefix),
    '\nIAM Role: {}'.format(role),
    sep='\n',
)

Training input/output will be stored in sagemaker-us-east-1-485822383573/chapter9/data

IAM Role: arn:aws:iam::485822383573:role/service-role/AmazonSageMaker-ExecutionRole-20220426T122295


In [32]:
%%writefile scripts/preprocessing.py
import csv
import wget
import zipfile
import os
import pandas as pd
import boto3
import time
import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str)
parser.add_argument('--region',type=str)
args = parser.parse_args()

bucket=args.bucket
cm = boto3.client('comprehendmedical',region_name=args.region)
s3_client = boto3.client('s3',region_name=args.region)

if os.path.exists('data')==False:
    os.mkdir('data')

file_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip'
dest_file = 'data/drugsCom_raw.zip'

print("Downloading source files...")

wget.download(file_url, dest_file)

with zipfile.ZipFile('data/drugsCom_raw.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

os.remove('data/drugsCom_raw.zip')

orig_list = list()
for filename in os.listdir('data'):
    with open('data/'+filename) as csvfile:
        myreader = csv.reader(csvfile, delimiter='\t')
        for row in myreader:
            if row[0] == '':
                continue
            else:
                orig_list.append({
                    'id': row[0],
                    'drugName': row[1],
                    'condition': row[2],
                    'review': row[3]
                })

    
if os.path.exists('processed_data')==False:
    os.mkdir('processed_data')
    
raw_df=pd.DataFrame.from_records(orig_list)
raw_df.to_csv('processed_data/raw_df.csv', index=False)

print("\nRaw data processed from input files")
print("\nRamdomly sampling 100 rows for topic extraction")

df_sample=raw_df.sample(n=100)
sample_list = list()


for index,row in df_sample.iterrows():
    entities = cm.detect_entities(Text=row['review'])
    topic_list = []
    for entity in entities['Entities']:
        if entity['Category'] == 'MEDICAL_CONDITION':
            topic_list.append(entity['Text'])

    sample_list.append({
            'id': row['id'],
            'drugName': row['drugName'],
            'condition': row['condition'],
            'review': row['review'],
            'topics': topic_list[:5]
        })
        
sample_df=pd.DataFrame.from_records(sample_list)
sample_df.to_csv('processed_data/sample_df.csv', index=False) 

s3_client.upload_file('processed_data/sample_df.csv', bucket, 'chapter9/data/sample_df.csv')
s3_client.upload_file('processed_data/raw_df.csv', bucket, 'chapter9/data/raw_df.csv')

print("\nprocessed files uploaded to s3")

Overwriting scripts/preprocessing.py


In [33]:
#!pip install wget

In [34]:
#!python scripts/preprocessing.py --bucket $bucket --region $region


Downloading source files...
100% [....................................................] 42989872 / 42989872^C
Traceback (most recent call last):
  File "scripts/preprocessing.py", line 40, in <module>
    if row[0] == '':
KeyboardInterrupt


In [6]:
%%sh

# Build the preprocessing image from scripts/Dockerfile and push it to the
# account's ECR registry as <account>.dkr.ecr.<region>.amazonaws.com/sagemaker-preprocessing:latest.

# Stop at the first failing command so a broken build is never tagged/pushed.
set -e

docker_name=sagemaker-preprocessing
account=$(aws sts get-caller-identity --query Account --output text)
echo "$account"

# `aws configure get` exits non-zero when no region is configured; fall back
# to AWS_DEFAULT_REGION and fail with a clear message if neither is set.
region=$(aws configure get region || true)
region=${region:-${AWS_DEFAULT_REGION:-}}
if [ -z "${region}" ]
then
    echo "No AWS region configured (aws configure / AWS_DEFAULT_REGION)." >&2
    exit 1
fi

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${docker_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${docker_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${docker_name}" > /dev/null
fi

# Log Docker in to ECR. `aws ecr get-login` is deprecated and absent from
# AWS CLI v2; piping the password via stdin also keeps it off the command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

docker build -t "${docker_name}" -f scripts/Dockerfile .
docker tag "${docker_name}" "${fullname}"
docker push "${fullname}"

485822383573
Login Succeeded

Step 1/7 : FROM python:3.7-slim-buster
 ---> 8fe6e55c0412
Step 2/7 : RUN pip install pandas
 ---> Using cache
 ---> ed3c2aadaa6e
Step 3/7 : RUN pip install wget
 ---> Using cache
 ---> 93dc76d1c100
Step 4/7 : RUN pip install boto3
 ---> Running in 89ac543bf741
Collecting boto3
  Downloading boto3-1.24.44-py3-none-any.whl (132 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.5/132.5 KB 16.6 MB/s eta 0:00:00
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.6/79.6 KB 14.2 MB/s eta 0:00:00
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.28.0,>=1.27.44
  Downloading botocore-1.27.44-py3-none-any.whl (9.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.0/9.0 MB 63.5 MB/s eta 0:00:00
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [17]:
from sagemaker.processing import ScriptProcessor

# Rebuild the ECR image URI from the caller's account id, the session's region
# and the repository name, then show it for verification.
docker_name = "sagemaker-preprocessing"
boto_session = sagemaker_session.boto_session
account = boto_session.client("sts").get_caller_identity()["Account"]
region = boto_session.region_name
image = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account, region, docker_name)
print(image)

# Processor that runs a submitted script with `python3` inside that image.
script_processor = ScriptProcessor(
    command=['python3'],
    image_uri=image,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

485822383573.dkr.ecr.us-east-1.amazonaws.com/sagemaker-preprocessing:latest


In [35]:
# Submit scripts/preprocessing.py through the processor, forwarding the S3
# bucket and AWS region as its command-line arguments.
preprocessing_arguments = ["--bucket", bucket, '--region', region]
script_processor.run(
    code='scripts/preprocessing.py',
    arguments=preprocessing_arguments,
)


Job Name:  sagemaker-preprocessing-2022-08-03-02-04-50-081
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-485822383573/sagemaker-preprocessing-2022-08-03-02-04-50-081/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
.........................[34mDownloading source files...[0m
[34mRaw data processed from input files[0m
[34mRamdomly sampling 100 rows for topic extraction[0m
[34mprocessed files uploaded to s3[0m



In [36]:
%%writefile scripts/train.py

import argparse
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    
    # Hyperparameters are described here. In this simple example we are just including one hyperparameter.
    parser.add_argument('--n_clusters', type=int, default=2)
    parser.add_argument('--random_state', type=int, default=0)
    
    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])

    args = parser.parse_args()
    
    input_files = [ os.path.join(args.training, file) for file in os.listdir(args.training) ]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.training, "train"))
    
    raw_data = [ pd.read_csv(file) for file in input_files ]
    train_data = pd.concat(raw_data)
    print(train_data.shape)
    
    raw_sentences=train_data.pop('topics')
    corpus = raw_sentences.values.tolist()
    vectorizer = TfidfVectorizer()
    vecs = vectorizer.fit_transform(corpus)
    normalizer = Normalizer(copy=False)
    normalized_data = normalizer.fit_transform(vecs)
    print(normalized_data.shape)
    kmeans = KMeans(n_clusters=2,random_state=0).fit(normalized_data)
    
    
    
    # Print the coefficients of the trained classifier, and save the coefficients
    joblib.dump(kmeans, os.path.join(args.model_dir, "kmeansmodel.joblib"))
    
def model_fn(model_dir):
    """Deserialized and return fitted model

    Note that this should have the same name as the serialized model in the main method
    """
    kmeans = joblib.load(os.path.join(model_dir, "kmeansmodel.joblib"))
    return kmeans

Overwriting scripts/train.py


In [37]:
# Estimator that runs scripts/train.py with the scikit-learn 0.20.0 framework
# and passes the clustering hyperparameters through to the script.
training_hyperparameters = {'n_clusters': 2, 'random_state': 0}

sklearn = SKLearn(
    entry_point='train.py',
    source_dir='scripts',
    framework_version='0.20.0',
    instance_type="ml.m4.xlarge",
    role=get_execution_role(),
    sagemaker_session=sagemaker_session,
    hyperparameters=training_hyperparameters,
)

In [None]:
# Train using the sampled-reviews CSV, exposed to the script as the
# 'training' channel.
training_input_uri = 's3://{}/{}/sample_df.csv'.format(bucket, prefix)
sklearn.fit({'training': training_input_uri})

2022-08-03 02:11:01 Starting - Starting the training job...ProfilerReport-1659492661: InProgress
...
2022-08-03 02:11:45 Starting - Preparing the instances for training........