In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

# S3 bucket and key prefix under which this notebook keeps its data and
# model artifacts.
bucket = "mastering-ml-aws"
prefix = "chapter2/blazingtext"

# SageMaker session plus the execution role handed to the estimator later on.
sess = sagemaker.Session()
role = get_execution_role()


In [2]:
from os.path import expanduser

from sklearn.model_selection import train_test_split

SRC_PATH = expanduser("~") + '/SageMaker/mastering-ml-on-aws/chapter2/'


def _read_labeled_tweets(file_name, label_prefix):
    """Read one tweet per line from SRC_PATH + file_name and prepend label_prefix.

    Blank (whitespace-only) lines are skipped so that no label-only sample
    without any text ends up in the corpus.
    """
    # The tweets contain non-ASCII characters (curly quotes, ellipses), so the
    # encoding is stated explicitly instead of relying on the platform default.
    with open(SRC_PATH + file_name, 'r', encoding='utf-8') as file:
        return [label_prefix + line.strip('\n') for line in file if line.strip()]


# "__label__<N> " is the per-line label marker used throughout this notebook:
# 0 for the lines of dem.txt, 1 for the lines of gop.txt.
dem_text = _read_labeled_tweets('dem.txt', "__label__0 ")
gop_text = _read_labeled_tweets('gop.txt', "__label__1 ")

corpus = dem_text + gop_text

# Fixed random_state keeps the 75/25 split reproducible across runs.
corpus_train, corpus_test = train_test_split(corpus, test_size=0.25, random_state=42)

In [3]:
# Serialise each split as one labelled tweet per line.
corpus_train_txt = "\n".join(corpus_train)
corpus_test_txt = "\n".join(corpus_test)

# Write with an explicit UTF-8 encoding: the tweets contain non-ASCII
# characters, and the platform default encoding is not guaranteed to be able
# to represent them (or to be UTF-8 at all).
with open('tweets.train', 'w', encoding='utf-8') as file:
    file.write(corpus_train_txt)
with open('tweets.test', 'w', encoding='utf-8') as file:
    file.write(corpus_test_txt)


In [6]:
# Sanity check: show the first 300 characters of the training text to confirm
# the "<label> <tweet>" line layout.
preview = corpus_train_txt[:300]
print(preview)

__label__1 “We are forever grateful for your service.” -@FLOTUS https://t.co/22vFTZguAQ
__label__0 RT @CecileRichards: When your strategy relies on fewer people being able to vote, you’re on the wrong side of history. https://t.co/ncthe2W…
__label__0 RT @AFLCIO: Scott Walker. Forever a national disg


In [7]:

# Key prefixes for the two data channels, and the matching S3 URIs.
train_path = prefix + '/train'
validation_path = prefix + '/validation'
s3_train_data = 's3://{}/{}'.format(bucket, train_path)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_path)

# Upload the local files written earlier; the held-out test split is used as
# the validation channel.
sess.upload_data(
    path='tweets.train',
    bucket=bucket,
    key_prefix=train_path,
)
sess.upload_data(
    path='tweets.test',
    bucket=bucket,
    key_prefix=validation_path,
)

In [8]:
# Resolve the BlazingText image for the region the session actually runs in,
# rather than a hard-coded 'us-east-1', so the notebook also works when it is
# executed in another region.
container = sagemaker.amazon.amazon_estimator.get_image_uri(sess.boto_region_name, "blazingtext", "latest")

# Output location passed to the estimator as its output_path.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)


In [10]:
# Training-job settings for the BlazingText estimator, gathered in one place so
# they are easy to scan and tweak.
# NOTE(review): train_volume_size is presumably GB and train_max_run seconds -
# confirm against the SDK documentation.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

bt_model = sagemaker.estimator.Estimator(container, role, **estimator_settings)

In [11]:
# Hyperparameters for supervised (text classification) mode.
# NOTE(review): patience / min_epochs presumably only take effect when
# early_stopping is enabled - confirm against the BlazingText documentation.
hyperparameters = {
    'mode': "supervised",
    'epochs': 10,
    'min_count': 3,
    'learning_rate': 0.05,
    'early_stopping': False,
    'patience': 5,
    'min_epochs': 5,
    'word_ngrams': 2,
}
bt_model.set_hyperparameters(**hyperparameters)


def _plain_text_channel(s3_uri):
    """Build a fully replicated, plain-text S3-prefix input for the given URI."""
    return sagemaker.session.s3_input(
        s3_uri,
        distribution='FullyReplicated',
        content_type='text/plain',
        s3_data_type='S3Prefix',
    )


train_data = _plain_text_channel(s3_train_data)
validation_data = _plain_text_channel(s3_validation_data)
data_channels = {'train': train_data, 'validation': validation_data}

In [12]:
# Launch the training job on both channels and stream its logs into the cell.
bt_model.fit(
    inputs=data_channels,
    logs=True,
)


INFO:sagemaker:Creating training-job with name: blazingtext-2019-01-04-01-42-36-379


2019-01-04 01:42:36 Starting - Starting the training job...
2019-01-04 01:42:38 Starting - Launching requested ML instances......
2019-01-04 01:43:51 Starting - Preparing the instances for training......
2019-01-04 01:45:07 Downloading - Downloading input data
2019-01-04 01:45:07 Training - Downloading the training image..
[31mArguments: train[0m
[31m[01/04/2019 01:45:09 INFO 139833108178752] nvidia-smi took: 0.0251860618591 secs to identify 0 gpus[0m
[31m[01/04/2019 01:45:09 INFO 139833108178752] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[01/04/2019 01:45:09 INFO 139833108178752] Processing /opt/ml/input/data/train/tweets.train . File size: 0 MB[0m
[31m[01/04/2019 01:45:09 INFO 139833108178752] Processing /opt/ml/input/data/validation/tweets.test . File size: 0 MB[0m
[31mRead 0M words[0m
[31mNumber of words:  407[0m
[31m##### Alpha: -0.0002  Progress: 100.32%  Million Words/sec: 0.72 #####[0m
[31m##### Alpha: 0.0000  Progress: 100.0

In [13]:
# Host the trained model behind a real-time endpoint.
# NOTE(review): no cell that deletes this endpoint is visible in this
# notebook - make sure it is removed once the analysis is done.
predictor = bt_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)


INFO:sagemaker:Creating model with name: blazingtext-2019-01-04-01-48-23-529
INFO:sagemaker:Creating endpoint with name blazingtext-2019-01-04-01-42-36-379


---------------------------------------------------------------!

In [14]:
!aws s3 ls --recursive s3://mastering-ml-aws/chapter2/blazingtext

2018-12-20 22:47:03        115 chapter2/blazingtext/output/blazingtext-2018-12-20-22-43-49-923/output/model.tar.gz
2018-12-20 22:59:25        114 chapter2/blazingtext/output/blazingtext-2018-12-20-22-57-14-011/output/model.tar.gz
2018-12-20 23:08:03   73240432 chapter2/blazingtext/output/blazingtext-2018-12-20-23-04-29-407/output/model.tar.gz
2018-12-22 14:17:01        116 chapter2/blazingtext/output/blazingtext-2018-12-22-14-14-36-622/output/model.tar.gz
2018-12-22 14:28:17        115 chapter2/blazingtext/output/blazingtext-2018-12-22-14-25-02-622/output/model.tar.gz
2018-12-22 14:31:29   73325773 chapter2/blazingtext/output/blazingtext-2018-12-22-14-28-57-130/output/model.tar.gz
2018-12-22 14:53:52   73325938 chapter2/blazingtext/output/blazingtext-2018-12-22-14-51-28-246/output/model.tar.gz
2018-12-22 15:28:20   73226793 chapter2/blazingtext/output/blazingtext-2018-12-22-15-26-04-556/output/model.tar.gz
2018-12-22 15:43:11   73226915 chapter2/blazingtext/output/blazingtext-2

In [15]:
# Every test line starts with an 11-character "__label__N " marker; drop it so
# that only the tweet text is sent to the endpoint.
label_prefix_len = len("__label__0 ")
corpus_test_no_labels = [labeled_tweet[label_prefix_len:] for labeled_tweet in corpus_test]

# The whole test set is sent as a single JSON request.
payload = {"instances": corpus_test_no_labels}
request_body = json.dumps(payload)
response = predictor.predict(request_body)

# Decode the JSON response and pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.5000365972518921
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000544786453247
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000317096710205
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000205636024475
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000355839729309
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000495910644531
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5000455975532532
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.500016450881958
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5000253915786743
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5000267624855042
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.500026524066925
    ],


In [16]:
# Keep only the first entry of each prediction's "label" list.
predicted_labels = [item['label'][0] for item in predictions]


In [17]:
# Display the first four predicted labels as a quick sanity check.
predicted_labels[:4]

['__label__1', '__label__1', '__label__1', '__label__1']

In [18]:
# The ground-truth label is the first 10 characters ("__label__N") of each
# labelled test line.
label_len = len("__label__0")
actual_labels = [labeled_tweet[:label_len] for labeled_tweet in corpus_test]
actual_labels[:4]

['__label__1', '__label__1', '__label__0', '__label__1']

In [19]:
# Pair each actual label with the prediction at the same position and record
# whether they agree.
label_pairs = zip(actual_labels, predicted_labels)
matches = [actual == predicted for actual, predicted in label_pairs]
matches[:4]

[True, True, False, True]

In [20]:
# Accuracy: fraction of test tweets whose predicted label equals the actual
# one (each True in `matches` counts as 1).
sum(matches) / len(matches)

0.62