In [2]:
import sys
from os import path
from os.path import exists
import pandas as pd
import json
from collections import Counter
import matplotlib.pyplot as plt
import sagemaker
import boto3
import csv
import nltk
from sklearn.model_selection import train_test_split

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
role = sagemaker.get_execution_role()

s3 = boto3.resource('s3')

bucket = 'ad440-mpg-floop-export-storage'

sess = sagemaker.Session()
obj = s3.Object(bucket, 'auto-floop-s3-export3-sagemaker.json')

s3_client = boto3.client('s3')
client = boto3.client('sagemaker')

prefix = "sagemaker/blazingtext"

region_name = boto3.Session().region_name

### Get Data
* Get csv data (that we created in the emotions file) so we can use the analysis to train the model!

In [5]:
# get clean sentences and emotions
df = pd.read_csv('sentenceEmotion.csv', index_col=[0])

#remove all non-emotion data
df = df[df.Emotion != 'none']
df.head()

Unnamed: 0,Sentence,Emotion
1,"If the temperature in a pond is lower, then th...",Fear
2,Maybe there is a need to be clear about why th...,Surprise
3,More details. A lot happened.,Sad
6,pronoun problem,Fear
7,"{{NAME}}, your handwriting is hard to read...p...",Angry


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1819 entries, 1 to 3236
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  1819 non-null   object
 1   Emotion   1819 non-null   object
dtypes: object(2)
memory usage: 42.6+ KB


In [7]:
df['Emotion'].value_counts()

Fear        701
Sad         414
Surprise    317
Happy       240
Angry       147
Name: Emotion, dtype: int64

In [8]:
# Add categorical values to emotions for simplicity
df['Emotion'] = pd.Categorical(pd.factorize(df.Emotion)[0])

# Fear = 0
# Surprise = 1
# Sad = 2
# Angry = 3
# Happy = 4

df.head()

Unnamed: 0,Sentence,Emotion
1,"If the temperature in a pond is lower, then th...",0
2,Maybe there is a need to be clear about why th...,1
3,More details. A lot happened.,2
6,pronoun problem,0
7,"{{NAME}}, your handwriting is hard to read...p...",3


In [9]:
df['Emotion'].value_counts()

0    701
2    414
1    317
4    240
3    147
Name: Emotion, dtype: int64

### Prepare dataset 
The <b>label</b> needs to be in front with the emotional value and the sentence needs to be tokenized for the blazingtext algo to work.

In [10]:
def tokenize(review):
    # delete commas and quotation marks, apply tokenization and join back into a string separating by spaces
    return ' '.join([str(token) for token in nltk.word_tokenize(str(review).replace(',', '').replace('"', '').lower())])
     
def prepare_data(df):
    df['Emotion'] = df['Emotion'].map(lambda Emotion : '__label__{}'.format(str(Emotion).replace('__label__', '')))
    df['Sentence'] = df['Sentence'].map(lambda Sentence : tokenize(Sentence)) # Replace all None
    return df
 
 
 
df_blazingtext = df[['Emotion', 'Sentence']].reset_index(drop=True)
df_blazingtext = prepare_data(df_blazingtext)
df_blazingtext

Unnamed: 0,Emotion,Sentence
0,__label__0,if the temperature in a pond is lower then the...
1,__label__1,maybe there is a need to be clear about why th...
2,__label__2,more details . a lot happened .
3,__label__0,pronoun problem
4,__label__3,{ { name } } your handwriting is hard to read ...
...,...,...
1814,__label__0,# 12 leave in square root form on these ! ( -1 )
1815,__label__4,# 12 looks good !
1816,__label__4,# 12 needs review . these functions are import...
1817,__label__0,# 12 needs unit


In [11]:
# Split all data into 80% train and 20% holdout
df_train, df_validation = train_test_split(df_blazingtext, test_size=0.20, stratify=df_blazingtext['Emotion'])

Upload training and validation csv files to notebook and s3 bucket.

In [12]:
blazingtext_train_path = './train.csv'
df_train[['Emotion', 'Sentence']].to_csv(blazingtext_train_path, index=False, header=False, sep=' ')
 
blazingtext_validation_path = './validation.csv'
df_validation[['Emotion', 'Sentence']].to_csv(blazingtext_validation_path, index=False, header=False, sep=' ')

s3_output_location = "s3://{}/{}/output".format(bucket, prefix)
 
s3_train_data = sess.upload_data(bucket=bucket, key_prefix=prefix, path=blazingtext_train_path)
s3_validation_data = sess.upload_data(bucket=bucket, key_prefix=prefix, path=blazingtext_validation_path)

### Train the model

Set up estimator object. The estimator launches the training job

In [13]:
#container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
container = sagemaker.image_uris.retrieve(
    region=region_name,
    framework='blazingtext'
 
)

resource configurations for the estimator:

In [14]:
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=30,
    output_path = s3_output_location,
    max_run=8000,
    hyperparameters={
        # supervised text classification
        "mode": "supervised",
        # number of passes through the dataset
        "epochs": 1,
        # discard words that pass through this amount of times
        "min_count": 2,
        "learning_rate": 0.05,
        "vector_dim": 10,
        "early_stopping": True,
        "patience": 4,
        "min_epochs": 5,
        # number of words
        "word_ngrams": 2,
    },
)

Prepare how the data channels and the algorithm communicate. The objects are created from the data channels and put in a dictionary for the algorithm..

In [15]:
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}


### Train the algo
it may take a bit. 
Once the job has finished a "Job complete" message will be printed. The trained model can be found in the S3 bucket that was setup as output_path in the estimator.

In [16]:
estimator.fit(inputs=data_channels, logs=True)

2022-03-24 22:04:33 Starting - Starting the training job...
2022-03-24 22:04:56 Starting - Preparing the instances for trainingProfilerReport-1648159473: InProgress
.........
2022-03-24 22:06:16 Downloading - Downloading input data.....[34mArguments: train[0m
[34m[03/24/2022 22:07:09 INFO 140148829439808] nvidia-smi took: 0.02523350715637207 secs to identify 0 gpus[0m
[34m[03/24/2022 22:07:09 INFO 140148829439808] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[03/24/2022 22:07:09 INFO 140148829439808] Processing /opt/ml/input/data/train/train.csv . File size: 0.13014984130859375 MB[0m
[34m[03/24/2022 22:07:09 INFO 140148829439808] Processing /opt/ml/input/data/validation/validation.csv . File size: 0.03947925567626953 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  1715[0m
[34mLoading validation data from /opt/ml/input/data/validation/validation.csv[0m
[34mLoaded validation data.[0

In [17]:
estimator.latest_training_job.wait(logs=False)
estimator.training_job_analytics.dataframe()


2022-03-24 22:08:37 Starting - Preparing the instances for training
2022-03-24 22:08:37 Downloading - Downloading input data
2022-03-24 22:08:37 Training - Training image download completed. Training in progress.
2022-03-24 22:08:37 Uploading - Uploading generated training model
2022-03-24 22:08:37 Completed - Training job completed






Unnamed: 0,timestamp,metric_name,value
0,0.0,train:accuracy,0.3649
1,0.0,validation:accuracy,0.3626


### Hosting

delpoy the model 

In [18]:
from sagemaker.serializers import JSONSerializer

emotion_model = estimator.deploy(
    initial_instance_count=1, instance_type="ml.m5.large", serializer=JSONSerializer()
)

-----!

In [19]:
sentences = [
    'Generating random paragraphs can be an excellent way for writers to get their creative flow going at the beginning of the day.',
    'The writer has no idea what topic the random paragraph will be about when it appears.'
]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [" ".join(nltk.word_tokenize(sent)) for sent in sentences]

#payload = {"instances": tokenized_sentences, "configuration": {"k": 2}}
payload = {"instances": tokenized_sentences}
print(payload)

{'instances': ['Generating random paragraphs can be an excellent way for writers to get their creative flow going at the beginning of the day .', 'The writer has no idea what topic the random paragraph will be about when it appears .']}


In [20]:
response = emotion_model.predict(payload)

In [21]:
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

# Fear = 0
# Surprise = 1
# Sad = 2
# Angry = 3
# Happy = 4

[
  {
    "label": [
      "__label__1"
    ],
    "prob": [
      0.20009951293468475
    ]
  },
  {
    "label": [
      "__label__0"
    ],
    "prob": [
      0.20043520629405975
    ]
  }
]


In [23]:
print(predictions)

for prediction in predictions:
    print('Predicted emotion: {}'.format(prediction['label'][0].lstrip('__label__')))

[{'label': ['__label__1'], 'prob': [0.20009951293468475]}, {'label': ['__label__0'], 'prob': [0.20043520629405975]}]
Predicted emotion: 1
Predicted emotion: 0


### Delete the endpoint!!
save costs and delete the endpoint on sagemaker console- there isn't a working command line right now.
1. aws sagemaker screen
2. left navigation bar- click "inference"
3. choose the most recent blazingtext endpoint name that was auto created running this model
4. top 'actions' dropdown - choose delete
5. verify endpoint is deleted