# Deciding whether to escalate a customer support issue

For updates on the way Sagemaker or AWS behave compared to the notebook code, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-4/v-5/108

## Part 1: Load and examine the data

In [1]:
data_bucket = 'mlforbusiness-modnajra'
subfolder = 'ch04/torchtext_data'
dataset1 = 'train.csv'
dataset2 = 'test.csv'

In [2]:
import pandas as pd                               
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split
import json
import nltk
import csv
from time import sleep

nltk.download('punkt')

role = sagemaker.get_execution_role()
s3 = s3fs.S3FileSystem(anon=False)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
%%time
df_train = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset1}')
display(df_train.head())

Unnamed: 0,message,label
0,off to bed now... nytynyt tweety tweet tweets...,0
1,"@erinfxs Damn, your new twitter background is...",0
2,Ocean view room in Laguna Beach: beautiful. To...,0
3,@YoungLazy lol yes yes no problem .. thanks fo...,0
4,@CarlitoStaxx Krisy kreme only if the hot sign...,0


CPU times: user 47.3 ms, sys: 3.29 ms, total: 50.6 ms
Wall time: 137 ms


In [7]:
%%time
df_test = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset2}')
display(df_test.head())

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


CPU times: user 30.1 ms, sys: 180 µs, total: 30.3 ms
Wall time: 140 ms


In [6]:
print(f'Number of rows in dataset: {df_train.shape[0]}')
print(df_train['label'].value_counts())

Number of rows in dataset: 8252
0    6400
1    1852
Name: label, dtype: int64


## Part 2: Get the data into the right shape

In [8]:
train_df, val_df, _, _ = train_test_split(df_train,
                                         df_train['label'],
                                         test_size = 0.2,
                                         random_state = 0)

In [12]:
%%time

def preprocess(df):
    all_rows = df.values.tolist()
    transformed_rows = list(map(transform_instance, all_rows))
    transformed_df = pd.DataFrame(transformed_rows)
    return transformed_df

def transform_instance(row):
    cur_row = []
    label = "__label__1" if row[1] == 1 else "__label__0" # Prefix 0 or 1 from sentiment
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[0].lower()))
    return ' '.join(cur_row)

transformed_validation_rows = preprocess(val_df)
display(transformed_validation_rows.head())

Unnamed: 0,0
0,__label__1 # depression is to feel like crying...
1,__label__0 @ top10causes
2,__label__0 @ jmmagroup haha thanks . mayer dom...
3,__label__0 @ davidkuridza the flag is for you ?
4,__label__1 this is what it 's like to live wit...


CPU times: user 399 ms, sys: 11.7 ms, total: 411 ms
Wall time: 409 ms


In [13]:
s3_validation_data = f's3://{data_bucket}/{subfolder}/processed/validation_cap.csv'

data = transformed_validation_rows.to_csv(
        header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^').encode()
with s3.open(s3_validation_data, 'wb') as f:
    f.write(data)

In [14]:
%%time
transformed_train_rows = preprocess(train_df)
display(transformed_train_rows.head())

s3_train_data = f's3://{data_bucket}/{subfolder}/processed/train_cap.csv'

data = transformed_train_rows.to_csv(
        header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^').encode()
with s3.open(s3_train_data, 'wb') as f:
    f.write(data)

Unnamed: 0,0
0,__label__0 @ mrjoatmon you 're very welcome wh...
1,__label__0 funny day today got rubbed with gra...
2,"__label__0 @ msmithpds thanks ... easy , but t..."
3,__label__0 @ bethanydillon i would like to hav...
4,"__label__0 @ nintendored sometimes , you just ..."


CPU times: user 1.7 s, sys: 11.5 ms, total: 1.71 s
Wall time: 1.81 s


In [19]:
%%time
transformed_test_rows = preprocess(df_test)
display(transformed_test_rows.head())

s3_test_data = f's3://{data_bucket}/{subfolder}/processed/test_cap.csv'

data = transformed_test_rows.to_csv(
        header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^').encode()
with s3.open(s3_test_data, 'wb') as f:
    f.write(data)

Unnamed: 0,0
0,__label__0 just had a real good moment . i mis...
1,__label__0 is reading manga http : //plurk.com...
2,__label__0 @ comeagainjen http : //twitpic.com...
3,__label__0 @ lapcat need to send 'em to my acc...
4,__label__0 add me on myspace ! ! ! myspace.com...


CPU times: user 542 ms, sys: 0 ns, total: 542 ms
Wall time: 686 ms


## Part 3: Create training and validation datasets

In [15]:
%%time

train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 19.1 µs


In [20]:
test_data = sagemaker.session.s3_input(s3_test_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')

## Part 4: Train the model

In [16]:
s3_output_location = f's3://{data_bucket}/{subfolder}/output'

sess = sagemaker.Session()

container = sagemaker.amazon.amazon_estimator.get_image_uri(
                            boto3.Session().region_name, "blazingtext", "latest")

estimator = sagemaker.estimator.Estimator(
                            container,
                            role, 
                            train_instance_count=1, 
                            train_instance_type='ml.m5.large',
                            train_max_run = 600,
                            output_path=s3_output_location,
                            sagemaker_session=sess)

estimator.set_hyperparameters(
                            mode="supervised",
                            epochs=10,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

estimator.fit({'train': train_data, 'validation': validation_data})

2020-04-12 16:59:16 Starting - Starting the training job...
2020-04-12 16:59:17 Starting - Launching requested ML instances......
2020-04-12 17:00:19 Starting - Preparing the instances for training...
2020-04-12 17:01:04 Downloading - Downloading input data...
2020-04-12 17:01:47 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[04/12/2020 17:01:48 INFO 140531859679040] nvidia-smi took: 0.0251231193542 secs to identify 0 gpus[0m
[34m[04/12/2020 17:01:48 INFO 140531859679040] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[04/12/2020 17:01:48 INFO 140531859679040] Processing /opt/ml/input/data/train/train_cap.csv . File size: 0 MB[0m
[34m[04/12/2020 17:01:48 INFO 140531859679040] Processing /opt/ml/input/data/validation/validation_cap.csv . File size: 0 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  1970[0m
[34mLoading validation data from /opt/ml/input/data/validation/validation_cap.csv[0m


## Part 5: Host the Model

In [17]:
endpoint_name = 'customer-support-ch04'
try:
    sess.delete_endpoint(endpoint_name)
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    sleep(30)
except:
    pass

In [18]:
text_classifier = estimator.deploy(
                        initial_instance_count = 1,
                        instance_type = 'ml.t2.medium',
                        endpoint_name=endpoint_name)

-------------!

## Test the Model

In [27]:
display(df_test.head())

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [28]:
def get_inference(sentences):
    #print(sentences)
    sentences = [sentences.lower()]
     # using the same nltk tokenizer that we used during data preparation for training
    tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
    #print(tokenized_sentences)
    payload = {"instances" : tokenized_sentences}

    response = text_classifier.predict(json.dumps(payload))

    predictions = json.loads(response)
    proba = predictions[0].get('prob')[0]
    label = predictions[0].get('label')[0][9:]
    if label == 1:
        resp = 1
    else:
        resp = 0
    return (proba, resp)

In [34]:
import multiprocessing
from multiprocessing import Pool

In [35]:
all_rows=df_test.message
pool = Pool(processes=multiprocessing.cpu_count())
predictions = pool.map(get_inference, all_rows)
pool.close() 
pool.join()

In [36]:
df_test['Proba','Predict'] = df_test.message.map(get_inference)

In [37]:
df_test.columns=['message','label','Prediction']

In [38]:
prob = []
lbl = []
for v in df_test.Prediction.values:
    prob.append(v[0])
    lbl.append(v[1])

In [39]:
df_test['prob'] = pd.Series(prob)
df_test['pred_lbl'] = pd.Series(lbl)

In [40]:
df_test['positive_prob'] = df_test.Prediction.map(lambda v: 1-v[0] if v[1]==1 else v[0])

In [42]:
display(df_test.head(10))

Unnamed: 0,message,label,Prediction,prob,pred_lbl,positive_prob
0,just had a real good moment. i missssssssss hi...,0,"(0.9999982118606567, 0)",0.999998,0,0.999998
1,is reading manga http://plurk.com/p/mzp1e,0,"(0.9996488094329834, 0)",0.999649,0,0.999649
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0,"(0.9921782612800598, 0)",0.992178,0,0.992178
3,@lapcat Need to send 'em to my accountant tomo...,0,"(0.9850204586982727, 0)",0.98502,0,0.98502
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,"(0.9986199140548706, 0)",0.99862,0,0.99862
5,so sleepy. good times tonight though,0,"(1.0000100135803223, 0)",1.00001,0,1.00001
6,"@SilkCharm re: #nbn as someone already said, d...",0,"(0.9953364133834839, 0)",0.995336,0,0.995336
7,23 or 24ï¿½C possible today. Nice,0,"(1.0000100135803223, 0)",1.00001,0,1.00001
8,nite twitterville workout in the am -ciao,0,"(0.999954104423523, 0)",0.999954,0,0.999954
9,"@daNanner Night, darlin'! Sweet dreams to you",0,"(1.0000032186508179, 0)",1.000003,0,1.000003


In [44]:
from sklearn.metrics import confusion_matrix, classification_report
print(pd.crosstab(df_test.label,df_test.pred_lbl, \
                  rownames=['Actual'], colnames=['Predicted']))
print(classification_report(df_test.label,df_test.pred_lbl))

Predicted     0
Actual         
0          1600
1           462
              precision    recall  f1-score   support

           0       0.78      1.00      0.87      1600
           1       0.00      0.00      0.00       462

   micro avg       0.78      0.78      0.78      2062
   macro avg       0.39      0.50      0.44      2062
weighted avg       0.60      0.78      0.68      2062



  'precision', 'predicted', average, warn_for)


In [47]:
import sklearn
sklearn.metrics.accuracy_score(df_test.label,df_test.pred_lbl)

0.7759456838021338

## Remove the Endpoint (optional)

Comment out this cell to remove the endpoint if you want the endpoint to exist after "run all"

In [48]:
sess.delete_endpoint(endpoint_name)