In [1]:
import boto3
import json
import pandas as pd
import numpy as np
import io
import csv

In [2]:
s3_client = boto3.client('s3')
bucket = 'sagemaker-studio-197812454142-1pz1d5gxfcv'

In [4]:
# Load comprehend tool
# Remember to attach policy "ComprehendFullAccess" in IAM role 
comprehend = boto3.client(service_name='comprehend', region_name='us-east-1')

# Helper function to apply in pd dataframe
def getComprehendSentiment(text):
    result = comprehend.detect_sentiment(Text=text, LanguageCode='en')["Sentiment"]
    
    if result == "NEGATIVE":
        return 1
    elif result == "POSITIVE":
        return 2
    #else MIXED OR NEUTRSL
    return 3

def getDataframeFromS3(file_name, names):
    # Load from S3 boto client
    response = s3_client.get_object(Bucket=bucket ,Key=file_name)
    # Get file from request
    file = response["Body"].read()
    # Load into pandas
    return pd.read_csv(io.BytesIO(file), names=names)
    

# S3 Load

In [5]:
file_name = 'test.csv'
names=["label", "title", "review"]
data_test = getDataframeFromS3(file_name = file_name, names = names)

In [24]:
data_test['predicted'] = None
data_test.loc[:,'incorrect']  = 1
data_test.head()

Unnamed: 0,label,title,review,predicted,incorrect
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...,,1
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...,,1
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...,,1
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...,,1
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...,,1


# Default Amazon Comprehend Sentiment Analysis

## Training

In [10]:
data_test.shape

(400000, 5)

In [12]:
data_test = data_test.sample(5000)

In [13]:
data_test.loc[:,'predicted'] = data_test.loc[:,'review'].apply(getComprehendSentiment)

In [14]:
data_test

Unnamed: 0,label,title,review,predicted,incorrect
315933,1,counterfeit?,the batteries barely held a charge. One to thr...,1,1
330527,2,Awesome! Can't live without this stuff!,I love this stuff! I used to have a horrible t...,2,1
160858,1,waste of money,This CD is for you if you are a die hard Spide...,1,1
226132,1,Definately Not Like Morrowind,I was dissapointed that this wasn't like Morro...,1,1
281701,2,Pretty Cute Comedy,"This isn't the funniest movie I've ever seen, ...",2,1
...,...,...,...,...,...
124740,1,not great,This is a typical book for the Dragonlance ser...,1,1
85054,1,"Broke in 2 months, hideous customer service",My first unit was stolen at a car service plac...,1,1
265831,1,Not worth the money!!,"A good lens, but save your dollars and buy the...",2,1
364079,2,Good guide for those interested in Mustang upg...,This book is basically a republishing of artic...,2,1


In [15]:
data_test.loc[data_test['predicted'] == data_test['label'],'incorrect'] = 0

## Results

In [22]:
errors = data_test.incorrect.sum()
accuracy = (data_test.shape[0]-errors)/data_test.shape[0]
print(f"Accuracy is {accuracy*100}%")

Accuracy is 81.64%


In [24]:
errors = data_test.loc[data_test.predicted != 3, 'incorrect'].sum()
accuracy = (data_test.shape[0]-errors)/data_test.shape[0]
print(f"Accuracy discarding 'MIXED' and 'NEUTRAL' results is {accuracy*100}%")

Accuracy discarding 'MIXED' and 'NEUTRAL' results is 92.84%


In [25]:
del data_test

# Build Custom Classifier

## Prepare Training File

### Doesn't work

In [4]:
file_name = 'train.csv'
names=["label", "title", "review"]
data_train = getDataframeFromS3(file_name = file_name, names = names)

In [5]:
data_train.label = data_train.label.replace(1, "NEGATIVE")
data_train.label = data_train.label.replace(2, "POSITIVE")
data_train.review = data_train.review.str.replace(",", "")

In [6]:
data_train.head()

Unnamed: 0,label,title,review
0,POSITIVE,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,POSITIVE,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,POSITIVE,Amazing!,This soundtrack is my favorite music of all ti...
3,POSITIVE,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,POSITIVE,"Remember, Pull Your Jaw Off The Floor After He...",If you've played the game you know how divine ...


### Try with normalized text

In [4]:
file_name = 'X_train.csv'
names=["review"]
data_train = getDataframeFromS3(file_name = file_name, names = names)

In [10]:
file_name = 'y_train.csv'
names=["label"]
data_train["label"] = getDataframeFromS3(file_name = file_name, names = names)["label"]

In [14]:
data_train.label = data_train.label.replace(0, "NEGATIVE")
data_train.label = data_train.label.replace(1, "POSITIVE")
data_train = data_train[["label", "review"]]

In [15]:
data_train.head()

Unnamed: 0,label,review
0,POSITIVE,get this toy couple day ago and absolutely lov...
1,POSITIVE,though lyrically the overall feel this record ...
2,POSITIVE,have look for anthology outstanding literary m...
3,NEGATIVE,try own both the pink and the orangish color a...
4,POSITIVE,good coffee pot hot coffee cup make strong cup...


## Save Training File to S3

In [27]:
df = data_train.groupby('label').apply(lambda x: x.sample(300)).reset_index(drop=True)
fileName = "amazon-comprehend-train-data-sample.csv"
# Write to a file in S3
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, header=False, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, fileName).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '7C5341C4DB3EAF75',
  'HostId': 'RnyliSY02do0s3t4zLH7WWdW3Ae3MlY8+01BamDt6I+oUcnq+v9SR1h9e/vnNrYOe9SBiDIhMtw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'RnyliSY02do0s3t4zLH7WWdW3Ae3MlY8+01BamDt6I+oUcnq+v9SR1h9e/vnNrYOe9SBiDIhMtw=',
   'x-amz-request-id': '7C5341C4DB3EAF75',
   'date': 'Sun, 06 Sep 2020 22:55:38 GMT',
   'etag': '"5fe16e12344086ad8cdbef7d83055059"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"5fe16e12344086ad8cdbef7d83055059"'}

## Prepare request to create custom classifier

In [28]:
comprehend.create_document_classifier(
    DocumentClassifierName='amazon-reviews-custom-classifier-sample',
    DataAccessRoleArn='arn:aws:iam::197812454142:role/role_comprehend_s3',
    InputDataConfig={
        'S3Uri': f's3://{bucket}/{fileName}'
    },
    OutputDataConfig={
        'S3Uri': f's3://{bucket}/{fileName[:-4]}-output'
    },
    LanguageCode='en',
    Mode='MULTI_CLASS'
)

{'DocumentClassifierArn': 'arn:aws:comprehend:us-east-1:197812454142:document-classifier/amazon-reviews-custom-classifier-sample',
 'ResponseMetadata': {'RequestId': 'd2a99485-5266-47d8-b323-f6a7fea6ac0d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd2a99485-5266-47d8-b323-f6a7fea6ac0d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '129',
   'date': 'Sun, 06 Sep 2020 22:55:49 GMT'},
  'RetryAttempts': 0}}

# Test data on custom classifier

## Save test data to S3

In [19]:
file_name = 'test.csv'
# Load from S3 boto client
response = s3_client.get_object(Bucket=bucket ,Key=file_name)
# Get file from request
file = response["Body"].read()
# Load into pandas
data_test = pd.read_csv(io.BytesIO(file), names=["label", "title", "review"])

In [20]:
df = data_test["review"]
fileName = "amazon-comprehend-test-data"
# Write to a file in S3
csv_buffer = io.StringIO()
np.savetxt(csv_buffer, df.values, fmt='%s')
#sample.to_csv(csv_buffer, header=False, index=False, quoting = csv.QUOTE_NONE, quotechar = '')
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, fileName).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '3H1XDZ1REY6P8TBP',
  'HostId': '+xbXusvEOweZCq+5cMh3AKGTP737AUfggVXNIvz6f+XMGSVXi2s4fUIxzKMINR5OafvdZyIIgrw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '+xbXusvEOweZCq+5cMh3AKGTP737AUfggVXNIvz6f+XMGSVXi2s4fUIxzKMINR5OafvdZyIIgrw=',
   'x-amz-request-id': '3H1XDZ1REY6P8TBP',
   'date': 'Sat, 05 Sep 2020 02:56:41 GMT',
   'etag': '"06111f2385683fe63715ed89390a1400"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"06111f2385683fe63715ed89390a1400"'}

## Test Data

In [None]:
# No python function.
# Must to request or use Comprehend interface