# Using NLP to improve Customer Service operations

This is the accompanying notebook for Chapter 6 of the book Natural Language Processing with AWS AI Services. Please ensure you have read and followed the instructions in Chapter 6 before trying out the steps in this notebook. Briefly, this notebook contains the code for the following topics:

1. Prerequisites
1. Preprocess the Text Data
1. Process Topic Modeling Results
1. Train an Amazon Comprehend Custom Classifier
1. Automate Request Routing using the Classifier
1. Automate feedback analysis using Comprehend Sentiment Detection

## Prerequisites

In [1]:
import pandas as pd
import webbrowser, os
import json
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.s3 import S3Uploader, S3Downloader
import uuid
import time
import io
from io import BytesIO
import sys
import csv
from pprint import pprint
from IPython.display import Image, display
from PIL import Image as PImage, ImageDraw

In [2]:
# Name of the S3 bucket used by the upload/download steps below; replace the placeholder with your own bucket
bucket = '<your-s3-bucket>'
# Key prefix prepended to every S3 object key this notebook reads or writes
prefix = 'chapter6'

## Preprocess the Text data

We will use the Consumer Complaints data for the State of Ohio from the Consumer Financial Protection Bureau for our solution - https://www.consumerfinance.gov/data-research/consumer-complaints/search/?dataNormalization=None&dateRange=1y&date_received_max=2021-05-17&date_received_min=2020-05-17&searchField=all&state=OH&tab=Map. You can try other datasets from this site, or your own customer service data. For your convenience, I have included the complaints data as a CSV file in the GitHub repository; it will be available to you once you clone the repo.

In [None]:
# Load the raw complaints CSV file into a Pandas DataFrame for easy manipulation
raw_df = pd.read_csv('topic-modeling/initial/complaints_data_initial.csv')
# Display the (rows, columns) dimensions of the loaded data
raw_df.shape

In [None]:
# Keep only the rows whose complaint narrative is actually present
raw_df = raw_df[raw_df['Consumer complaint narrative'].notna()]

In [None]:
# Only the complaint narrative is needed for our solution, so keep an
# independent single-column copy and discard every other column
raw_df = raw_df[['Consumer complaint narrative']].copy()

In [None]:
# Display the dimensions again: empty narratives were dropped and only the narrative column remains
raw_df.shape

In [None]:
# Persist the narrative-only DataFrame as a CSV file (no header row, no index column)
parent_dir = os.getcwd()+'/topic-modeling'
directory = "raw"

# Make sure the target folder exists before writing into it
path = os.path.join(parent_dir, directory)
os.makedirs(name=path, exist_ok=True)
print("Directory '%s' created successfully" % (directory,))

raw_df.to_csv(
    'topic-modeling/raw/complaints_data_subset.csv',
    index=False,
    header=False,
)

In [None]:
# Define a regex expression method for splitting the text into individual sentences
# source - https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
import re
# Building blocks for the sentence-splitting regexes. Raw strings are used so
# that regex escapes such as \s are not interpreted as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split free text into a list of sentences.

    Periods that do not end a sentence (titles such as "Dr.", company
    suffixes such as "Inc.", single-letter initials, dotted acronyms and web
    domains) are temporarily replaced with a ``<prd>`` placeholder, so that
    only real terminators (``.``, ``?``, ``!``) receive the ``<stop>`` marker
    on which the text is finally split.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        A list of whitespace-stripped sentences, each keeping its terminating
        punctuation. ASCII double quotes are removed and decimal points
        between digits are dropped. Trailing text without a terminator is
        returned as the last sentence; an empty input yields an empty list.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are part of a token rather than a sentence end
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym or suffix followed by a typical sentence starter does end a sentence
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Drop decimal points between digits so they are not mistaken for sentence ends
    text = re.sub(r"(?<=\d)[\.](?=\d)", "", text)

    # Move terminators outside closing quotes so the quote stays with its sentence
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    # Every remaining terminator ends a sentence. (Previously ". " was deleted
    # outright at this point, which glued consecutive sentences together.)
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    text = text.replace('"', '')

    sentences = [s.strip() for s in text.split("<stop>")]
    # Drop the empty remainder after the final terminator, but keep trailing
    # text that simply lacks terminal punctuation instead of discarding it
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

In [None]:
# Read every file in the raw folder and split its contents into a list of sentences
folderpath = r"topic-modeling/raw"  # provide the folder where your files are
# Skip hidden entries and anything that is not a regular file (open() would fail on a directory);
# sorting makes the processing order deterministic, since os.listdir returns entries in arbitrary order
filepaths = [
    os.path.join(folderpath, name)
    for name in sorted(os.listdir(folderpath))
    if not name.startswith('.') and os.path.isfile(os.path.join(folderpath, name))
]
print(filepaths)
all_files = []

for path in filepaths:
    # Read as utf-8 explicitly (pandas' default when writing CSV) rather than the platform default;
    # the with-statement closes the file, so no explicit close() is needed
    with open(path, 'r', encoding='utf-8') as f:
        all_files.append(split_into_sentences(f.read()))

In [None]:
# Write the formatted sentences into a CSV file, one sentence per row
import csv
fnfull = "topic-modeling/input/complaints_data_formatted.csv"
directory = "input"
parent_dir = os.getcwd()+'/topic-modeling'

# Make sure the target folder exists before writing into it
path = os.path.join(parent_dir, directory)
os.makedirs(path, exist_ok = True)
print("Directory '%s' created successfully" %directory)

# newline='' hands line-ending control to the csv module, as its documentation requires;
# without it, platforms that translate '\n' end up with an extra blank line after every row
with open(fnfull, "w", encoding='utf-8', newline='') as ff:
    csv_writer = csv.writer(ff, delimiter=',', quotechar = '"')
    for infile in all_files:
        csv_writer.writerows([sentence] for sentence in infile)

In [None]:
# Let's store the formatted CSV into a Pandas DataFrame
# as we will use this to create the training dataset for our custom classifier.
# The file has no header row, so the single column is named 'Text' explicitly.
columns = ['Text']
form_df = pd.read_csv('topic-modeling/input/complaints_data_formatted.csv', header=None, names = columns)
# Display the (rows, columns) dimensions: one row per formatted sentence
form_df.shape

In [None]:
# Push the formatted CSV to the input prefix in S3, where the topic modeling job will read it
s3 = boto3.client('s3')
s3.upload_file(
    Filename='topic-modeling/input/complaints_data_formatted.csv',
    Bucket=bucket,
    Key=prefix+'/topic_modeling/input/topic_input.csv',
)

#### Now follow the instructions in the book to run the topic modeling job from the Amazon Comprehend console

## Process Topic Modeling Results

In [None]:
# Let's first download the results of the topic modeling job.
# Please copy the output data location from your topic modeling job and substitute it for the
# <name-of-your-output-data-s3-prefix> placeholder in tpprefix below

directory = "results"
parent_dir = os.getcwd()+'/topic-modeling'

# Local folder that will hold the downloaded archive
path = os.path.join(parent_dir, directory)
os.makedirs(path, exist_ok = True)
print("Directory '%s' created successfully" %directory)

# S3 object key of the job's output archive
tpprefix = prefix+'/topic_modeling/results/<name-of-your-output-data-s3-prefix>/output/output.tar.gz'
s3.download_file(bucket, tpprefix, 'topic-modeling/results/output.tar.gz')
# No -C option is given, so tar extracts into the current working directory,
# which is where the following cell reads topic-terms.csv and doc-topics.csv from
!tar -xzvf topic-modeling/results/output.tar.gz

In [None]:
# Now load each of the extracted result CSV files into its own DataFrame
# tt_df: read below through its 'topic', 'term' and 'weight' columns
tt_df = pd.read_csv('topic-terms.csv')
# dt_df: read below through its 'docname' and 'topic' columns
dt_df = pd.read_csv('doc-topics.csv')

In [None]:
# Each row of the topic terms DataFrame holds a topic number, a term belonging to that topic,
# and the weight with which the term contributes to the topic; print them as topic:term:weight
for _, row in tt_df.iterrows():
    print(":".join([str(row['topic']), row['term'], str(row['weight'])]))

In [None]:
# The same line (docname) may be listed under several topics; for this example we only
# keep its first occurrence and discard the later duplicates
dt_df = dt_df[~dt_df.duplicated(subset=['docname'])]

In [None]:
# For each topic, find the highest weight among its terms
# (sort=False keeps the topics in their order of first appearance)
ttdf_max = tt_df.groupby(['topic'], sort=False)['weight'].max()

In [None]:
# Collect, for every topic, the term row(s) that carry that topic's highest weight.
# The rows are matched on topic AND weight, so a row from another topic that happens
# to share the same weight value is not pulled in by mistake.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
top_rows = [
    tt_df[(tt_df['topic'] == topic) & (tt_df['weight'] == weight)]
    for topic, weight in ttdf_max.items()
]
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns
newtt_df = pd.concat(top_rows) if top_rows else tt_df.iloc[0:0]

newtt_df = newtt_df.reset_index(drop=True)
newtt_df

In [None]:
# Having reviewed the input document, the masked characters ('xxxx') mainly correspond to debt related
# complaints from customers, so we replace the masked term with 'debt' and the term 'husband' with 'family'
label_overrides = {'xxxx': 'debt', 'husband': 'family'}

# assign() returns a new DataFrame, so the result must be kept for the Label column to be added
form_df = form_df.assign(Label='')

# First listed term of each topic, built once instead of querying newtt_df for every document
topic_terms = newtt_df.drop_duplicates(subset=['topic']).set_index('topic')['term'].to_dict()

for docname, top in zip(dt_df['docname'], dt_df['topic']):
    # docname is split on ':' and its second part is used as the row index into form_df
    line = int(docname.split(':')[1])
    term = topic_terms[top]
    form_df.at[line, 'Label'] = label_overrides.get(term, term)

In [None]:
# List the distinct values present in the Label column
form_df['Label'].unique()

In [None]:
# create the custom-classification directory and the train directory inside it
parent_dir = os.getcwd()+'/custom-classification'
directory = "train"

# Path
path = os.path.join(parent_dir, directory)
# makedirs also creates the intermediate custom-classification directory, so one call is enough
os.makedirs(path, exist_ok = True)
for created in ("custom-classification", directory):
    print("Directory '%s' created successfully" % created)

# Let's now rearrange the columns to have the label as the first column
form_df = form_df[['Label','Text']]
# header=False (the documented boolean form, as used for the raw CSV export) writes no header row
# NOTE(review): rows that never received a label are written with an empty first column -
# confirm that the classifier training job accepts such rows, or filter them out first
form_df.to_csv('custom-classification/train/train.csv', header=False, index=False)
s3.upload_file('custom-classification/train/train.csv', bucket, prefix+'/custom_classification/train/train.csv')

#### Now follow the instructions in the book to train your Amazon Comprehend Custom Classifier

## Automate Request Routing

In [None]:
# ARN of the custom classifier endpoint passed to classify_document below; replace the placeholder with yours
endpoint_arn = '<comprehend-custom-classifier-endpoint-arn>'

In [None]:
# Send a sample customer message to our custom classifier endpoint for real-time analysis
comprehend = boto3.client('comprehend')
test_text = 'because of your inability to accept my payments on time I now have a really bad credit score, you need to fix this now'
response = comprehend.classify_document(
    EndpointArn=endpoint_arn,
    Text=test_text,
)
print(response)

In [None]:
# The class with the highest score names the team this request should be routed to
cls_df = pd.DataFrame(response['Classes'])
max_score = cls_df['Score'].max()
best_matches = cls_df.loc[cls_df['Score'] == max_score, 'Name']
routing_type = best_matches.iloc[0]
print("This request should be routed to: " + routing_type)

## Automate Feedback Analysis

In [None]:
# Run the same customer message through the Amazon Comprehend Detect Sentiment API
sent_response = comprehend.detect_sentiment(LanguageCode='en', Text=test_text)
print("The customer's feedback sentiment is: " + sent_response['Sentiment'])

#### This concludes the notebook; please go back to the book to review the next steps

### End of Notebook