# For this lab we will be using the following dataset to predict if the text belongs to __label__1 or __label__2

# STEP 1 - SETUP THE JUPYTER NOTEBOOK

## First we update the SageMaker environment

In [None]:
%pip install -qU --upgrade boto3
%pip install -qU --upgrade sagemaker

## Now we import the libraries that we will need.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker.estimator import Estimator 
from sagemaker import get_execution_role
import boto3
from sagemaker import image_uris
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
import time
from time import gmtime, strftime

## Now we download the Natural Language Toolkit: Punkt sentence tokenizer to tokenize the words and punctuation

In [None]:
# word_tokenize needs the Punkt tokenizer models. Older NLTK releases load the
# 'punkt' package while newer releases look up 'punkt_tab' instead, so fetch
# both to keep the tokenizing cell working on either.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

## Now we will download the list of stopwords from Natural Language Toolkit so that we can use it to remove the stopwords in our sentences

In [None]:
# Import the corpus reader under its own name. Binding the word collection to
# `stopwords` below replaces the reader imported at the top of the notebook, so
# without this alias re-running the cell would fail: the collection has no
# `.words` method.
from nltk.corpus import stopwords as stopwords_corpus

nltk.download('stopwords')

# Keep the English stop words in a set: the removal step tests every word of
# every review for membership, and a set answers that in constant time.
stopwords = set(stopwords_corpus.words('english'))

## Now we setup our SageMaker Environment Variables

In [None]:
# Resolve the execution role, then open a SageMaker session and read the
# region and default S3 bucket from it.
role = get_execution_role()
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()

# Echo each setting so the values in use are visible in the cell output.
for caption, setting in (
    ("The role is ", role),
    ("The session is ", sess),
    ("The region is ", region),
    ("The bucket is ", bucket),
):
    print(caption, setting)

# STEP 2 - PRE PROCESS THE DATA

## In this step your job is to convert the dataset into the following format.   

## `__label__{label name}` [space] {Text from the dataset that has been identified as the label}

### Let's take a look at the data

In [None]:
# Load the lab dataset from the CSV file (path is relative to the notebook's
# working directory) into a pandas DataFrame. Later cells read and rewrite
# its "Review" column.

amazon_df = pd.read_csv("Amazon_text.csv")

In [None]:
# Look at the raw data before any preprocessing. A bare expression on the
# last line of a notebook cell is rendered as the cell's output.

amazon_df

### Now let's tokenize the column "Review".

### Note that the column "Review" is turned into a LIST of words and we will need to change that back to a text string

In [None]:
# Use NLTK's word_tokenize to separate each review into individual tokens.
# After this cell every entry in "Review" is a list of tokens, not a string.

amazon_df['Review'] = amazon_df['Review'].apply(word_tokenize)

In [None]:
# Print the DataFrame to check that "Review" now holds lists of tokens.
print(amazon_df)

In [None]:
# Tokenizing left a list of tokens in each "Review" entry. Glue every list back
# into one space-separated string by applying the bound str.join method.

amazon_df['Review'] = amazon_df['Review'].apply(' '.join)

In [None]:
# Look at the data to confirm "Review" holds plain strings again rather than
# lists of tokens.

amazon_df

### Notice how we are now back to a string of text and it is not a list of words

### Now let's turn all the letters to lower case before we remove the stop words because the stop words are all lower case.

In [None]:
# Convert every review to lower case so that capitalised variants match the
# lower-case stop-word list, i.e. "The", "THE" and "the" all become "the".

amazon_df['Review'] = amazon_df['Review'].str.lower()

In [None]:
# Look at the data to confirm the reviews are now entirely lower case.

amazon_df

### Now let's remove all the stopwords

In [None]:
# Remove every stop word from each review.
# The lookup set is built once, outside the per-review function, so each
# membership test is constant time instead of a scan of the whole stop-word
# collection for every word of every review.
stopword_lookup = set(stopwords)


def _drop_stopwords(text):
    """Return `text` without its stop words.

    The text is split on whitespace, words found in the stop-word lookup are
    discarded, and the remaining words are joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stopword_lookup)


amazon_df['Review'] = amazon_df['Review'].apply(_drop_stopwords)

In [None]:
# Look at the data to confirm the stop words have been removed.

amazon_df

### Now that the column "Review" is in the right format we can now create the Training and Validation file

In [None]:
# Shuffle and split the data into 80% training and 20% validation

train_data, validate_data = train_test_split(amazon_df, test_size=0.2)


def _save_as_text(frame, path):
    """Write `frame` to `path` as plain text, one line per row.

    Each line holds the row's column values, in column order, separated by a
    single space. No header row and no index are written, so that BlazingText
    will accept the file.

    The lines are written by hand rather than with DataFrame.to_csv: a
    space-separated CSV would have to quote or escape every multi-word review,
    which is not what the training file should look like.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(
            ' '.join(str(value) for value in row) + '\n'
            for row in frame.itertuples(index=False, name=None)
        )


# Save both splits locally to the notebook as TEXT files

_save_as_text(train_data, 'Amazon_text.train')
_save_as_text(validate_data, 'Amazon_text.validate')

## Now copy the files to the S3 bucket in the appropriate folders so the model can find the files.

In [None]:
# Upload both local files to the S3 bucket with sess.upload_data.
# Each call takes the local file to upload and a key prefix: the top level
# folder followed by the subfolders the file is stored under.
# The output folder for the results is defined here as well.
# Note: the files are not named .csv because they are text files, not csv files.

prefix = 'Amazon_text'

training_data_path = sess.upload_data(
    path='Amazon_text.train',
    key_prefix=f'{prefix}/input/train',
)
validation_data_path = sess.upload_data(
    path='Amazon_text.validate',
    key_prefix=f'{prefix}/input/validate',
)
output_data_path = f's3://{bucket}/{prefix}/output'

# Show the three S3 locations that the training cells will refer to.
for s3_location in (training_data_path, validation_data_path, output_data_path):
    print(s3_location)

# STEP 3 - This is where you will Train the Model

## Tell SageMaker which pre-built Docker image to use

In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html



## Tell SageMaker where the data is located
## Hint: remember that we copied it to the S3 bucket above

In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/utility/inputs.html



## Create the Estimator and apply only the required hyperparameters.

In [None]:
#  ESTIMATOR  https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
#  HYPERPARAMETERS  https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html



## Launch the training job without hyperparameter tuning so that we can compare the results after hyperparameter tuning. 



In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.EstimatorBase.fit



## When the training is done it will report the Training Accuracy and Validation Accuracy. Make note of each so that we can see the difference after tuning

# Now let's tune some Hyperparameters

## Set the Hyperparameter tuning dictionary

In [None]:
# Set the BlazingText hyperparameter dictionary
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.Estimator.set_hyperparameters


## Define the interaction with Amazon SageMaker hyperparameter tuning job


In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html

# Take the time to review and choose an objective you want to target
# https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext-tuning.html

# Take the time to review the tunable hyperparameters and try some different ones. 
# https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext-tuning.html





## Start the tuning Jobs
## Note: we already did this earlier when we trained our first job. We are going to do it again, but this time using HyperparameterTuner instead of Estimator

In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html#sagemaker.tuner.HyperparameterTuner.fit



## When the tuning is done print out the best value found for each hyperparameter so that they can be included in the next training job.


In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html#sagemaker.tuner.HyperparameterTuner.best_training_job



## Launch a new training job with the new hyperparameter values you just printed out

## Hint: repeat the earlier step where you ran the training job with only the required hyperparameters, and now also include the ones you chose for hyperparameter tuning, using the values you printed.

In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.EstimatorBase.fit


## Take note of the difference in the Training and Validation accuracy: did they improve or get worse?

# Deploy and Test the model

## Choose the model you want to use from the training session and load it to a variable. This should be the model from the last training session and you can use .model_data to load it

In [None]:
# {variable Name} = {ModelName}.model_data

## Tell SageMaker which docker container image to use

In [None]:
## https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateModel.html


## To host your model, you create an endpoint configuration with the CreateEndpointConfig API

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateEndpointConfig.html



## and then create an endpoint with the CreateEndpoint API

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateEndpoint.html



## Once the Endpoint has been deployed make an inference

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html

## Don't forget to delete your model, endpoint configuration and endpoint to preserve your budget

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints-delete-resources.html