# Module 3 Companion Notebook

This Jupyter notebook is the companion notebook for Module 3 of the AWS Academy Machine Learning Foundations course. The following cells contain the code from the presentation.

## Dataset attributions

This notebook uses the following datasets: 

- [Wine Quality Data Set](https://archive.ics.uci.edu/ml/datasets/Wine+Quality)
- [Car Evaluation Data Set](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation)

These datasets are from:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

## pandas

### Reformats data into a tabular representation (DataFrame)
### Converts common formats like comma-separated values (CSV), JavaScript Object Notation (JSON), Excel, Pickle, and others

In [None]:
import pandas as pd
# Red-wine CSV from the UCI repository; the fields are separated by semicolons.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# Pass the delimiter by keyword: pandas 2.0 made every read_csv argument
# after the path keyword-only, so read_csv(url, ';') raises a TypeError there.
df_wine = pd.read_csv(url, sep=';')


## pandas DataFrame

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df_wine.shape

In [None]:
# Preview the first rows (head() defaults to five)
df_wine.head()

## Index and column names

In [None]:
# Column labels
df_wine.columns

In [None]:
# Row labels (the index)
df_wine.index

## DataFrame schema

In [None]:
# Data type of each column
df_wine.dtypes

In [None]:
# Printed summary: index range, per-column non-null counts and dtypes, memory usage
df_wine.info()

## Statistical characteristics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_wine.describe()

## Categorical statistics identify frequency of values and class imbalances

In [None]:
# Car data file from the UCI repository; column labels are supplied through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
# Pass the delimiter by keyword: pandas 2.0 made every read_csv argument
# after the path keyword-only, so a positional ',' raises a TypeError there.
df_car = pd.read_csv(
    url,
    sep=',',
    names=['buying','maint','doors','persons','lug_boot','safety','class'],
)


In [None]:
# Preview the first five rows
df_car.head(5)

In [None]:
# For non-numeric columns describe() reports count, unique, top, and freq
# (presumably every df_car column is categorical text — verify with df_car.dtypes)
df_car.describe()

## Plotting attribute statistics

In [None]:
import matplotlib.pyplot as plt
# Histogram of the 'sulphates' column, grouped into 10 bins
df_wine['sulphates'].hist(bins=10)
plt.show()

In [None]:
# Kernel density estimate of the 'sulphates' column
df_wine['sulphates'].plot.kde()
plt.show()

In [None]:
# Box plot of the 'sulphates' column
df_wine['sulphates'].plot.box()
plt.show()

## Plotting multivariate statistics

In [None]:
# Two-variable scatter plot: alcohol on the x axis, sulphates on the y axis
df_wine.plot.scatter(x='alcohol', y='sulphates')
plt.show()

In [None]:
# Pairwise scatter plots for three selected attributes
pd.plotting.scatter_matrix(df_wine[['citric acid', 'alcohol', 'sulphates']])
plt.show()

## Scatter plot with identification

In [None]:
# Split the samples on the quality score so each group gets its own marker.
# .loc selects rows and columns in a single step instead of chained indexing.
high = df_wine.loc[df_wine['quality'] > 5, ['sulphates', 'alcohol']]
low = df_wine.loc[df_wine['quality'] <= 5, ['sulphates', 'alcohol']]

plt.scatter(high['sulphates'],high['alcohol'],s=50,c='blue',marker='o',label='great')
plt.scatter(x=low['sulphates'],y=low['alcohol'],s=50,c='red',marker='v',label='poor')

# The label= values above are only rendered once a legend is drawn.
plt.xlabel('sulphates')
plt.ylabel('alcohol')
plt.legend()
plt.show()

## Correlation matrix

In [None]:
# Pairwise correlation between all columns
corr_matrix = df_wine.corr()
# Correlation of every column with 'quality', strongest positive first
corr_matrix["quality"].sort_values(ascending=False)

## Correlation matrix heatmap

In [None]:
import seaborn as sns
# Hard-coded y-axis tick labels; assumes df_wine's columns are still in this order
column_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

fig, ax = plt.subplots(figsize=(10, 10))
correlations = df_wine.corr()
colormap = sns.color_palette("BrBG", 10)

# Draw on the axes created above and annotate every cell to two decimals
sns.heatmap(correlations, ax=ax, cmap=colormap, annot=True, fmt=".2f")
ax.set_yticklabels(column_names)

plt.show()


## Imputing missing data

In [None]:
from sklearn.impute import SimpleImputer

import numpy as np
# Small 3x4 example matrix; the None entries stand in for missing values
# NOTE(review): mixing None with ints presumably yields an object-dtype array — np.nan would be the more conventional marker
arr = np.array([[5,3,2,2],[3,None,1,9],[5,2,7,None]])
print(arr)

In [None]:
# Replace each missing entry with the mean of its column
imputer = SimpleImputer(strategy='mean')
imp = imputer.fit(arr)  # fit() hands back the fitted imputer itself
imp.transform(arr)

## Training a model

In [None]:
# Shift the quality scores 3..8 onto zero-based class labels 0..5.
# Gotcha: map() yields NaN for any value missing from the mapping, so running
# this cell a second time corrupts the column — reload df_wine first.
df_wine['quality'] = df_wine['quality'].map(
    {score: score - 3 for score in range(3, 9)}
)

In [None]:
# Rotate the last column to the front (presumably 'quality', the target).
# Gotcha: every additional run of this cell rotates the columns once more.
cols = list(df_wine.columns)
cols.insert(0, cols.pop())
df_wine = df_wine[cols]
df_wine.head()

In [None]:
# Show six decimal places when displaying floats.
# Use the fully qualified option name: the bare 'precision' pattern also matches
# 'styler.format.precision' in pandas >= 1.4 and raises an OptionError there.
pd.set_option('display.precision', 6)

In [None]:
# Confirm the row and column counts after the column reorder
df_wine.shape

In [None]:
# Inspect the first 20 rows
df_wine.head(20)

## Splitting the data 80/10/10 into training, test, and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 80% for training and hold out 20%; stratifying on 'quality' preserves
# the class proportions, and the fixed seed makes the split repeatable.
train, test_and_validate = train_test_split(
    df_wine,
    test_size=0.2,
    random_state=42,
    stratify=df_wine['quality'],
)

In [None]:
# Halve the held-out 20% into equally sized test and validation sets,
# again stratified on 'quality' with the same seed.
test, validate = train_test_split(
    test_and_validate,
    test_size=0.5,
    random_state=42,
    stratify=test_and_validate['quality'],
)

In [None]:
# One (rows, columns) tuple per line: train, test, validate
print(train.shape, test.shape, validate.shape, sep='\n')

In [None]:
# Class frequencies in each split, side by side, to check the stratification
t1 = train['quality'].value_counts()
t2 = test['quality'].value_counts()
t3 = validate['quality'].value_counts()
# keys= names the columns; without it all three carry the same label
result = pd.concat([t1, t2, t3], axis=1, sort=False,
                   keys=['train', 'test', 'validate'])
result

## Uploading to Amazon S3

In [None]:
import boto3
import io
import os

In [None]:
# Target bucket for all uploads
# NOTE(review): this looks like a lab-specific sandbox bucket name — replace it with your own bucket
bucket='c45317a617679l1523854t1w00381652629-sandboxbucket-3apxi73oxsw6'
# Key prefix under which every object in this notebook is stored
prefix='wine'
# Object names for the three splits
train_file='wine_train.csv'
test_file='wine_test.csv'
validate_file='wine_validate.csv'
# Not referenced by any other cell visible in this notebook
whole_file='wine.csv'
# boto3 resource handle used by upload_s3_csv
s3_resource = boto3.Session().resource('s3')

def upload_s3_csv(filename, folder, dataframe):
    """Serialize ``dataframe`` to CSV in memory and upload it to Amazon S3.

    The object is written to the module-level ``bucket`` under the key
    ``<prefix>/<folder>/<filename>`` through the module-level ``s3_resource``.
    The CSV is written without a header row and without the index column.

    Args:
        filename: Object name (last component of the key).
        folder: Key component placed between ``prefix`` and ``filename``.
        dataframe: pandas DataFrame to serialize.
    """
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    # S3 keys always use "/" as the separator; os.path.join would insert
    # backslashes when the notebook runs on Windows.
    key = "/".join((prefix, folder, filename))
    s3_resource.Bucket(bucket).Object(key).put(Body=csv_buffer.getvalue())

# Upload each split into its own folder under the prefix: train/, test/, validate/
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

## Creating the estimator

In [None]:
from sagemaker.image_uris import retrieve
import sagemaker
# Execution role of this notebook, handed to the estimator below
role = sagemaker.get_execution_role()
# Estimator output location under the same bucket and prefix
s3_output_location = "s3://{}/{}/output/".format(bucket, prefix)
# XGBoost container image (version 1.0-1) for the session's region
container = retrieve('xgboost', boto3.Session().region_name, '1.0-1')

# num_class is 6 because the quality column was remapped to labels 0..5
hyperparams = dict(
    num_round="40",
    num_class="6",
    objective="multi:softmax",
)

xgb_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

## Creating the input channels

In [None]:
# Each channel points at a folder prefix rather than a single object, so the
# file name is not part of the URI (str.format silently ignores extra
# arguments, which previously hid that the file name was never used).
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket, prefix),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket, prefix),
    content_type='text/csv')

# Channel names handed to fit()
data_channels = {'train': train_channel, 'validation': validate_channel}

## Training the model

In [None]:
# Launch the training job on the two channels; logs=False turns off log output for this call
xgb_model.fit(inputs=data_channels, logs=False)

## Creating an endpoint

In [None]:
# Deploy the trained model; requests are serialized as CSV
# NOTE(review): no visible cell uses xgb_predictor or deletes the endpoint — confirm it is cleaned up elsewhere
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                serializer = sagemaker.serializers.CSVSerializer(),
                instance_type='ml.m4.xlarge')


## Viewing the metrics from the training job

In [None]:
# Fetch the recorded train and validation error metrics for the training job
# NOTE(review): _current_job_name is a private attribute of the estimator — consider a public accessor
s=sagemaker.analytics.TrainingJobAnalytics(xgb_model._current_job_name, 
                                         metric_names = ['train:merror', 
                                                         'validation:merror']
                                        )

s_df=s.dataframe()
# Keep only the second and third columns (presumably metric name and value — verify)
s_df = s_df.iloc[:,1:3]
s_df
# merror = (number of wrong cases) / (number of all cases)

## Loading the test data into Amazon S3, without the target column


In [None]:
# Everything except the first column, which holds the target
batch_X_file = 'batch-in.csv'
batch_X = test.iloc[:, 1:]
upload_s3_csv(batch_X_file, 'batch-in', batch_X)


## Performing a batch transform on the test data

In [None]:
# S3 locations for the transform results and for the file uploaded in the previous cell
batch_output = "s3://{}/{}/batch-out/".format(bucket,prefix)
batch_input = "s3://{}/{}/batch-in/{}".format(bucket,prefix,batch_X_file)

# Transformer built from the trained model; results are written under batch_output
xgb_transformer = xgb_model.transformer(instance_count=1,
                                       instance_type='ml.m5.xlarge',
                                       strategy='MultiRecord',
                                       assemble_with='Line',
                                       output_path=batch_output)

# Submit the CSV input, split by line
xgb_transformer.transform(data=batch_input,
                         data_type='S3Prefix',
                         content_type='text/csv',
                         split_type='Line')
# Block until the transform job finishes
xgb_transformer.wait()

## Downloading the results from Amazon S3

In [None]:
s3 = boto3.client('s3')
# The result object is named after the input file with '.out' appended;
# derive the key from batch_X_file so the two cannot drift apart.
obj = s3.get_object(Bucket=bucket,
                    Key="{}/batch-out/{}.out".format(prefix, batch_X_file))
# Pass the delimiter by keyword: pandas 2.0 made every read_csv argument
# after the buffer keyword-only.
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()),
                               sep=',',
                               names=['target'])

## Generating a confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# True labels: the first column of the test split
test_labels = test.iloc[:,0]
# Rows are true classes and columns are predicted classes
matrix = confusion_matrix(test_labels, target_predicted)
df_confusion = pd.DataFrame(matrix) 
df_confusion.head()

## Generating a confusion matrix heatmap

In [None]:
# Annotated heatmap of the confusion matrix, drawn without a colour bar
colormap = sns.color_palette("BrBG", 10)
sns.heatmap(df_confusion, annot=True, cbar=None, cmap=colormap)
plt.title("Confusion Matrix")
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
# Run tight_layout last so it accounts for the title and axis labels set above
plt.tight_layout()
plt.show()

## Creating a hyperparameter tuning job

In [None]:
from sagemaker.parameter import (
    CategoricalParameter,
    ContinuousParameter,
    IntegerParameter,
    ParameterRange,
)
from sagemaker.amazon.hyperparameter import Hyperparameter
from sagemaker.tuner import HyperparameterTuner

# NOTE(review): container is not referenced again in this cell; the tuner reuses xgb_model
container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')

# Search space for the tuning job
hyperparameter_ranges = {'alpha': ContinuousParameter(0, 1000),
                         'eta': ContinuousParameter(0.1, 0.5),
                         'min_child_weight': ContinuousParameter(1, 120),
                         'subsample': ContinuousParameter(0.5, 1),
                         'num_round': IntegerParameter(1,4000)}

# Minimize the validation metric (merror)
objective_metric_name = 'validation:merror'
objective_type = 'Minimize'

# Up to 30 training jobs, run one at a time
tuner = HyperparameterTuner(xgb_model,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=30,
                            max_parallel_jobs=1,
                            objective_type=objective_type)

# Start tuning on the same channels used for training, then block until it finishes
tuner.fit(inputs=data_channels, include_cls_metadata=False)
tuner.wait()

In [None]:
#import time
#tuning_job_name = tuner.latest_tuning_job.job_name
#job_run_status = smc.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)['HyperParameterTuningJobStatus']
#print (job_run_status)

#while job_run_status not in ('Failed', 'Completed', 'Stopped'):
#    job_run_status = smc.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)['HyperParameterTuningJobStatus']
#    print (job_run_status)
#    time.sleep(15)