## Environment setup

In [None]:
# Install the packages listed in requirements.txt using the %pip line magic.
%pip install -r requirements.txt

In [None]:
import os

# Point the SageMaker Python SDK at the current working directory as the
# location of its user config file (read via SAGEMAKER_USER_CONFIG_OVERRIDE).
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

In [None]:
import pandas
import xgboost
import sklearn

# Report the versions of the core libraries, one per line.
print(
    f"Pandas version: {pandas.__version__}",
    f"XGBoost version: {xgboost.__version__}",
    f"SKLearn version: {sklearn.__version__}",
    sep="\n",
)

Install the seaborn data visualization library.

In [None]:
# Install seaborn, which is used further down for the pair plot.
# NOTE(review): installed unpinned and separately from requirements.txt —
# consider pinning a version there for reproducibility.
%pip install seaborn

Download the dataset from the UCI website.

In [None]:
import os
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded, so `urllib.request.urlretrieve` could fail.
import urllib.request

# Local directory and file name under which the raw dataset is stored.
input_data_dir = 'data/'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(input_data_dir, exist_ok=True)
input_data_path = os.path.join(input_data_dir, 'predictive_maintenance_raw_data_header.csv')

# Download the CSV from the UCI repository to input_data_path.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv"
urllib.request.urlretrieve(dataset_url, input_data_path)

## Exploratory data analysis

Determine the number of samples (rows) and features (columns) in the dataset.

In [None]:
import pandas as pd
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(input_data_path)

# df.shape is (number of rows, number of columns).
print('The shape of the dataset is:', df.shape)

Preview the first 10 rows.

In [None]:
# Display the first 10 rows of the raw dataset.
df.head(10)

Check the data types for each column and identify columns with missing values.

In [None]:
# info() prints each column's dtype and non-null count, which is what reveals
# the data types and any columns with missing values.
df.info()

# Summary statistics of the numeric columns (displayed as the cell output).
df.describe()

List the possible values of the "Machine failure" column and how often each occurs across the entire dataset.

In [None]:
# Count how many rows carry each distinct value of the 'Machine failure' column.
df['Machine failure'].value_counts()

Plot the target column to visualize the distribution of its values.

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-value counts of the 'Machine failure' column.
df['Machine failure'].value_counts().plot.bar()
plt.show()

In [None]:
# Compute the number of distinct values in each column of df.
df.nunique()

Take a 10% random sample of the dataset, drop the attributes you are not interested in, and keep only the numeric attributes.

In [None]:
# Take a 10% random sample of the rows. A fixed random_state makes the sample
# (and everything plotted from it) identical on every run.
df1 = df.sample(frac=0.1, random_state=0)
# Drop the row identifier and the individual failure-mode flags, then keep
# only the numeric columns.
df1 = df1.drop(['UDI', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1).select_dtypes(include='number')
df1.head()

View the summary of the pre-processed dataset.

In [None]:
# Print the column dtypes and non-null counts of the sampled, numeric-only frame.
df1.info()

Use a pair plot to spot correlations.

In [None]:
import seaborn
import matplotlib.pyplot as plt

# Pairwise plots of the columns in df1, colored by the 'Machine failure' label.
# corner=True draws only the lower triangle of the plot grid.
seaborn.pairplot(df1, hue='Machine failure', corner=True)
plt.show()

## Feature Engineering

In [None]:
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

from sagemaker.remote_function import remote

@remote(keep_alive_period_in_seconds=3600, job_name_prefix="amzn-sm-btd-preprocess")
def preprocess(df):
    """Split the dataset, balance the training split, and fit the featurizer.

    The data is first split into training, validation, and test sets. Only the
    training split is then oversampled, so that no synthetic sample derived
    from validation or test rows can leak into model selection or evaluation.
    The validation and test sets therefore keep the original class balance.

    Args:
        df: DataFrame containing at least the columns 'Type',
            'Air temperature [K]', 'Process temperature [K]',
            'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]' and
            'Machine failure'.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model)``:
        ``X_train`` / ``X_val`` are the transformed feature matrices,
        ``y_train`` / ``y_val`` are 1-D label arrays, ``X_test`` / ``y_test``
        are the untransformed test features and labels, and
        ``featurizer_model`` is the fitted ColumnTransformer.
    """
    # Local import keeps this block self-contained; imblearn is already a
    # dependency of the notebook (SMOTE is imported from the same module).
    from imblearn.over_sampling import SMOTENC

    columns = ['Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Machine failure']
    cat_columns = ['Type']
    num_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
    target_column = 'Machine failure'

    df = df[columns]

    training_ratio = 0.8
    validation_ratio = 0.1
    test_ratio = 0.1

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    print(f'Splitting data training ({training_ratio}), validation ({validation_ratio}), and test ({test_ratio}) sets ')

    # Split BEFORE oversampling so validation and test contain only real rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=0, stratify=y)
    # The second split carves the validation share out of the remaining data.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_ratio/(validation_ratio+training_ratio), random_state=2, stratify=y_train)

    # Oversample the minority class on the training split only. SMOTENC is the
    # SMOTE variant for mixed numeric/categorical data: 'Type' is categorical
    # and cannot be interpolated like a number. random_state makes the
    # synthetic samples reproducible.
    cat_indices = [X_train.columns.get_loc(col) for col in cat_columns]
    sm = SMOTENC(categorical_features=cat_indices, sampling_strategy=1.0, k_neighbors=3, random_state=0)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Apply transformations: the featurizer is fitted on the training split
    # only and then applied to the validation split.
    transformer = ColumnTransformer(transformers=[('numeric', StandardScaler(), num_columns),
                                                  ('categorical', OneHotEncoder(), cat_columns)],
                                    remainder='passthrough')
    featurizer_model = transformer.fit(X_train)
    X_train = featurizer_model.transform(X_train)
    X_val = featurizer_model.transform(X_val)

    # X_test is intentionally returned untransformed; the notebook's test()
    # function applies featurizer_model to it.
    print(f'Shape of train features after preprocessing: {X_train.shape}')
    print(f'Shape of validation features after preprocessing: {X_val.shape}')
    print(f'Shape of test features after preprocessing: {X_test.shape}')

    # Flatten the label Series into 1-D numpy arrays.
    y_train = y_train.values.reshape(-1)
    y_val = y_val.values.reshape(-1)

    print(f'Shape of train labels after preprocessing: {y_train.shape}')
    print(f'Shape of validation labels after preprocessing: {y_val.shape}')
    print(f'Shape of test labels after preprocessing: {y_test.shape}')

    # Persist the fitted featurizer under /opt/ml/model.
    model_file_path="/opt/ml/model/sklearn_model.joblib"
    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
    joblib.dump(featurizer_model, model_file_path)

    return X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model

The function returns multiple values, including the training, validation, and test features and labels, and the featurizer model.

In [None]:
# Run preprocess (decorated with @remote) on the raw DataFrame and unpack the
# seven values it returns.
X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model = preprocess(df)

Analyze the featurizer model structure.

In [None]:
# Display the fitted featurizer returned by preprocess().
featurizer_model

In [None]:
import pandas as pd
# Wrap the transformed training features in a DataFrame and show the first 10 rows.
pd.DataFrame(X_train).head(10)

## Model Training

In [None]:
import os
import xgboost
import numpy as np

from sagemaker.remote_function import remote

@remote(keep_alive_period_in_seconds=3600, job_name_prefix="amzn-sm-btd-train")
def train(X_train, y_train, X_val, y_val,
          eta=0.1,
          max_depth=2,
          gamma=0.0,
          min_child_weight=1,
          verbosity=0,
          objective='binary:logistic',
          eval_metric='auc',
          num_boost_round=5):
    """Train an XGBoost booster and report metrics on the validation set.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        eta, max_depth, gamma, min_child_weight, verbosity, objective,
        eval_metric: Passed to ``xgboost.train`` as booster parameters.
        num_boost_round: Number of boosting rounds.

    Returns:
        The trained booster. The model is also saved to
        ``/opt/ml/model/xgboost_model.bin``.

    Note:
        Relies on ``pd``, ``accuracy_score``, ``precision_score`` and
        ``recall_score`` already being available in the notebook namespace.
    """
    print('Train features shape: {}'.format(X_train.shape))
    print('Train labels shape: {}'.format(y_train.shape))
    print('Validation features shape: {}'.format(X_val.shape))
    print('Validation labels shape: {}'.format(y_val.shape))

    # Wrap both splits in DMatrix objects; both are evaluated every round.
    train_matrix = xgboost.DMatrix(X_train, label=y_train)
    val_matrix = xgboost.DMatrix(X_val, label=y_val)
    eval_sets = [(train_matrix, "train"), (val_matrix, "validation")]

    print('')
    print(f'===Starting training with max_depth {max_depth}===')

    booster_params = {
        "max_depth": max_depth,
        "eta": eta,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "verbosity": verbosity,
        "objective": objective,
        "eval_metric": eval_metric
    }

    trained_booster = xgboost.train(
        params=booster_params,
        dtrain=train_matrix,
        evals=eval_sets,
        num_boost_round=num_boost_round,
    )

    # Raw scores for the validation set, plus their rounded 0/1 form used by
    # the confusion table and the threshold-based metrics.
    val_scores = trained_booster.predict(val_matrix)
    val_labels_pred = np.round(val_scores)

    print("Metrics for validation set")
    print('')
    confusion = pd.crosstab(index=y_val, columns=val_labels_pred,
                            rownames=['Actuals'], colnames=['Predictions'], margins=True)
    print(confusion)
    print('')

    val_accuracy = accuracy_score(y_val, val_labels_pred)
    val_precision = precision_score(y_val, val_labels_pred)
    val_recall = recall_score(y_val, val_labels_pred)

    print("Accuracy Model A: %.2f%%" % (val_accuracy * 100.0))
    print("Precision Model A: %.2f" % (val_precision))
    print("Recall Model A: %.2f" % (val_recall))

    from sklearn.metrics import roc_auc_score

    # AUC is computed from the raw scores, not the rounded labels.
    val_auc = roc_auc_score(y_val, val_scores)
    print("Validation AUC A: %.2f" % (val_auc))

    # Persist the booster under /opt/ml/model.
    model_file_path = "/opt/ml/model/xgboost_model.bin"
    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
    trained_booster.save_model(model_file_path)

    return trained_booster

Running the training function initiates a SageMaker training job because the function is decorated with the `@remote` decorator.

In [None]:
# Hyperparameters for this run; they override the defaults declared by train().
eta=0.3
max_depth=8

# Train on the preprocessed training split and evaluate on the validation split.
booster = train(X_train, y_train, X_val, y_val,
              eta=eta, 
              max_depth=max_depth)

Display the information about the trained model.

In [None]:
# Display the booster object returned by train().
booster

### Using the models to generate predictions

In [None]:
#@remote(keep_alive_period_in_seconds=600, job_name_prefix="amzn-sm-btd-test")
def test(featurizer_model, booster, X_test, y_test):
    """Evaluate the trained booster on the test set and print the metrics.

    Args:
        featurizer_model: Fitted transformer applied to ``X_test`` before scoring.
        booster: Trained model exposing ``predict`` on an ``xgboost.DMatrix``.
        X_test: Untransformed test features.
        y_test: Test labels; must expose ``.values`` (e.g. a pandas Series).

    Returns:
        None. The confusion table, accuracy, precision, recall and AUC are printed.
    """
    # Featurize the raw test rows and flatten the labels to a 1-D array.
    features = featurizer_model.transform(X_test)
    labels = y_test.values.reshape(-1)

    test_matrix = xgboost.DMatrix(features, label=labels)
    scores = booster.predict(test_matrix)
    # Rounded scores serve as the predicted 0/1 labels.
    predicted_labels = np.round(scores)

    print("===Metrics for Test Set===")
    print('')
    confusion = pd.crosstab(index=labels, columns=predicted_labels,
                            rownames=['Actuals'],
                            colnames=['Predictions'],
                            margins=True)
    print(confusion)
    print('')

    accuracy = accuracy_score(labels, predicted_labels)
    precision = precision_score(labels, predicted_labels)
    recall = recall_score(labels, predicted_labels)
    print('')

    print("Accuracy Model A: %.2f%%" % (accuracy * 100.0))
    print("Precision Model A: %.2f" % (precision))
    print("Recall Model A: %.2f" % (recall))

    from sklearn.metrics import roc_auc_score

    # AUC is computed from the raw scores, not the rounded labels.
    auc = roc_auc_score(labels, scores)
    print("AUC A: %.2f" % (auc))

Test the trained model using the test features and labels.

In [None]:
# Evaluate the trained booster on the test split; test() applies the
# featurizer to X_test itself before scoring.
test(featurizer_model, booster, X_test, y_test)