In [10]:
from datetime import datetime, timedelta
from pathlib import Path
from typing import Tuple, Union

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, GroupKFold
from xgboost import XGBRegressor

import helper_functions as helper
# Direct every MLflow tracking call made from this notebook to this tracking server
mlflow.set_tracking_uri("http://mlops-mlflow-load-balancer-1002257987.us-east-1.elb.amazonaws.com")


In [16]:
def evaluate_model(model, x_train, y_train, x_test, y_test, run):
    """ Evaluates model on both training and testing data

        MSE, RMSE, R2 and MAE are computed for both datasets and printed. Only the test metrics
        and the residual plot of the test predictions are stored in MLflow, both under the run
        given by ``run``.

        :argument: model - Trained model exposing ``predict``
        :argument: x_train - Features of the training dataset
        :argument: y_train - Target of the training dataset
        :argument: x_test - Features of the test dataset
        :argument: y_test - Target of the test dataset
        :argument: run - MLflow run that receives the test metrics and the residual plot
        :return: Tuple (train_metrics, test_metrics) of dictionaries with the key 'data' plus the
                 'MSE', 'RMSE', 'R2' and 'MAE' values rounded to two decimals
    """
    mf_client = mlflow.tracking.MlflowClient()
    run_id = run.info.run_id

    def _get_metrics(pred, target, metrics_dict: dict) -> dict:
        """ Adds the rounded regression metrics to metrics_dict and returns it """
        mse = mean_squared_error(target, pred)
        scores = {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(target, pred),
            'MAE': mean_absolute_error(target, pred),
        }
        for metric_name, value in scores.items():
            # Built-in round() on a plain float works whether the metric function hands back a
            # NumPy scalar or a Python float (only the former has a .round() method)
            metrics_dict[metric_name] = round(float(value), 2)
        return metrics_dict

    # Predict for both datasets
    train_predict = model.predict(x_train)
    test_predict = model.predict(x_test)
    # Calculate metrics, the 'data' entry labels which dataset a dictionary belongs to
    train_metrics = _get_metrics(train_predict, y_train, {'data': 'training'})
    test_metrics = _get_metrics(test_predict, y_test, {'data': 'test'})
    # Plot the residuals error
    figure = helper.plot_residuals(test_predict, y_test)
    print(train_metrics)
    print(test_metrics)
    # Save the test metrics to MLflow. They are written to the explicit run id, the same target
    # as the plot below, so both end up in one run even when `run` is not the active run.
    for metric_name, value in test_metrics.items():
        if metric_name == "data":
            # The dataset label is not a numeric metric
            continue
        mf_client.log_metric(run_id, metric_name, value)
    # Save the residuals plot in MLflow Artifacts
    mf_client.log_figure(run_id, figure, "Plots/residual_plot.png")
    return train_metrics, test_metrics

def define_ml_dataset(train_data: pd.DataFrame, test_data: pd.DataFrame, test_target: pd.DataFrame, 
                      features: list, run):
    """ Builds the feature and target sets for training and testing and stores the list of used
        features in MLflow as 'Features/features.json'
        :argument: train_data - Pandas DataFrame as training dataset, must contain the target column 'rul'
        :argument: test_data - Pandas DataFrame as test dataset, must contain the column 'unit'
        :argument: test_target - Pandas DataFrame with the test target in the column 'rul'
        :argument: features - List containing name of columns to use as features
        :argument: run - MLflow run under which the feature list is stored
        :return: Tuple (x_train, y_train, x_test, y_test)
    """
    client = mlflow.tracking.MlflowClient()
    # Training features and label
    x_train, y_train = train_data[features], train_data['rul']
    # Reduce the test data to one record per unit/engine, labels exist only for the end of each unit.
    # NOTE(review): groupby(...).last() picks the last non-null value of every column, which is not
    # the literal last row when that row has missing values - confirm the test data has no gaps.
    last_per_unit = test_data.groupby('unit').last().reset_index()
    # NOTE(review): these rows follow the sorted 'unit' order; presumably test_target uses the same
    # order - verify against the data loader.
    x_test, y_test = last_per_unit[features], test_target['rul']
    # Keep track of the features behind this run
    client.log_dict(run.info.run_id, {'Features': features}, 'Features/features.json')
    return x_train, y_train, x_test, y_test

def train_baseline(x_train: pd.DataFrame, y_train: pd.Series):
    """ Fits a Linear Regression model as a baseline and logs it to MLflow as "LinearRegression"
        :argument: x_train - Pandas DataFrame with the training features
        :argument: y_train - Pandas Series with the training target
        :return: The fitted LinearRegression model
    """
    # fit() hands back the estimator itself, so creating and training chain into one expression
    baseline_model = LinearRegression().fit(x_train, y_train)
    # Store the fitted model in MLflow
    mlflow.sklearn.log_model(baseline_model, "LinearRegression")
    return baseline_model

def train_xgboost(x_train: pd.DataFrame, y_train: pd.Series, parameters: dict) -> Tuple[XGBRegressor, pd.DataFrame]:
    """ Trains an XGBoost regressor, tuning its parameters with a grouped, cross-validated GridSearch
        :argument: x_train - Pandas DataFrame as training dataset, does not contain target RUL; must
                   contain the column 'unit', which defines the groups of the cross-validation folds
        :argument: y_train - Pandas Series, contain target RUL
        :argument: parameters - Dictionary with XGBoostRegressor parameters to perform GridSearch over
        :return: Tuple (best_model, grid_results) - the best estimator found by the GridSearch and a
                 Pandas DataFrame built from the GridSearch ``cv_results_``
    """
    # Instantiate the model object
    model = XGBRegressor(objective="reg:squarederror", random_state=123, booster='gbtree')
    # Define the group data folding with column 'unit', rows of one unit stay within the same fold
    group_fold = GroupKFold(n_splits=3)
    # Define the GridSearchCV
    # NOTE(review): 'unit' is part of x_train, so it also reaches the model as a feature - confirm
    # this is intended.
    grid_search = GridSearchCV(model, param_grid=parameters, n_jobs=-1,
                               cv=group_fold.split(x_train, groups=x_train['unit']),
                               verbose=3, error_score='raise', scoring='neg_mean_squared_error')
    # Train the model with GridSearch
    grid_search.fit(x_train, y_train)
    # Grid Search results
    grid_results = pd.DataFrame(grid_search.cv_results_)
    # Get best model from GridSearch
    best_model = grid_search.best_estimator_
    # The scoring is the negated MSE, abs() turns the best score back into a plain MSE
    best_score = abs(grid_search.best_score_)
    print(f'Best model score: \nMSE: {best_score.round(2)}\nRMSE: {np.sqrt(best_score).round(2)}')
    print(f'Best model parameters: \n{grid_search.best_params_}')
    return best_model, grid_results
    

In [17]:
# Load the raw train data, raw test data and the test target for the file 'train_FD004.txt'
raw_train_data, raw_test_data, test_target = helper.load_raw_data('train_FD004.txt')
# Add the target to the training data (presumably the 'rul' column read during training - confirm in helper_functions)
train_data = helper.create_target(raw_train_data)
# Round the condition column values and create new condition column
train_data = helper.round_conditions(train_data)
test_data = helper.round_conditions(raw_test_data)
# ============= FEATURE ENGINEERING ===============
# Standardize the data grouped with condition
# NOTE(review): train and test data are standardized by separate calls; presumably each call uses the
# statistics of the frame it receives - confirm the test data should not reuse the training statistics.
standardized_data = helper.standardize(train_data)
standardized_test_data = helper.standardize(test_data)
# Get statistics for all sensors for every condition group (not used by the cells visible here)
conditioned_stats = helper.get_condition_stats(train_data)
# Perform data smoothing on sensor columns with moving average
smoothed_data = helper.smooth_data(standardized_data, window=10)
smoothed_test_data = helper.smooth_data(standardized_test_data, window=10)

In [11]:
# Select the experiment for the following runs; MLflow creates it when it does not exist yet
mlflow.set_experiment("Predictive-Maintenance")

2022/01/20 15:52:30 INFO mlflow.tracking.fluent: Experiment with name 'Predictive-Maintenance' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-artifacts-bucket/1', experiment_id='1', lifecycle_stage='active', name='Predictive-Maintenance', tags={}>

In [14]:
# Baseline: unit id, the three operating settings and all 21 sensors
baseline_features = ['unit', 'altitude', 'mach', 'tra'] + [f'sensor_{number}' for number in range(1, 22)]
# Selection: same leading columns, but without the sensors 1, 5, 16, 18 and 19
selected_features = ['unit', 'altitude', 'mach', 'tra'] + [
    f'sensor_{number}' for number in range(1, 22) if number not in (1, 5, 16, 18, 19)
]

In [18]:
# Timestamp suffix to tell the baseline runs apart by name
current_time = datetime.now().strftime('%Y-%m-%d %H-%M-%S')
# NOTE: the experiment stores its artifacts on S3 (see the output of set_experiment), so the steps
# that log artifacts need AWS credentials - without them the cell fails with NoCredentialsError.
with mlflow.start_run(run_name=f'Baseline-{current_time}') as run:
    # The baseline uses train_data/test_data with all baseline features, not the standardized or smoothed frames
    x_train, y_train, x_test, y_test = define_ml_dataset(train_data, test_data, test_target, baseline_features, run)
    # Fit the Linear Regression baseline and log the model
    baseline = train_baseline(x_train, y_train)
    # Compute the metrics for both datasets, log the test metrics and the residual plot to this run
    train_metrics, test_metrics = evaluate_model(baseline, x_train, y_train, x_test, y_test, run)

NoCredentialsError: Unable to locate credentials