In the previous notebook we identified, analyzed and resolved the inconsistencies in our dataset. For machine learning models it is sometimes necessary to create new features in order to model the problem. This notebook aims to do the necessary feature engineering.

# Imports

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import zscore
from xgboost.sklearn import XGBRegressor

from IPython.display import Image

# Set-up

In [2]:
# Move from the notebook's folder up to the project root so that the relative
# paths used below (e.g. './data/...') resolve.
# The guard makes the cell idempotent: once './data' is visible we are already at
# the root, so re-running the cell must not climb two more levels.
# NOTE(review): assumes only the project root contains a 'data' directory - confirm.
if not os.path.isdir('./data'):
    os.chdir('../..')

In [3]:
# Notebook-wide settings.
# Silence every Python warning for the rest of the session (this also hides
# library deprecation / conversion warnings, so disable it when debugging).
warnings.filterwarnings("ignore")
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline
# Use seaborn's plain "white" theme for all subsequent plots.
sns.set_style("white")

# Get Data

In [7]:
# Load the cleaned concrete dataset produced by the previous notebook.
# The path is relative to the working directory set in the set-up cell.
df = pd.read_parquet(path='./data/1-bronze/Concrete_Data_Cleaned.parquet')

# Feature Engineering

In [8]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,272.9,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,7999
1,272.9,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,6189
2,332.5,142.5,0.0,185.0,0.0,932.0,594.0,28.0,4027
3,332.5,142.5,0.0,185.0,0.0,932.0,594.0,28.0,4105
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,28.0,4430


The strength is the target variable. Let's divide the dataset into independent variables (the features) and the dependent variable (the target), and then scale the features while taking care to avoid data leakage.

In [6]:
# List the column names to pick the feature and target columns from.
df.columns

Index(['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',
       'fineagg', 'age', 'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [None]:
# Splitting data into independent (feature) and dependent (target) variables.
feature_columns = [
    'cement', 'slag', 'ash', 'water',
    'superplastic', 'coarseagg', 'fineagg', 'age',
]
target_columns = ['strength']

# Both selections use a list of labels, so X and y are DataFrames
# (y is a single-column DataFrame, not a Series).
X = df[feature_columns]
y = df[target_columns]

In [None]:
# Persist the feature matrix and the target to the silver layer.
# - Written as CSV so the file content matches the '.csv' file names (the
#   previous version wrote Parquet bytes into files named '*.csv').
# - Paths are relative to the project root, which is the working directory after
#   the set-up cell (same convention as the './data/1-bronze/...' read above);
#   '../../data' would resolve outside the project.
X.to_csv('./data/2-silver/X.csv', index = False)
y.to_csv('./data/2-silver/Y.csv', index = False)

In order to scale our data, let us use the z score.

In [None]:
# Standardise every feature column with scipy's z-score.
Xscaled = X.apply(zscore)
# Re-wrap the scaled features using *all* of df's column labels.
# NOTE(review): df.columns also contains the target, so this presumably yields an
# empty (NaN) 'strength' column, which the next cell drops - confirm.
X_scaled_df = pd.DataFrame(Xscaled, columns = df.columns)

In [None]:
# Remove the target column from the scaled feature frame.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
X_scaled_df = X_scaled_df.drop(columns = 'strength', errors = 'ignore')

In [None]:
# Preview the scaled features (first rows only, instead of dumping the whole frame).
X_scaled_df.head()

# Building Different Models

In [None]:
# splitting in train and test data
# Split the *unscaled* features first and standardise afterwards, using statistics
# computed on the training rows only. Scaling the full dataset before the split
# lets information from the test rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

# ddof=0 (population standard deviation) matches the default of scipy.stats.zscore.
train_mean = X_train_raw.mean()
train_std = X_train_raw.std(ddof = 0)

# The test set is transformed with the *training* mean / std.
X_train = (X_train_raw - train_mean) / train_std
X_test = (X_test_raw - train_mean) / train_std

In [None]:
# Persist the train / test partitions to the gold layer.
# Paths are relative to the project root, which is the working directory after the
# set-up cell (same convention as the './data/1-bronze/...' read above);
# '../../data' would resolve outside the project.
X_train.to_csv('./data/3-gold/X_train.csv', index = False)
X_test.to_csv('./data/3-gold/X_test.csv', index = False)

y_train.to_csv('./data/3-gold/y_train.csv', index = False)
y_test.to_csv('./data/3-gold/y_test.csv', index = False)

# MLFlow Experiment - Functions

In [None]:
def train(model, X_train: pd.DataFrame, y_train: pd.DataFrame) -> None:
    '''
    Fit a scikit-learn style model and log its score on the training data.

    The model is fitted on ``(X_train, y_train)``; the value returned by
    ``model.score(X_train, y_train)`` is logged to MLflow under the key
    ``'train-accuracy'`` and printed as a percentage.

    NOTE(review): for scikit-learn regressors ``score`` is presumably R^2 rather
    than a classification accuracy; the metric key and message are kept unchanged
    so existing runs stay comparable.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``.
    X_train : training features.
    y_train : training target.

    Returns
    -------
    None

    Raises
    ------
    Any exception raised while fitting, scoring or logging propagates unchanged
    (the former ``try/except ... raise(e)`` wrapper did exactly that and was
    therefore removed).
    '''
    # Keep using the object returned by fit(), as the original code did.
    model = model.fit(X_train, y_train)
    train_accuracy = model.score(X_train, y_train)
    mlflow.log_metric('train-accuracy', train_accuracy)
    print(f'Train Accuracy: {train_accuracy: .3%}')

In [None]:
def evaluate(model, X_test: pd.DataFrame, y_test: pd.DataFrame) -> None:
    '''
    Score a fitted model on the test data, then log and print the results.

    Computes the R2 score and the mean squared error of ``model.predict(X_test)``
    against ``y_test``, logs them to MLflow under the keys ``'r2-score'`` and
    ``'mse'``, and prints both values with three decimals. Only metrics are
    logged here, even though the final message also mentions artifacts.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : test features.
    y_test : test target.

    Returns
    -------
    None
    '''
    predictions = model.predict(X_test)

    # Metric name -> value, in the order they are logged.
    scores = {
        'r2-score': metrics.r2_score(y_test, predictions),
        'mse': metrics.mean_squared_error(y_test, predictions),
    }
    for metric_name, metric_value in scores.items():
        mlflow.log_metric(metric_name, metric_value)

    separator = '-' * 30
    print(f"R2 Score: {scores['r2-score']:.3f}")
    print(separator)
    print(f"MSE: {scores['mse']:.3f}")

    print(separator)
    print('Metrics and artifacts logged!')


# Random Forest

In [None]:
# Random forest with default hyper-parameters.
# A fixed random_state makes the bootstrap / feature sampling, and therefore the
# logged metrics, reproducible across kernel restarts.
random_forest = RandomForestRegressor(random_state = 1)

In [None]:
# All runs in this notebook are grouped under this experiment.
mlflow.set_experiment('Concrete-Strength-Experiments')

# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "Random Forest"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=random_forest, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('n_estimators', random_forest.n_estimators)
    mlflow.log_param('max_depth', random_forest.max_depth)

    # Evaluate on the test set
    evaluate(model=random_forest, X_test=X_test, y_test=y_test)

    # Perform cross-validation
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(random_forest, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: ', accuracy)

    # Log the model
    mlflow.sklearn.log_model(random_forest, 'random-forest')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)


The model appears to be overfitting: for a model that generalizes well, the train and test scores should be close to each other.

# Gradient Boosting Regressor

In [None]:
# gradient boosting model
# A fixed random_state keeps the fitted model and its metrics reproducible.
gradient_boosting = GradientBoostingRegressor(random_state = 1)

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "Gradient Boosting"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=gradient_boosting, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('n_estimators', gradient_boosting.n_estimators)

    # Evaluate on the test set
    evaluate(model=gradient_boosting, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(gradient_boosting, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: ', accuracy)

    # Log the model
    mlflow.sklearn.log_model(gradient_boosting, 'gradient-boosting')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# Ada Boosting Regressor

In [None]:
# ada boosting model
# A fixed random_state keeps the fitted model and its metrics reproducible.
ada_boost = AdaBoostRegressor(random_state = 1)

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "Ada Boost"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=ada_boost, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('n_estimators', ada_boost.n_estimators)

    # Evaluate on the test set
    evaluate(model=ada_boost, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(ada_boost, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)

    print('cv accuracy: ', accuracy)

    # Log the model
    mlflow.sklearn.log_model(ada_boost, 'ada-boost')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# KNN Regressor

In [None]:
# checking for different values of neighbors to determine the best number
# For every K from 1 to 44, fit a KNN regressor on the training split and record
# its error on the test split.
# NOTE(review): K is being selected on the test split; consider selecting it with
# cross-validation on the training data instead.

diff_k = []
for i in range(1, 45):
    knn = KNeighborsRegressor(n_neighbors = i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # The target is continuous, so measure the error as RMSE. The previous
    # `pred_i != y_test` is a classification error rate, which is almost always
    # 1.0 for real-valued predictions and cannot discriminate between values of K.
    diff_k.append(np.sqrt(metrics.mean_squared_error(y_test, pred_i)))

Let us visualize the mean error obtained for each value of K:

In [None]:
# Plot the error recorded for each candidate K (1 to 44) as a dashed blue line
# with red markers.
fig, ax = plt.subplots(figsize = (12, 6))
ax.plot(
    range(1, 45),
    diff_k,
    color = 'blue',
    linestyle = 'dashed',
    marker = 'o',
    markerfacecolor = 'red',
    markersize = 10,
)
ax.set_title('Different K - Values', fontsize = 20)
ax.set_xlabel('K Values', fontsize = 15)
# The trailing semicolon suppresses the text repr of the returned label object.
ax.set_ylabel('Mean error', fontsize = 15);

In [None]:
# k = 3 is a better choice
# (K picked from the error-vs-K plot above; re-check it whenever the sweep is re-run)
KNN = KNeighborsRegressor(n_neighbors = 3)

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "KNN Regressor"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=KNN, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('n_neighbors', KNN.n_neighbors)

    # Evaluate on the test set
    evaluate(model=KNN, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    # KNN is distance based and therefore sensitive to feature scale, so it is
    # cross-validated on the standardised features, consistent with the scaled
    # data the model is trained and evaluated on above.
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(KNN, Xscaled, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)

    print('cv accuracy: ', accuracy)

    # Log the model
    mlflow.sklearn.log_model(KNN, 'KNN')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# Bagging Regressor

In [None]:
# bagging regressor model
# A fixed random_state keeps the bootstrap sampling, and therefore the logged
# metrics, reproducible.
bagging = BaggingRegressor(random_state = 1)

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "Bagging"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=bagging, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('n_estimators', bagging.n_estimators)

    # Evaluate on the test set
    evaluate(model=bagging, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    # Cross-validate the bagging model itself (the previous version evaluated
    # `ada_boost` here, so this run logged another model's CV score).
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(bagging, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: {:.3f}'.format(accuracy))

    # Log the model
    mlflow.sklearn.log_model(bagging, 'bagging')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# Support Vector Regressor

In [None]:
# support vector model
# The instance is bound to the name `SVR`, which shadows the imported class.
# Building it through the `svm` module keeps this cell re-runnable: with
# `SVR = SVR(...)` a second execution would try to call the instance and fail.
SVR = svm.SVR(kernel = 'linear')

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "svr"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=SVR, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param('kernel', SVR.kernel)

    # Evaluate on the test set
    evaluate(model=SVR, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    # Cross-validate the SVR itself (the previous version evaluated `ada_boost`
    # here, so this run logged another model's CV score). SVR is sensitive to
    # feature scale, so the standardised features are used, consistent with the
    # scaled data the model is trained and evaluated on above.
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(SVR, Xscaled, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: {:.3f}'.format(accuracy))

    # Log the model
    mlflow.sklearn.log_model(SVR, 'svr')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# XGBoost Regressor

In [None]:
# xgboost regressor model
# (built with the library's default hyper-parameters)
xgr = XGBRegressor()

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "XGBoost"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=xgr, X_train=X_train, y_train=y_train)

    # Evaluate on the test set
    evaluate(model=xgr, X_test = X_test, y_test=y_test)

    # Perform cross-validation
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(xgr, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: {:.3f}'.format(accuracy))

    # Log the model (stored under the artifact path 'xgboost' of this run)
    mlflow.sklearn.log_model(xgr, 'xgboost')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# Decision Tree Regressor

In [None]:
# decision tree regressor model
# A fixed random_state keeps the fitted tree, and therefore the logged metrics,
# reproducible.
dt_model = DecisionTreeRegressor(random_state = 1)

In [None]:
# Start a new MLflow run; the context manager ends the run when the block exits
# (also on error), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run():
    # Set a custom run name
    run_name = "Decision Tree"
    mlflow.set_tag("mlflow.runName", run_name)

    # Train the model (also logs the train score)
    train(model=dt_model, X_train=X_train, y_train=y_train)

    # Log hyperparameters
    mlflow.log_param("max_depth", dt_model.max_depth)
    mlflow.log_param("min_samples_split", dt_model.min_samples_split)
    mlflow.log_param("min_samples_leaf", dt_model.min_samples_leaf)
    mlflow.log_param("max_features", dt_model.max_features)

    # Evaluate on the test set
    evaluate(model=dt_model, X_test=X_test, y_test=y_test)

    # Perform cross-validation
    k = 20
    kfold = KFold(n_splits = k, random_state = 70, shuffle = True)
    K_results = cross_val_score(dt_model, X, y, cv = kfold)
    # With the default scoring, each fold score is the estimator's own score (R^2
    # for a regressor), which can be negative. Average the signed values: taking
    # abs() first would turn a badly failing fold into a seemingly good one.
    accuracy = np.mean(K_results)

    # Log cross-validation metric
    mlflow.log_metric('cv_accuracy', accuracy)
    print('cv accuracy: ', accuracy)

    # Log the model
    mlflow.sklearn.log_model(dt_model, 'decision-tree')

    # Print the run UUID
    print('Model run: ', mlflow.active_run().info.run_uuid)

# Model Evaluation Metrics

Now let us evaluate the metrics of our tested models using **mlflow**:

In [None]:
# Display a saved screenshot of the MLflow UI comparing the runs.
# The path is relative to the working directory set in the set-up cell.
Image(filename = './images/MLFlow-image-1.jpeg')

As we can see, our best model was XGBoost. Let's now register and save our model using MLFlow:

In [None]:
# Run id of the XGBoost run to register.
# NOTE(review): this id is hard-coded from an earlier session; after a fresh
# "Run All" it must be replaced with the id printed by the XGBoost cell.
xgboost_run_id = '2f650498d25a482cb9f66c9f9bee6b96'

# The last URI segment must be the artifact path the model was logged under.
# The XGBoost cell logs it with log_model(xgr, 'xgboost'), so the path is
# 'xgboost' (not 'model').
model_uri = f'runs:/{xgboost_run_id}/xgboost'
model_details = mlflow.register_model(model_uri, 'XGBoost Model')

In [None]:
# Save the fitted XGBoost estimator to the local 'model' directory.
# save_model expects the model object itself; `model_details` is the result of
# register_model (registry metadata), not an estimator.
mlflow.sklearn.save_model(xgr, 'model')