# Introduction

This notebook predicts the probability of failure of a product, identified by its product code, given its attributes and measurements.

This is the [dataset used](https://www.kaggle.com/competitions/tabular-playground-series-aug-2022).

In [None]:
# Import Standard Libraries
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.io import read_json

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, HalvingGridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Read Data

In [None]:
# Load the train & test CSV files (paths are relative to the notebook's folder)
train_data, test_data = map(pd.read_csv,
                            ('./../../data/2022_08/train.csv',
                             './../../data/2022_08/test.csv'))

In [None]:
# Print the column names, dtypes and non-null counts of the training set
train_data.info()

In [None]:
# Preview the first rows of the training set
train_data.head()

# Exploratory Data Analysis

## id

In [None]:
# Count the rows whose 'id' repeats an earlier row
print('ID duplicates: {}'.format(train_data['id'].duplicated().sum()))

## Product Code

In [None]:
# Count the missing values in 'product_code'
print('Product Code NaN values: {}'.format(train_data['product_code'].isna().sum()))

In [None]:
# Plot the Histogram of 'product_code' Distribution
# histnorm='percent' makes the bars show each product code's share of the rows,
# which is what the 'Share' axis title announces (an empty histnorm plots raw counts).
figure = px.histogram(train_data, 
                      x='product_code', 
                      title='Product Code Distribution', 
                      labels={'product_code':'Product Code',
                              'count': 'Share'},
                      color_discrete_sequence=['darkgreen'],
                      height=500,
                      histnorm='percent',
                      template='plotly_dark')

figure.update_layout(yaxis_title='Share', 
                     font=dict(family="PT Sans", 
                               size=14), 
                     title_font=dict(family="PT Sans",
                                     size=30), 
                     title_x=0.5)

# Plot
figure.show()

## loading

In [None]:
# Count the missing values in 'loading'
print('Loading NaN values: {}'.format(train_data['loading'].isna().sum()))

In [None]:
# Box plot of the overall 'loading' distribution
figure = px.box(
    data_frame=train_data,
    x='loading',
    title='Loading Distribution',
    color_discrete_sequence=['darkgreen'],
    template='plotly_dark',
).update_layout(
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

# Render the figure
figure.show()

The `loading` feature needs its NaN values filled and its outliers cleaned.

In [None]:
# Box plot of 'loading' split (and coloured) by 'product_code', with every point drawn
figure = px.box(
    data_frame=train_data,
    x='product_code',
    y='loading',
    points='all',
    color='product_code',
    title='Loading Distribution per Product Code',
    color_discrete_sequence=px.colors.qualitative.Set3,
    height=500,
    template='plotly_dark',
).update_layout(
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

# Render the figure
figure.show()

## Attributes

In [None]:
# Report the number of missing values in attribute_0 ... attribute_3
for attribute_index in range(4):

    attribute_column = 'attribute_' + str(attribute_index)
    print('Attribute {} NaN value count: {}'.format(attribute_index, train_data[attribute_column].isna().sum()))

## Failure

In [None]:
# Count the missing values in the 'failure' label
print('Failure NaN values: {}'.format(train_data['failure'].isna().sum()))

In [None]:
# Histogram of the 'failure' label, normalised to percentages
figure = px.histogram(
    data_frame=train_data,
    x='failure',
    color_discrete_sequence=['darkgreen'],
    title='Failure Distribution',
    labels={'failure': 'Failure'},
    height=500,
    histnorm='percent',
    template='plotly_dark',
).update_layout(
    yaxis_title='Share',
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

figure.show()

Imbalanced classes, classic!

# Data Preparation

## Features and Labels Definition

In [None]:
# Column groups consumed by the preprocessing pipelines

# Columns that are one-hot encoded
categorical_features = ['product_code', 'attribute_0', 'attribute_1']

# Integer-valued columns: attribute_2, attribute_3 and measurement_0 ... measurement_2
numerical_integer_features = (
    ['attribute_2', 'attribute_3']
    + ['measurement_{}'.format(index) for index in range(3)]
)

# Float-valued columns: loading and measurement_3 ... measurement_17
numerical_float_featues = (
    ['loading']
    + ['measurement_{}'.format(index) for index in range(3, 18)]
)

# Target column
label = ['failure']

## Numerical Features

In [None]:
# Numerical features pipeline
# Missing values are filled with the per-column median: the 'constant' strategy
# fills numeric columns with 0, which is far from the typical value of most
# columns and would turn every imputed row into an outlier before scaling.
numerical_features_pipeline = Pipeline(steps=[
    ('numerical_imputer', SimpleImputer(strategy='median')),
    ('numerical_scaler', StandardScaler())
])

## Categorical Features

In [None]:
# Categorical features pipeline: fill gaps with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time)
categorical_features_pipeline = Pipeline([
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
    ('categorical_one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

## Bundle Data Preprocessing Steps

In [None]:
# Bundle data preprocessing steps: each transformer is applied to its own column group
data_preprocessor = ColumnTransformer([
    ('numerical_preprocessing',
     numerical_features_pipeline,
     [*numerical_integer_features, *numerical_float_featues]),
    ('categorical_preprocessing',
     categorical_features_pipeline,
     categorical_features),
])

# Train & Test Split

In [None]:
# Separate the features (every column except 'failure') from the target column
y = train_data['failure']
X = train_data.drop(columns=['failure'])

In [None]:
# Split training data into train and validation
# stratify=y keeps the same 'failure' class ratio in both parts, which matters
# because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
# Define a stratified shuffle splitter: 5 random splits, each holding out 30% of the rows
stratified_kfold = StratifiedShuffleSplit(test_size=0.3, n_splits=5, random_state=0)

# Model Definition

In [None]:
# Names of the scores tracked for every model (they become the 'performance' columns)
metrics = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc',
    'cross_validation',
]

In [None]:
# Initialize DataFrame of model performance
# It starts empty, with one column per entry of 'metrics'
performance = pd.DataFrame(columns=metrics)

## Logistic Regression

In [None]:
# Logistic Regression hyperparameters

# Optimiser and its iteration budget
solver_lr, max_iter_lr = 'lbfgs', 300

# Value passed as LogisticRegression's C (inverse of regularisation strength)
c_lr = 1.3

# Scorer name handed to cross_val_score
cross_validation_scoring_lr = 'roc_auc'

In [None]:
%%time

# Define the model
model_lr = LogisticRegression(C=c_lr,
                              max_iter=max_iter_lr, 
                              solver=solver_lr)

# Define the pipeline
pipe_lr = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('logistic_regression', model_lr)
])

# Train the pipeline
pipe_lr.fit(X_train, 
            y_train)

# Predictions (hard class labels)
predictions_lr = pipe_lr.predict(X_test)

# Probability of the positive class: ROC AUC ranks samples by score, so it must
# be computed on probabilities rather than on thresholded labels
probabilities_lr = pipe_lr.predict_proba(X_test)[:, 1]

# Model evaluation
accuracy_lr = round(accuracy_score(y_test, predictions_lr) * 100, 2)
precision_lr = round(precision_score(y_test, predictions_lr) * 100, 2)
recall_lr = round(recall_score(y_test, predictions_lr) * 100, 2)
f1_lr = round(f1_score(y_test, predictions_lr) * 100, 2)
roc_auc_lr = round(roc_auc_score(y_test, probabilities_lr) * 100, 2)

# Cross-validate on the training split only, so the held-out split stays an
# independent check (cross_val_score refits clones of the pipeline on each split)
cross_validation_score_lr = round(cross_val_score(pipe_lr, X_train, y_train, scoring=cross_validation_scoring_lr, cv=stratified_kfold).mean() * 100, 2)

print('Accuracy: {}%'.format(accuracy_lr))
print('Precision: {}%'.format(precision_lr))
print('Recall: {}%'.format(recall_lr))
print('F1 score: {}%'.format(f1_lr))
print('ROC AUC: {}%'.format(roc_auc_lr))
print('Cross-Validation ROC AUC: {}%'.format(cross_validation_score_lr))
print('\n')

In [None]:
# Record the Logistic Regression scores as a new 'performance' row
# (values are listed in the same order as the 'metrics' columns)
performance.loc['logistic_regression'] = [
    accuracy_lr, precision_lr, recall_lr,
    f1_lr, roc_auc_lr, cross_validation_score_lr,
]

## Logistic Regression - Cross-Validation

In [None]:
def train_with_stratified_kfold(estimator, X, y, cv):
    """
    Train an Estimator through a Stratified K-Fold Cross-Validation approach and return evaluation metrics
    
    The estimator is refitted from scratch on every split produced by `cv`; the per-fold
    metrics are printed and their means across folds are returned.
    
        Parameters:
            estimator: sklearn estimator (or Pipeline) exposing fit and predict
            X: Pandas DataFrame of data
            y: Pandas Series of binary labels
            cv: sklearn.model_selection splitter instance
            
        Returns:
            accuracy: Float mean accuracy score across folds (percentage)
            precision: Float mean precision score across folds (percentage)
            recall: Float mean recall score across folds (percentage)
            f1: Float mean F1 score across folds (percentage)
            roc_auc: Float mean ROC AUC score across folds (percentage)
    """
    
    # Initialise empty lists for metrics
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    roc_auc_list = []
    
    # Fetch the folds
    for fold, (train_index, validation_index) in enumerate(cv.split(X, y)):
        
        # Split the data; the splitter yields positional indices, so rows are
        # selected by position (.iloc) and not by index label (.loc)
        X_train = X.iloc[train_index]
        X_validation = X.iloc[validation_index]
        y_train = y.iloc[train_index]
        y_validation = y.iloc[validation_index]
        
        # Fit the estimator
        estimator.fit(X_train, y_train)
        
        # Predictions (hard class labels)
        predictions = estimator.predict(X_validation)
        
        # ROC AUC ranks samples by score, so prefer continuous scores and fall
        # back to the hard labels only when the estimator offers nothing better
        if hasattr(estimator, 'predict_proba'):
            scores = estimator.predict_proba(X_validation)[:, 1]
        elif hasattr(estimator, 'decision_function'):
            scores = estimator.decision_function(X_validation)
        else:
            scores = predictions
        
        # Compute metrics
        accuracy_fold = round(accuracy_score(y_validation, predictions) * 100, 2)
        precision_fold = round(precision_score(y_validation, predictions) * 100, 2)
        recall_fold = round(recall_score(y_validation, predictions) * 100, 2)
        f1_fold = round(f1_score(y_validation, predictions) * 100, 2)
        roc_auc_fold = round(roc_auc_score(y_validation, scores) * 100, 2)
        
        print('---- Fold {} ----'.format(fold))
        print('Accuracy: {}%'.format(accuracy_fold))
        print('Precision: {}%'.format(precision_fold))
        print('Recall: {}%'.format(recall_fold))
        print('F1 score: {}%'.format(f1_fold))
        print('ROC AUC: {}%'.format(roc_auc_fold))
        print('\n')
        
        # Append metrics to the corresponding list
        accuracy_list.append(accuracy_fold)
        precision_list.append(precision_fold)
        recall_list.append(recall_fold)
        f1_list.append(f1_fold)
        roc_auc_list.append(roc_auc_fold)
        
    # Python lists have no .mean() method, so every average goes through np.mean
    return (np.mean(accuracy_list),
            np.mean(precision_list),
            np.mean(recall_list),
            np.mean(f1_list),
            np.mean(roc_auc_list))
        

In [None]:
# Hyperparameters of the cross-validated Logistic Regression

# Optimiser and its iteration budget
solver_lr_cv, max_iter_lr_cv = 'lbfgs', 300

# Value passed as LogisticRegression's C (inverse of regularisation strength)
c_lr_cv = 1.3

# Scorer name kept alongside the other settings
cross_validation_scoring_lr_cv = 'roc_auc'

In [None]:
%%time

# Logistic Regression configured from the '*_lr_cv' hyperparameters
model_lr_cv = LogisticRegression(solver=solver_lr_cv, max_iter=max_iter_lr_cv, C=c_lr_cv)

# Preprocessing followed by the classifier
pipe_lr_cv = Pipeline(steps=[
    ('data_preprocessing', data_preprocessor),
    ('logistic_regression', model_lr_cv),
])

# Fit and evaluate the pipeline on every split of the full training data
train_with_stratified_kfold(estimator=pipe_lr_cv, X=X, y=y, cv=stratified_kfold)

# Model Comparison

In [None]:
# Grouped bar chart: one group per model, one bar per metric column
figure = px.bar(
    data_frame=performance,
    x=performance.index,
    y=performance.columns.values,
    labels={'index': 'Model', 'value': 'Performance'},
    barmode='group',
    title='Models Comparison',
    color_discrete_sequence=px.colors.qualitative.Set3,
    height=500,
    template='plotly_dark',
).update_layout(
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

# Render the figure
figure.show()