# Introduction

This notebook is intended to predict the chance of failure of a specific product code given its attributes and measurements.

This is the [dataset used](https://www.kaggle.com/competitions/tabular-playground-series-aug-2022).

In [None]:
# Import Standard Libraries
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.io import read_json

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

# Read Data

In [None]:
# Load the train and test CSV files into two DataFrames
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in ('./../../data/2022_08/train.csv',
                     './../../data/2022_08/test.csv')
]

In [None]:
# Print a summary of the training set: columns, non-null counts and dtypes
train_data.info()

In [None]:
# Display the first rows of the training set (left as the last expression so
# the notebook renders it as a table)
train_data.head()

# Exploratory Data Analysis

## id

In [None]:
# Count the rows whose 'id' repeats the 'id' of an earlier row
id_is_duplicate = train_data['id'].duplicated()
print('ID duplicates: {}'.format(id_is_duplicate.sum()))

## Product Code

In [None]:
# Count the missing values in the 'product_code' column
product_code_nan_count = train_data['product_code'].isna().sum()
print('Product Code NaN values: {}'.format(product_code_nan_count))

In [None]:
# Plot the histogram of the 'product_code' distribution.
# histnorm='percent' normalises the bars to the percentage of rows per product
# code, so the plotted values agree with the 'Share' axis title; an empty
# histnorm would plot raw row counts under that title.
figure = px.histogram(train_data, 
                      x='product_code', 
                      title='Product Code Distribution', 
                      labels={'product_code':'Product Code',
                              'count': 'Share'},
                      color_discrete_sequence=['darkgreen'],
                      height=500,
                      histnorm='percent',
                      template='plotly_dark')

# Apply the notebook's common styling: axis title, fonts and a centred title
figure.update_layout(yaxis_title='Share', 
                     font=dict(family="PT Sans", 
                               size=14), 
                     title_font=dict(family="PT Sans",
                                     size=30), 
                     title_x=0.5)

# Save figure
figure.write_json("./plots/product_code_distribution.json")

# Read & plot figure (the saved JSON is what gets displayed)
read_json('./plots/product_code_distribution.json').show()

## loading

In [None]:
# Count the missing values in the 'loading' column
loading_nan_count = train_data['loading'].isna().sum()
print('Loading NaN values: {}'.format(loading_nan_count))

In [None]:
# Box plot of 'loading' over the whole training set
loading_plot_path = './plots/loading_distribution.json'

figure = px.box(
    train_data,
    x='loading',
    title='Loading Distribution',
    color_discrete_sequence=['darkgreen'],
    template='plotly_dark',
).update_layout(
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

# Persist the figure as JSON, then reload it from disk for display
figure.write_json(loading_plot_path)
read_json(loading_plot_path).show()

To do for `loading`: fill the NaN values and clean the outliers.

In [None]:
# Box plot of 'loading' split (and coloured) by 'product_code', with every
# individual point drawn next to its box
loading_by_code_plot_path = './plots/loading_product_code_distribution.json'

figure = px.box(
    train_data,
    x='product_code',
    y='loading',
    points='all',
    color='product_code',
    title='Loading Distribution per Product Code',
    color_discrete_sequence=px.colors.qualitative.Set3,
    height=500,
    template='plotly_dark',
).update_layout(
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

# Persist the figure as JSON, then reload it from disk for display
figure.write_json(loading_by_code_plot_path)
read_json(loading_by_code_plot_path).show()

## Attributes

In [None]:
# Report the number of missing values in attribute_0 ... attribute_3
for attribute_index in range(4):

    attribute_column = 'attribute_' + str(attribute_index)
    attribute_nan_count = train_data[attribute_column].isna().sum()

    print('Attribute {} NaN value count: {}'.format(attribute_index, attribute_nan_count))

## Failure

In [None]:
# Count the missing values in the 'failure' target column
failure_nan_count = train_data['failure'].isna().sum()
print('Failure NaN values: {}'.format(failure_nan_count))

In [None]:
# Histogram of the 'failure' target, normalised to percentages so the class
# balance can be read directly from the bars
figure = px.histogram(
    train_data,
    x='failure',
    color_discrete_sequence=['darkgreen'],
    title='Failure Distribution',
    labels={'failure': 'Failure'},
    height=500,
    histnorm='percent',
    template='plotly_dark',
).update_layout(
    yaxis_title='Share',
    font={'family': 'PT Sans', 'size': 14},
    title_font={'family': 'PT Sans', 'size': 30},
    title_x=0.5,
)

figure.show()

The classes are imbalanced, a classic situation!

# Data Preparation

## Features and Labels Definition

In [None]:
# Group the input columns by how they are preprocessed

# Columns handled as categorical: product_code, attribute_0, attribute_1
categorical_features = ['product_code'] + ['attribute_{}'.format(i) for i in range(2)]

# Columns handled as integer features: attribute_2, attribute_3 and
# measurement_0 ... measurement_2
numerical_integer_features = (['attribute_{}'.format(i) for i in (2, 3)]
                              + ['measurement_{}'.format(i) for i in range(3)])

# Columns handled as float features: loading and measurement_3 ... measurement_17
# (the spelling of this variable name is kept because the preprocessing cell
# refers to it)
numerical_float_featues = ['loading'] + ['measurement_{}'.format(i) for i in range(3, 18)]

# Target column
label = ['failure']

## Fill NaN Values

In [None]:
# Numerical columns: replace missing values with a constant. No fill_value is
# passed, so SimpleImputer uses its default constant.
# NOTE(review): that default is 0 for numeric data - confirm this is intended.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform)
categorical_steps = [
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('One-Hot Encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its transformer
numerical_features = numerical_integer_features + numerical_float_featues
preprocessor = ColumnTransformer(transformers=[
    ('Numerical Preprocessing', numerical_transformer, numerical_features),
    ('Categorical Preprocessing', categorical_transformer, categorical_features),
])

# Train & Test Split

In [None]:
# Separate the training set into the target vector y and the feature frame X
y = train_data['failure']
X = train_data.drop(columns=['failure'])

In [None]:
# Split the training data into a part used for fitting (67%) and a held-out
# part used for evaluation (33%).
# stratify=y keeps the 'failure' class ratio identical in both parts; with
# imbalanced classes an unstratified split can shift that ratio and distort
# the evaluation metrics.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# Stratified shuffle-split cross-validator: 10 random splits, each holding out
# 30% of the rows, with a fixed seed so the splits are reproducible
stratified_kfold = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

# Model Definition

In [None]:
# Names of the scores recorded for each model, in column order
metrics = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc',
    'cv_score',
]

In [None]:
# Empty results table with one column per metric; rows are added per model
performance = pd.DataFrame(data=None, columns=metrics)

## Logistic Regression

In [None]:
# Hyperparameter: value passed as max_iter to LogisticRegression
max_iter_lr = 1_000

In [None]:
# Define the model
model_lr = LogisticRegression(max_iter=max_iter_lr)

# Define the pipeline: preprocessing followed by the classifier, so the
# imputers/encoder are fitted only on the data passed to fit()
lr_pipe = Pipeline([
    ('data_preprocessing', preprocessor),
    ('logistic_regression', model_lr)
])

# Train the pipeline
lr_pipe.fit(X_train, y_train)

# Hard class predictions, used by the threshold-dependent metrics
predictions_lr_pipe = lr_pipe.predict(X_test)

# Predicted probability of the positive class (second column of predict_proba).
# ROC AUC ranks samples by score; feeding it hard 0/1 predictions would
# collapse the curve to a single threshold and misstate the AUC.
probabilities_lr_pipe = lr_pipe.predict_proba(X_test)[:, 1]

# Model evaluation (all scores expressed as percentages rounded to 2 decimals)
accuracy_lr_pipe = round(accuracy_score(y_test, predictions_lr_pipe) * 100, 2)
precision_lr_pipe = round(precision_score(y_test, predictions_lr_pipe) * 100, 2)
recall_lr_pipe = round(recall_score(y_test, predictions_lr_pipe) * 100, 2)
f1_lr_pipe = round(f1_score(y_test, predictions_lr_pipe) * 100, 2)
roc_auc_lr_pipe = round(roc_auc_score(y_test, probabilities_lr_pipe) * 100, 2)

# Mean score over the stratified shuffle splits of the full training set;
# no `scoring` is given, so the pipeline's default scorer is used
cv_score_lr_pipe = round(cross_val_score(lr_pipe, X, y, cv=stratified_kfold).mean() * 100, 2)

print('Model accuracy: {}%'.format(accuracy_lr_pipe))
print('Model precision: {}%'.format(precision_lr_pipe))
print('Model recall: {}%'.format(recall_lr_pipe))
print('Model f1 score: {}%'.format(f1_lr_pipe))
print('Model ROC AUC: {}%'.format(roc_auc_lr_pipe))
print('Model Cross Validation score: {}%'.format(cv_score_lr_pipe))

In [None]:
# Store the logistic regression scores as a new row of 'performance'; the
# values are listed in the same order as the table's metric columns
scores_lr_pipe = [
    accuracy_lr_pipe,
    precision_lr_pipe,
    recall_lr_pipe,
    f1_lr_pipe,
    roc_auc_lr_pipe,
    cv_score_lr_pipe,
]
performance.loc['logistic_regression'] = scores_lr_pipe