# Part II: Model Development

In this part, we develop three unique pipelines for predicting backorder. We use the smart sample from Part I to fit and evaluate these pipelines. 

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
import joblib

from sklearn import preprocessing
from sklearn.model_selection import train_test_split


## Reload the smart sample here

In [2]:
# Reload your smart sampling from local file 
# ----------------------------------
X, y, train_undersamp = joblib.load('data/sample-data-v1.pkl')


In [3]:
# Subset easier to manage size for testing pipelines
train_undersamp_less = train_undersamp.groupby('went_on_backorder').apply(lambda x: x.sample(frac=0.1))
train_undersamp_less = pd.DataFrame(train_undersamp_less)
# Split back into X and y
X_less = train_undersamp_less.iloc[:, :-1]
y_less = train_undersamp_less.went_on_backorder
X_less.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2172 entries, (0, 786) to (1, 20979)
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   national_inv      2172 non-null   float64
 1   lead_time         2172 non-null   float64
 2   in_transit_qty    2172 non-null   float64
 3   forecast_3_month  2172 non-null   float64
 4   sales_1_month     2172 non-null   float64
 5   sales_3_month     2172 non-null   float64
 6   min_bank          2172 non-null   float64
 7   potential_issue   2172 non-null   int64  
 8   pieces_past_due   2172 non-null   float64
 9   perf_6_month_avg  2172 non-null   float64
 10  local_bo_qty      2172 non-null   float64
 11  deck_risk         2172 non-null   int64  
 12  oe_constraint     2172 non-null   int64  
 13  ppap_risk         2172 non-null   int64  
 14  stop_auto_buy     2172 non-null   int64  
 15  rev_stop          2172 non-null   int64  
dtypes: float64(10), int64(6)
memo

## Normalize/standardize the data if required; otherwise ignore. You can perform this step inside the pipeline (if required). 

In [5]:
# Standardize data
scaler = preprocessing.StandardScaler().fit(X_less)
X_less_scaled = scaler.transform(X_less)

'''# Combine X_scaled and y
train_under_stand = pd.DataFrame()
train_under_stand = X_scaled
train_under_stand.reset_index(drop = True)
train_under_stand['went_on_backorder'] = y'''

print("Mean of the dataset features")
print(scaler.mean_)
print("Variance of data")
print(scaler.scale_)
print("-" * 35)

print("# Scaled data:")
print(X_less_scaled)

print("# Mean of scaled data")
print(X_less_scaled.mean(axis = 0))
print("# Variance of scaled data")
print(X_less_scaled.std(axis = 0))


Mean of the dataset features
[2.25476980e+02 6.86740331e+00 2.05004604e+01 1.89299263e+02
 3.12697974e+01 9.27762431e+01 2.99456722e+01 2.76243094e-03
 3.24769797e+00 7.36008287e-01 2.62154696e+00 1.85543278e-01
 4.60405157e-04 1.36740331e-01 9.75138122e-01 0.00000000e+00]
Variance of data
[4.73404865e+03 5.53981953e+00 2.63649562e+02 2.80830589e+03
 1.92019175e+02 5.76924761e+02 1.98046811e+02 5.24861878e-02
 6.99672073e+01 2.79369433e-01 4.04502110e+01 3.88737662e-01
 2.14521137e-02 3.43573010e-01 1.55704096e-01 1.00000000e+00]
-----------------------------------
# Scaled data:
[[-0.04086924  0.20444649  0.07396007 ... -0.39799497  0.15967389
   0.        ]
 [-0.04762878 -0.51759869  0.16498999 ... -0.39799497  0.15967389
   0.        ]
 [-0.04699508  0.38495779 -0.07775647 ... -0.39799497  0.15967389
   0.        ]
 ...
 [-0.04784002  0.92649168 -0.07775647 ... -0.39799497  0.15967389
   0.        ]
 [-0.04762878  0.20444649 -0.07775647 ... -0.39799497  0.15967389
   0.        ]
 [-

In [25]:
# Subset non discrete features
X_less_for_norm = X_less[['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
                          'sales_1_month', 'sales_3_month', 'min_bank', 'pieces_past_due', 
                          'perf_6_month_avg', 'local_bo_qty']]
#X_less_for_norm.info()

# Subset discrete features
X_less_discrete = pd.DataFrame(X_less[['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 
                                       'stop_auto_buy', 'rev_stop']])
#X_less_discrete.info()

# Normalize data
X_less_norm = pd.DataFrame(preprocessing.normalize(X_less_for_norm, axis = 0, norm = 'l2'))
#X_less_norm = pd.DataFrame(X_less_norm)
#X_less_norm.info()

'''print('# Scaled & normalized values:')
print(X_less_norm)

print("# All have unit norm")
print(np.linalg.norm(X_less_norm, axis = 0))'''

#X_less_norm.info()

# Recombine discrete and nondiscrete features
X_less_norm_comb = pd.DataFrame(pd.concat([X_less_norm, X_less_discrete]).reset_index(drop = True))

X_norm = X_less_norm_comb.rename(columns={0: 'national_inv', 1: 'lead_time', 2: 'in_transit_qty', 
                                 3: 'forecast_3_month', 4: 'sales_1_month', 5: 'sales_3_month', 
                                 6: 'min_bank' , 7: 'potential_issue', 8: 'pieces_past_due', 
                                 9: 'perf_6_month_avg', 10: 'local_bo_qty', 11: 'deck_risk', 
                                 12: 'oe_constraint', 13: 'ppap_risk', 14: 'stop_auto_buy', 
                                 15: 'rev_stop'})
'''#X_less_norm_comb.info()
#column = X.columns.values
#print(column)
#X_norm.info()

#check = pd.DataFrame(X_norm['potential_issue'].unique())
#print(check)

X_norm = pd.Series(X_less_norm_comb)

print('potential_issue', X_norm['potential_issue'].unique())
print('/deck_risk', X_norm['deck_risk'].unique())
print('/oe_constraint', X_norm['oe_constraint'].unique())
print('/ppap_risk', X_norm['ppap_risk'].unique())
print('/stop_auto_buy', X_norm['stop_auto_buy'].unique())
print('/rev_stop', X_norm['rev_stop'].unique())
print('/went_on_backorder', X_norm['went_on_backorder'].unique())

potential_issue

for col in X_norm:
  print(X_norm[col].unique())'''


"#X_less_norm_comb.info()\n#column = X.columns.values\n#print(column)\n#X_norm.info()\n\n#check = pd.DataFrame(X_norm['potential_issue'].unique())\n#print(check)\n\nX_norm = pd.Series(X_less_norm_comb)\n\nprint('potential_issue', X_norm['potential_issue'].unique())\nprint('/deck_risk', X_norm['deck_risk'].unique())\nprint('/oe_constraint', X_norm['oe_constraint'].unique())\nprint('/ppap_risk', X_norm['ppap_risk'].unique())\nprint('/stop_auto_buy', X_norm['stop_auto_buy'].unique())\nprint('/rev_stop', X_norm['rev_stop'].unique())\nprint('/went_on_backorder', X_norm['went_on_backorder'].unique())\n\npotential_issue\n\nfor col in X_norm:\n  print(X_norm[col].unique())"

## Split the data into Train/Test

In [67]:
## Split downsampled dataset into training and testing sets
# Small sample to test pipelines
X_train, X_test, y_train, y_test = train_test_split(X_less, y_less, test_size = 0.2)
# Full sample
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


## Developing Pipeline

In this section, we design an operationalized machine learning pipeline, which includes:

* Anomaly detection
* Dimensionality Reduction
* Train a classification model


We are free to use any of the models that we learned in the past or we can use new models. Here is a pool of methods: 

### Pool of Anomaly Detection Methods (Discussed in M4)
1. IsolationForest
2. EllipticEnvelope
3. LocalOutlierFactor
4. OneClassSVM
5. SGDOneClassSVM

### Pool of Feature Selection Methods (Discussed in M3)

1. VarianceThreshold
1. SelectKBest with any scoring method (e.g, chi, f_classif, mutual_info_classif)
1. SelectKPercentile
3. SelectFpr, SelectFdr, or  SelectFwe
1. GenericUnivariateSelect
2. PCA
3. Factor Analysis
4. Variance Threshold
5. RFE
7. SelectFromModel


### Classification Methods (Discussed in M1-M2
1. Decision Tree
2. Random Forest
3. Logistic Regression
4. Naive Bayes
5. Linear SVC
6. SVC with kernels
7. KNeighborsClassifier
8. GradientBoostingClassifier
9. XGBClassifier
10. LGBM Classifier



It is difficult to fit an anomaly detection method in the sklearn pipeline without writing custom codes. For simplicity, we avoid fitting an anomaly detection method within a pipeline. So we can create the workflow in two steps. 
* Step I: fit an outlier with the training set
* Step II: define a pipeline using a feature selection and a classification method. Then cross-validate this pipeline using the training data without outliers. 
* Note: if your smart sample is somewhat imbalanced, you might want to change the scoring method in GridSearchCV (see the [doc](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)).


Once we fit the pipeline with gridsearch, we identify the best model and give an unbiased evaluation using the test set that we created in Part II. For unbiased evaluation we report confusion matrix, precision, recall, f1-score, accuracy, and other measures if you like. 

**Optional: Those who are interested in writing custom codes for adding an outlier detection method into the sklearn pipeline, please follow this discussion [thread](https://stackoverflow.com/questions/52346725/can-i-add-outlier-detection-and-removal-to-scikit-learn-pipeline).**


**Note:** <span style='background:yellow'>We will be using Grid Search to find the optimal parameters of the pipelines.</span>

You can add more notebook cells or import any Python modules as needed.

In [27]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.feature_selection import (SelectKBest, VarianceThreshold, 
                                       SelectPercentile, chi2, f_classif, mutual_info_classif)
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix 
from numpy import where
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline

import sklearn.feature_selection


### Your 1st pipeline 
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation
  
Add cells as needed. 

In [63]:
# Add anomaly detection code  (Question #E201)
# ----------------------------------
# Construct local outlier factor
lof = LocalOutlierFactor()

# Get labels from classifier to cull outliers
lof_outliers = lof.fit_predict(X_train) == -1 # This should be scaled for LR, but I could not make it work

X_lof = X_train[~lof_outliers]
y_lof = y_train[~lof_outliers]


In [64]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E202)
# ----------------------------------
# Define pipeline
pipe = Pipeline([
    ('PCA', PCA()),
    ('LR_model', LogisticRegression())
])

# Define parameter grid
param_grid = {'PCA__n_components': [0, 1, 2, 3],
              'PCA__random_state':[17],
              'LR_model__C': [0.001, 0.1, 1.0, 10, 100], 
              'LR_model__max_iter': [1000, 2500, 5000]
              }

# Use Grid Search to train Pipeline
CV_log_reg = GridSearchCV(pipe, param_grid, n_jobs = 2, cv = 10)
CV_log_reg_model = CV_log_reg.fit(X_lof, y_lof)
#print(CV_log_reg.cv_results_)


 0.72889405 0.7490062         nan 0.524377   0.72889405 0.7490062
        nan 0.524377   0.72895916 0.7490062         nan 0.524377
 0.72895916 0.7490062         nan 0.524377   0.72895916 0.7490062
        nan 0.524377   0.72895916 0.7490062         nan 0.524377
 0.72895916 0.7490062         nan 0.524377   0.72895916 0.7490062
        nan 0.524377   0.72895916 0.7490062         nan 0.524377
 0.72895916 0.7490062         nan 0.524377   0.72895916 0.7490062
        nan 0.524377   0.72895916 0.7490062         nan 0.524377
 0.72895916 0.7490062         nan 0.524377   0.72895916 0.7490062 ]


In [41]:
# Given an unbiased evaluation  (Question #E203)
# ----------------------------------
'''# Show parameters of trained models and their rank
pd.set_option("max_colwidth", 80)
CV_log_reg_df = pd.DataFrame(CV_log_reg.cv_results_)
print(CV_log_reg_df[['params','rank_test_score']])'''

# Evaluate best model using test data
predicted_y = CV_log_reg.predict(X_test)

'''# Show parameters of best model
best_params = CV_log_reg.best_params_
print('Best parameter:\n',best_params)'''

# Show best estimator
best_estimator = CV_log_reg.best_estimator_
print('Best estimator:\n',best_estimator) 

# Display confusion matrix
print('\nConfusion Matrix:\n',pd.DataFrame(confusion_matrix(y_test, predicted_y)))

# Create classification report
print('\nClassification Report:\n',classification_report(y_test, predicted_y))


Best estimator:
 Pipeline(steps=[('PCA', PCA(n_components=3, random_state=17)),
                ('LR_model', LogisticRegression(C=0.001, max_iter=1000))])

Confusion Matrix:
      0    1
0  144   86
1   14  191

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.63      0.74       230
           1       0.69      0.93      0.79       205

    accuracy                           0.77       435
   macro avg       0.80      0.78      0.77       435
weighted avg       0.81      0.77      0.77       435



#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

## <span style="background: yellow;">Commit your code!</span> 

### Your 2nd pipeline
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation

In [42]:
# Add anomaly detection code  (Question #E205)
# ----------------------------------
# Construct envelope
ee = EllipticEnvelope()
# Fit data to envelope
#envelope = env.fit(X_train_norm, y_train_norm)
envelope = ee.fit(X_train, y_train)

# Get labels from classifier to cull outliers
env_outliers = envelope.predict(X_train) == -1

# Re-slice X,y into a cleaned dataset with outliers excluded
X_env = X_train[~env_outliers]
y_env = y_train[~env_outliers]




In [46]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E206)
# ----------------------------------
#kb = SelectKBest()
#rfc = RandomForestClassifier(random_state = 17)
#fa = FactorAnalysis()
#dtc = DecisionTreeClassifier()

# Define pipeline
pipe2 = Pipeline(
    [
        #('scaler', StandardScaler()),
        ('kbest', SelectKBest(mutual_info_classif)),
        #('rfc_model', rfc)
        #('fa', FactorAnalysis()),
        ('dtc_model', DecisionTreeClassifier())
    ]
)

param_grid2 = [
    {
        #'fa__n_components': [5, 20, 80, 120, 480],
        'kbest__k': [1, 3, 5, 10, 15],
        'dtc_model__max_features': ['auto', 'sqrt', 'log2'],
        'dtc_model__max_leaf_nodes': [None, 15, 10, 5, 1],
        'dtc_model__random_state': [17]
    },
]

# Use Grid Search to train Pipeline
CV_dtc = GridSearchCV(pipe2, param_grid = param_grid2, n_jobs = 2, cv = 10)
CV_dtc_model = CV_dtc.fit(X_env, y_env)
#print(CV_dtc.cv_results_)


 0.81887963 0.84131145 0.82469786 0.80223338 0.80104116 0.82654744
 0.83426017 0.80742691 0.81565817 0.80167402 0.78621182 0.8061612
 0.78633431 0.77090887        nan        nan        nan        nan
        nan 0.77736404 0.81378818 0.8060877  0.80741875 0.79846889
 0.79656623 0.82077005 0.82659644 0.80554058 0.81698106 0.80104116
 0.81378409 0.82849094 0.8074065  0.79908542 0.80167402 0.78621182
 0.80487914 0.78831455 0.774114          nan        nan        nan
        nan        nan 0.77736404 0.81316348 0.8163482  0.80545484
 0.79914666 0.79656623 0.81440879 0.82788257 0.84067451 0.80542218
 0.80104116 0.81821819 0.82788257 0.80739833 0.78431733 0.80167402
 0.78749388 0.80487914 0.77864609 0.78434183        nan        nan
        nan        nan        nan]


In [47]:
# Given an unbiased evaluation  (Question #E207)
# ----------------------------------
'''# Show parameters of trained models and their rank
pd.set_option("max_colwidth", 80)
CV_dtc_df = pd.DataFrame(CV_dtc.cv_results_)
print(CV_dtc_df[['params','rank_test_score']])'''

# Evaluate best model using test data
predicted_y = CV_dtc.predict(X_test)

'''# Show parameters of best model
best_params = CV_dtc.best_params_
print('Best parameter:\n',best_params)'''

# Show best estimator
best_estimator = CV_dtc.best_estimator_
print('Best estimator:\n',best_estimator) 

# Display confusion matrix
print('\nConfusion Matrix:\n',pd.DataFrame(confusion_matrix(y_test, predicted_y)))

# Create classification report
print('\nClassification Report:\n',classification_report(y_test, predicted_y))


Best estimator:
 Pipeline(steps=[('kbest',
                 SelectKBest(k=5,
                             score_func=<function mutual_info_classif at 0x7f63b383a048>)),
                ('dtc_model',
                 DecisionTreeClassifier(max_features='auto', max_leaf_nodes=15,
                                        random_state=17))])

Confusion Matrix:
      0    1
0  139   91
1   18  187

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.60      0.72       230
           1       0.67      0.91      0.77       205

    accuracy                           0.75       435
   macro avg       0.78      0.76      0.75       435
weighted avg       0.79      0.75      0.74       435



#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

## <span style="background: yellow;">Commit your code!</span> 

### Your 3rd pipeline
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation

In [61]:
# Add anomaly detection code  (Question #E209)
# ----------------------------------
# Construct IsolationForest
iso_forest = IsolationForest(n_estimators = 250)

iso_outliers = iso_forest.fit(X_train, y_train)

# Get labels from classifier to cull outliers
iso_outliers = iso_forest.predict(X_train) == -1

X_iso = X_train[~iso_outliers]
y_iso = y_train[~iso_outliers]


In [71]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E210)
# ----------------------------------
# Define pipeline
pipe3 = Pipeline(
    [
        #('fa', FactorAnalysis()),
        #('vt', VarianceThreshold()),
        ('k_percentile', SelectPercentile()),
        ('rfc_model', RandomForestClassifier())
    ]
)

param_grid3 = [
    {
        #'fa__n_components': [5, 20, 80, 120, 480],
        #'fa__random_state': [17],
        #'vt__threshold': [0, 0.0001, 0.001, 0.01, 0.1],
        'k_percentile__percentile': [1, 5, 10, 50, 75],
        'rfc_model__max_features' : ['auto', 'sqrt', 'log2'],
        'rfc_model__max_depth': [None, 55, 30, 10, 1],
        'rfc_model__n_estimators':[25, 50, 100, 150, 500]
    },
]

# Use Grid Search to train Pipeline
CV_rfc = GridSearchCV(pipe3, param_grid = param_grid3, n_jobs = 2, cv = 5)
CV_rfc_model = CV_rfc.fit(X_iso, y_iso)
#print(CV_rfc.cv_results_)


In [72]:
# Given an unbiased evaluation  (Question #E211)
# ----------------------------------
'''# Show parameters of trained models and their rank
pd.set_option("max_colwidth", 80)
CV_rfc_df = pd.DataFrame(CV_rfc.cv_results_)
print(CV_rfc_df[['params','rank_test_score']])'''

## Evaluate best model using test data
# Make prediction using test data
predicted_y = CV_rfc.predict(X_test)

'''# Show parameters of best model
best_params = CV_rfc.best_params_
print('Best parameter:\n',best_params)'''

# Show best estimator
best_estimator = CV_rfc.best_estimator_
print('\nBest estimator:\n',best_estimator) 

# Display confusion matrix
print('\nConfusion Matrix:\n',pd.DataFrame(confusion_matrix(y_test, predicted_y)))

'''#Display accuracy score
print('\nAccuracy Score:\n', pd.DataFrame(accuracy_score(y_test.actual_label.values, predicted_y.predicted_RF.values)))'''

# Create classification report
print('\nClassification Report:\n',classification_report(y_test, predicted_y))



Best estimator:
 Pipeline(steps=[('k_percentile', SelectPercentile(percentile=75)),
                ('rfc_model',
                 RandomForestClassifier(max_depth=55, n_estimators=500))])

Confusion Matrix:
      0    1
0  206    6
1    9  214

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96       212
           1       0.97      0.96      0.97       223

    accuracy                           0.97       435
   macro avg       0.97      0.97      0.97       435
weighted avg       0.97      0.97      0.97       435



#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

## Compare these three pipelines and discuss your findings

## <span style="background: yellow;">Commit your code!</span> 

### Pickle the required pipeline/models for Part III.

In [78]:
# Pickle the best pipeline
joblib.dump([pipe3, CV_rfc, CV_rfc_model], 'data/pipeline-v3.pkl')


['data/pipeline-v3.pkl']

You should have made a few commits so far of this project.  
**Definitely make a commit of the notebook now!**  
Comment should be: `Final Project, Checkpoint - Pipelines done`


# Save your notebook!
## Then `File > Close and Halt`