## Steel Plate Defect Prediction

### Boosters

In this notebook different boosters will be explored, as well as the optimization of its hyperparameters. All the boosting methods will be compared and ensambled in a voting clasifier.

In [1]:
import pandas as pd
import numpy as np
from pipeline2 import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns 

from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import train_test_split, cross_validate, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
df_train = pd.read_csv("Data/train.csv")
pl = Pipeline(df_train, True)
df_train, le, sc, faul_encoder = pl.run()

df_test = pd.read_csv("Data/test.csv")
pl = Pipeline(df_test, False)
df_test = pl.run(le, sc)

#### 1. XGBooster

In [3]:
from xgboost import XGBClassifier

In [4]:
X = df_train.drop(columns=['faults'])
y = df_train['faults']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Hyperparameter optimization**

In [5]:
space = {
    'objective': 'multi:softmax',
    'num_class': 8,
    'tree_method': 'hist',
    'grow_policy': 'depthwise',
    'boosting_type': 'gbtree',
    'enable_categorical': True,

    # Variables
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(0.5)),
    'n_estimators': hp.quniform('n_estimators', 50, 2000, 10),
    'gamma': hp.uniform('gamma', 0.5, 1),
    'min_child_weight': hp.uniform('min_child_weight', 2, 100),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'subsample': hp.uniform('subsample', 0.5, 1), 
    
    'verbosity': 0}

In [6]:
def objective(params):
    params['max_depth'] = int(params['max_depth'])
    params['learning_rate'] = float(params['learning_rate'])
    params['n_estimators'] = int(params['n_estimators'])
    params['gamma'] = float(params['gamma'])
    params['min_child_weight'] = int(params['min_child_weight'])
    params['colsample_bytree'] = float(params['colsample_bytree'])
    params['subsample'] = float(params['subsample'])
    
    # Create XGBoost classifier with given parameters
    clf = XGBClassifier(**params)

    clf.fit(X_train, y_train)

    y_prob = clf.predict_proba(X_test)
    score = roc_auc_score(y_test, y_prob, multi_class="ovr")

    return {'loss': -score, 'status': STATUS_OK}

# Run Hyperopt optimization
best = fmin(objective, space, algo=tpe.suggest, max_evals=20)
print("Best parameters:", best)

100%|██████████| 20/20 [07:04<00:00, 21.21s/trial, best loss: -0.8606209695747803]
Best parameters: {'colsample_bytree': 0.7249204152698141, 'gamma': 0.8093507699039332, 'learning_rate': 0.07512032619541904, 'max_depth': 5.0, 'min_child_weight': 41.14804068289674, 'n_estimators': 190.0, 'subsample': 0.5587606280948411}


In [30]:
hyperparams = {
    'objective': 'multi:softmax',
    'num_class': 8,
    'tree_method': 'hist',
    'grow_policy': 'depthwise',
    'boosting_type': 'gbtree',
    'enable_categorical': True,
    'max_depth': 5,
    'learning_rate': 0.07512032619541904,
    'n_estimators': 190,
    'gamma': 0.8093507699039332,
    'min_child_weight': 41.14804068289674,
    'colsample_bytree': 0.7249204152698141,
    'subsample': 0.5587606280948411, 
    
    'verbosity': 0}


xgb_clf = XGBClassifier(**hyperparams)

xgb_clf.fit(X_train, y_train)

y_prob = xgb_clf.predict_proba(X_test)
score = roc_auc_score(y_test, y_prob, multi_class="ovr")
print(score)

0.8497293044972776


#### 2. LightGBM

In [15]:
from lightgbm import LGBMClassifier

The parameters for the model were obtained from the literature

In [35]:
hyperparams = {
    'objective': 'multiclass',
    'num_class': 8,
    'n_estimators': 918,
    'learning_rate': 0.0014,
    'max_depth': 5,
    'reg_alpha': 0.9522134628349151,
    'reg_lambda': 0.07875944420059292,
    'num_leaves': 20,
    'subsample': 0.33327260735952596,
    'colsample_bytree': 0.45916663480321157,   
    'verbosity': -1}

lgbm_clf = LGBMClassifier(**hyperparams)

lgbm_clf.fit(X_train, y_train)

y_prob = lgbm_clf.predict_proba(X_test)
score = roc_auc_score(y_test, y_prob, multi_class="ovr")

print(score)                          
                            

0.8540490773954753


#### 3. Voting classifier

In [32]:
from sklearn.ensemble import  VotingClassifier

In [36]:
voting_clf = VotingClassifier(
    estimators=[('XGB', xgb_clf),('LGBM', lgbm_clf)],
    voting='soft',n_jobs=-1)

voting_clf.fit(X_train, y_train)

y_prob = voting_clf.predict_proba(X_test)
score = roc_auc_score(y_test, y_prob, multi_class="ovr")

print(score)                          

0.8541584520303275


**Note:** The voting classifier guarantees a score value that is equal to or greater than the score of the best-performing model within the voting system.