In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna
import json
import sys
sys.path.append('../..')
import main

from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

# Column names for the header-less original dataset, one name per line.
with open('../data/Faults27x7_var', 'r') as f:
    # Skip blank lines so a trailing newline cannot become an empty column name.
    col_names = [line.strip() for line in f if line.strip()]

train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
# Raw string: '\s' in a plain literal is an invalid escape sequence (warning today,
# error in future Python versions). The regex itself is unchanged.
org_data = pd.read_csv('../data/Faults.NNA', delimiter=r'\s', engine='python', names=col_names)

# The seven fault indicator columns; everything else is treated as a feature.
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Stack the competition training rows (without 'id') on top of the original dataset.
df = pd.concat([train_org.drop(['id'], axis=1), org_data]).reset_index(drop=True)
X = df.drop(target_cols, axis=1)

# One independent copy of each target column, in the same order as target_cols.
ys = [df[col].copy() for col in target_cols]
pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults = ys
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    """Append the KMeans cluster label of each row as an extra trailing column.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to the underlying ``KMeans``.

    Attributes
    ----------
    kmeans : KMeans
        The fitted clustering model; available only after ``fit``.
    """

    def __init__(self, n_clusters):
        # Only store hyper-parameters here. Building the inner KMeans in __init__
        # would leave it stale after set_params(n_clusters=...).
        self.n_clusters = n_clusters

    def fit(self,X, y=None):
        """Fit a fresh KMeans on ``X`` (``y`` is ignored) and return ``self``."""
        # Built at fit time so it always reflects the current n_clusters and so
        # refitting never reuses a previously fitted model.
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        self.kmeans.fit(X)
        return self

    def transform(self,X):
        """Return ``X`` with the predicted cluster label appended as the last column.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'kmeans')
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]

In [2]:
# Features and the 'Pastry' target side by side in one frame (no hold-out split is made here).
xtrain_pastry = pd.concat([X, pastry], axis='columns')

In [3]:
# AutoGluon predictor for the 'Pastry' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
pastry_predictor = TabularPredictor(
    label='Pastry',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240320_045657"


In [4]:
# Train with the 'best_quality' preset under a 3600-second budget.
pastry_predictor.fit(
    train_data=xtrain_pastry,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240320_045657/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels\ag-20240320_045657/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x19bd9e1abb0>

In [8]:
# Per-model leaderboard for the pastry predictor (validation score, timings, stack level).
pastry_predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.885739,roc_auc,4.549237,236.985509,0.001998,5.081892,2,True,67
1,WeightedEnsemble_L3,0.885679,roc_auc,34.048682,2103.580163,0.001999,5.087007,3,True,98
2,CatBoost_BAG_L2,0.883778,roc_auc,27.395413,1668.709525,0.017840,18.653343,2,True,72
3,NeuralNetFastAI_r145_BAG_L2,0.883555,roc_auc,28.016944,1712.332058,0.639371,62.275876,2,True,93
4,LightGBM_r96_BAG_L1,0.883454,roc_auc,2.077634,7.795454,2.077634,7.795454,1,True,19
...,...,...,...,...,...,...,...,...,...,...
93,RandomForest_r39_BAG_L1,0.860639,roc_auc,0.424011,6.527032,0.424011,6.527032,1,True,45
94,RandomForest_r195_BAG_L1,0.855924,roc_auc,0.474392,6.474017,0.474392,6.474017,1,True,26
95,NeuralNetTorch_r158_BAG_L1,0.845540,roc_auc,0.985989,223.494192,0.985989,223.494192,1,True,51
96,KNeighborsDist_BAG_L1,0.541775,roc_auc,0.638645,0.023096,0.638645,0.023096,1,True,2


In [9]:
# In-sample check: xtrain_pastry is the same frame that was passed to fit,
# so these metrics are optimistic compared with the leaderboard's validation score.
pastry_predictor.evaluate(data=xtrain_pastry)

{'roc_auc': 0.9460429942649771,
 'accuracy': 0.9353024574669188,
 'balanced_accuracy': 0.5898013721289583,
 'mcc': 0.383008926114043,
 'f1': 0.30117406840224603,
 'precision': 0.8805970149253731,
 'recall': 0.1816502463054187}

In [10]:
# Reload the saved pastry predictor from disk. The timestamped directory is the one
# AutoGluon reported when pastry_predictor was created, so it is specific to that run
# and must be updated if the predictor is retrained.
nm = TabularPredictor.load('AutogluonModels/ag-20240320_045657')

In [13]:
# Leaderboard of the reloaded predictor; it should match the one shown before reloading.
nm.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.885739,roc_auc,4.549237,236.985509,0.001998,5.081892,2,True,67
1,WeightedEnsemble_L3,0.885679,roc_auc,34.048682,2103.580163,0.001999,5.087007,3,True,98
2,CatBoost_BAG_L2,0.883778,roc_auc,27.395413,1668.709525,0.017840,18.653343,2,True,72
3,NeuralNetFastAI_r145_BAG_L2,0.883555,roc_auc,28.016944,1712.332058,0.639371,62.275876,2,True,93
4,LightGBM_r96_BAG_L1,0.883454,roc_auc,2.077634,7.795454,2.077634,7.795454,1,True,19
...,...,...,...,...,...,...,...,...,...,...
93,RandomForest_r39_BAG_L1,0.860639,roc_auc,0.424011,6.527032,0.424011,6.527032,1,True,45
94,RandomForest_r195_BAG_L1,0.855924,roc_auc,0.474392,6.474017,0.474392,6.474017,1,True,26
95,NeuralNetTorch_r158_BAG_L1,0.845540,roc_auc,0.985989,223.494192,0.985989,223.494192,1,True,51
96,KNeighborsDist_BAG_L1,0.541775,roc_auc,0.638645,0.023096,0.638645,0.023096,1,True,2


In [14]:
# Hold out a stratified 5% of the rows, mirroring the other fault targets, so that
# xtest_z_scratch exists for the hold-out evaluation cell further down.
xtr_z_scratch, xte_z_scratch, ytr_z_scratch, yte_z_scratch = train_test_split(
    X, z_scratch, test_size=0.05, stratify=z_scratch, random_state=0
)
# Re-attach the target so each frame carries both the features and the label column.
xtrain_z_scratch = pd.concat([xtr_z_scratch, ytr_z_scratch], axis=1)
xtest_z_scratch = pd.concat([xte_z_scratch, yte_z_scratch], axis=1)

In [16]:
# AutoGluon predictor for the 'Z_Scratch' column, evaluated with ROC AUC and
# saved under a fixed, named directory.
z_scratch_predictor = TabularPredictor(
    label='Z_Scratch',
    eval_metric='roc_auc',
    path='AutogluonModels/z_scratch',
)

In [11]:
# Train with the 'best_quality' preset under a 3600-second budget.
z_scratch_predictor.fit(
    train_data=xtrain_z_scratch,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240320_005921/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 913 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 2687 seconds.
Starting full fit now with num_stack_levels 1.
Beginning AutoGluon training ... Time limit = 2687s
AutoGluon will save m

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1c2275f6fd0>

In [12]:
# Metrics on the frame the predictor was fitted on (in-sample).
z_scratch_predictor.evaluate(data=xtrain_z_scratch)

{'roc_auc': 0.9908073726089066,
 'accuracy': 0.9719430902397771,
 'balanced_accuracy': 0.844761534089603,
 'mcc': 0.7477408196272993,
 'f1': 0.7593856655290102,
 'precision': 0.830999066293184,
 'recall': 0.6991358994501178}

In [13]:
# Hold-out metrics; requires xtest_z_scratch to have been defined by an earlier cell.
z_scratch_predictor.evaluate(xtest_z_scratch)

{'roc_auc': 0.9642754943747458,
 'accuracy': 0.9508506616257089,
 'balanced_accuracy': 0.716312483997771,
 'mcc': 0.5221254098744341,
 'f1': 0.5357142857142857,
 'precision': 0.6666666666666666,
 'recall': 0.44776119402985076}

In [14]:
# Stratified 95/5 split on the K_Scatch target, with a fixed seed for repeatability.
xtr_k_scatch, xte_k_scatch, ytr_k_scatch, yte_k_scatch = train_test_split(
    X,
    k_scatch,
    test_size=0.05,
    random_state=0,
    stratify=k_scatch,
)

In [15]:
# Re-attach the target to each side of the split so the label column sits in the same frame as the features.
xtrain_k_scatch = pd.concat([xtr_k_scatch, ytr_k_scatch], axis='columns')
xtest_k_scatch = pd.concat([xte_k_scatch, yte_k_scatch], axis='columns')

In [16]:
# AutoGluon predictor for the 'K_Scatch' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
k_scatch_predictor = TabularPredictor(
    label='K_Scatch',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240320_015927"


In [18]:
# Train with the 'best_quality' preset under a 3600-second budget.
k_scatch_predictor.fit(
    train_data=xtrain_k_scatch,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240320_015927/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.


KeyboardInterrupt: 

In [None]:
# Metrics on the training portion of the split (in-sample).
k_scatch_predictor.evaluate(data=xtrain_k_scatch)

{'roc_auc': 0.9310208306075798,
 'accuracy': 0.9292972575905974,
 'balanced_accuracy': 0.5416709480942064,
 'mcc': 0.2601406848662326,
 'f1': 0.15384615384615385,
 'precision': 0.8823529411764706,
 'recall': 0.08426966292134831}

In [None]:
# Metrics on the 5% hold-out portion of the split.
k_scatch_predictor.evaluate(data=xtest_k_scatch)

{'roc_auc': 0.8849383811832179,
 'accuracy': 0.9240374609781478,
 'balanced_accuracy': 0.5147825419042091,
 'mcc': 0.1171556973782576,
 'f1': 0.060085836909871244,
 'precision': 0.5384615384615384,
 'recall': 0.031818181818181815}

In [None]:
# Stratified 95/5 split on the Stains target, with a fixed seed for repeatability.
xtr_stains, xte_stains, ytr_stains, yte_stains = train_test_split(
    X,
    stains,
    test_size=0.05,
    random_state=0,
    stratify=stains,
)

In [None]:
# Re-attach the target to each side of the split so the label column sits in the same frame as the features.
xtrain_stains = pd.concat([xtr_stains, ytr_stains], axis='columns')
xtest_stains = pd.concat([xte_stains, yte_stains], axis='columns')

In [None]:
# AutoGluon predictor for the 'Stains' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
stains_predictor = TabularPredictor(
    label='Stains',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240319_235032"


In [None]:
# Train with the 'best_quality' preset; time_limit is given in seconds.
stains_predictor.fit(
    train_data=xtrain_stains,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 1800 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240319_170823/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 450s
AutoGluon will save models to "AutogluonModels\ag-20240319_170823/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x15c083439d0>

In [None]:
# Metrics on the training portion of the split (in-sample).
stains_predictor.evaluate(data=xtrain_stains)

{'roc_auc': 0.9310208306075798,
 'accuracy': 0.9292972575905974,
 'balanced_accuracy': 0.5416709480942064,
 'mcc': 0.2601406848662326,
 'f1': 0.15384615384615385,
 'precision': 0.8823529411764706,
 'recall': 0.08426966292134831}

In [None]:
# Metrics on the 5% hold-out portion of the split.
stains_predictor.evaluate(data=xtest_stains)

{'roc_auc': 0.8849383811832179,
 'accuracy': 0.9240374609781478,
 'balanced_accuracy': 0.5147825419042091,
 'mcc': 0.1171556973782576,
 'f1': 0.060085836909871244,
 'precision': 0.5384615384615384,
 'recall': 0.031818181818181815}

In [None]:
# Stratified 95/5 split on the Dirtiness target, with a fixed seed for repeatability.
xtr_dirtiness, xte_dirtiness, ytr_dirtiness, yte_dirtiness = train_test_split(
    X,
    dirtiness,
    test_size=0.05,
    random_state=0,
    stratify=dirtiness,
)

In [None]:
# Re-attach the target to each side of the split so the label column sits in the same frame as the features.
xtrain_dirtiness = pd.concat([xtr_dirtiness, ytr_dirtiness], axis='columns')
xtest_dirtiness = pd.concat([xte_dirtiness, yte_dirtiness], axis='columns')

In [None]:
# AutoGluon predictor for the 'Dirtiness' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
dirtiness_predictor = TabularPredictor(
    label='Dirtiness',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240319_235032"


In [None]:
# Train with the 'best_quality' preset; time_limit is given in seconds.
dirtiness_predictor.fit(
    train_data=xtrain_dirtiness,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 1800 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240319_170823/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 450s
AutoGluon will save models to "AutogluonModels\ag-20240319_170823/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:       

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x15c083439d0>

In [None]:
# Metrics on the training portion of the split (in-sample).
dirtiness_predictor.evaluate(data=xtrain_dirtiness)

{'roc_auc': 0.9310208306075798,
 'accuracy': 0.9292972575905974,
 'balanced_accuracy': 0.5416709480942064,
 'mcc': 0.2601406848662326,
 'f1': 0.15384615384615385,
 'precision': 0.8823529411764706,
 'recall': 0.08426966292134831}

In [None]:
# Metrics on the 5% hold-out portion of the split.
dirtiness_predictor.evaluate(data=xtest_dirtiness)

{'roc_auc': 0.8849383811832179,
 'accuracy': 0.9240374609781478,
 'balanced_accuracy': 0.5147825419042091,
 'mcc': 0.1171556973782576,
 'f1': 0.060085836909871244,
 'precision': 0.5384615384615384,
 'recall': 0.031818181818181815}

In [None]:
# Stratified 95/5 split on the Bumps target, with a fixed seed for repeatability.
xtr_bumps, xte_bumps, ytr_bumps, yte_bumps = train_test_split(
    X,
    bumps,
    test_size=0.05,
    random_state=0,
    stratify=bumps,
)

In [None]:
# Re-attach the target to each side of the split so the label column sits in the same frame as the features.
xtrain_bumps = pd.concat([xtr_bumps, ytr_bumps], axis='columns')
xtest_bumps = pd.concat([xte_bumps, yte_bumps], axis='columns')

In [None]:
# AutoGluon predictor for the 'Bumps' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
bumps_predictor = TabularPredictor(
    label='Bumps',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240319_235904"


In [None]:
# Train with the 'best_quality' preset under a 3600-second budget.
bumps_predictor.fit(
    train_data=xtrain_bumps,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240319_235904/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels\ag-20240319_235904/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:       

In [None]:
# Metrics on the training portion of the split (in-sample).
bumps_predictor.evaluate(data=xtrain_bumps)

{'roc_auc': 0.9310208306075798,
 'accuracy': 0.9292972575905974,
 'balanced_accuracy': 0.5416709480942064,
 'mcc': 0.2601406848662326,
 'f1': 0.15384615384615385,
 'precision': 0.8823529411764706,
 'recall': 0.08426966292134831}

In [None]:
# Metrics on the 5% hold-out portion of the split.
bumps_predictor.evaluate(data=xtest_bumps)

{'roc_auc': 0.8849383811832179,
 'accuracy': 0.9240374609781478,
 'balanced_accuracy': 0.5147825419042091,
 'mcc': 0.1171556973782576,
 'f1': 0.060085836909871244,
 'precision': 0.5384615384615384,
 'recall': 0.031818181818181815}

In [None]:
# Stratified 95/5 split on the Other_Faults target, with a fixed seed for repeatability.
xtr_other_faults, xte_other_faults, ytr_other_faults, yte_other_faults = train_test_split(
    X,
    other_faults,
    test_size=0.05,
    random_state=0,
    stratify=other_faults,
)

In [None]:
# Re-attach the target to each side of the split so the label column sits in the same frame as the features.
xtrain_other_faults = pd.concat([xtr_other_faults, ytr_other_faults], axis='columns')
xtest_other_faults = pd.concat([xte_other_faults, yte_other_faults], axis='columns')

In [None]:
# AutoGluon predictor for the 'Other_Faults' column, evaluated with ROC AUC.
# No path is given, so AutoGluon picks a timestamped output directory.
other_faults_predictor = TabularPredictor(
    label='Other_Faults',
    eval_metric='roc_auc',
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240319_235904"


In [None]:
# Train with the 'best_quality' preset under a 3600-second budget.
other_faults_predictor.fit(
    train_data=xtrain_other_faults,
    presets='best_quality',
    time_limit=3600,
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240319_235904/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "AutogluonModels\ag-20240319_235904/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:       

In [None]:
# Metrics on the training portion of the split (in-sample).
other_faults_predictor.evaluate(data=xtrain_other_faults)

{'roc_auc': 0.9310208306075798,
 'accuracy': 0.9292972575905974,
 'balanced_accuracy': 0.5416709480942064,
 'mcc': 0.2601406848662326,
 'f1': 0.15384615384615385,
 'precision': 0.8823529411764706,
 'recall': 0.08426966292134831}

In [None]:
# Metrics on the 5% hold-out portion of the split.
other_faults_predictor.evaluate(data=xtest_other_faults)

{'roc_auc': 0.8849383811832179,
 'accuracy': 0.9240374609781478,
 'balanced_accuracy': 0.5147825419042091,
 'mcc': 0.1171556973782576,
 'f1': 0.060085836909871244,
 'precision': 0.5384615384615384,
 'recall': 0.031818181818181815}