In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

# Read one name per line; the list is used below as the column names for the
# header-less Faults.NNA file.
with open('../data/Faults27x7_var', 'r') as f:
    col_names = [line.strip() for line in f]

train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
# Raw string: '\s' in a normal string literal is an invalid escape sequence
# (SyntaxWarning on recent Python versions). The regex itself is unchanged, so
# the file is still split on single whitespace characters by the python engine.
org_data = pd.read_csv('../data/Faults.NNA', delimiter=r'\s', engine='python', names=col_names)

# The seven binary fault indicators present in train.csv; each one is modelled
# as its own target.
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Feature matrix: everything except the row id and the target indicators.
X = train_org.drop(['id'] + target_columns, axis=1)

# Independent copies of every target column, in the same order as y_names.
ys = [train_org[column].copy() for column in target_columns]
pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults = ys

# Lower-case labels used as dictionary keys and in the score report.
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    """Append each sample's K-Means cluster label as one extra feature column.

    Parameters
    ----------
    n_clusters : int
        Number of clusters handed to ``sklearn.cluster.KMeans``.

    Attributes
    ----------
    kmeans : sklearn.cluster.KMeans
        The underlying clusterer (``n_init=10``, ``random_state=0``). It is
        rebuilt from the current ``n_clusters`` on every call to ``fit``.
    """

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        # Kept so that ``.kmeans`` is available before fitting, as before.
        self.kmeans = self._build_kmeans()

    def _build_kmeans(self):
        """Return an unfitted KMeans configured from the current parameters."""
        return KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)

    def fit(self, X, y=None):
        """Fit the clusterer on ``X``; ``y`` is ignored. Returns ``self``."""
        # Rebuild here rather than reuse the instance created in __init__:
        # set_params(n_clusters=...) only updates the attribute, so a clusterer
        # built once at construction time would silently keep the old value
        # (e.g. during a grid search over n_clusters).
        self.kmeans = self._build_kmeans()
        self.kmeans.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` as an array with the predicted cluster label appended as the last column."""
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]
    
class PCA_Transformer(BaseEstimator,TransformerMixin):
    """Append the PCA projection of each sample as extra feature columns.

    Parameters
    ----------
    n_components : int
        Number of components handed to ``sklearn.decomposition.PCA``.

    Attributes
    ----------
    pca : sklearn.decomposition.PCA
        The underlying decomposition (``random_state=0``). It is rebuilt from
        the current ``n_components`` on every call to ``fit``.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        # Kept so that ``.pca`` is available before fitting, as before.
        self.pca = self._build_pca()

    def _build_pca(self):
        """Return an unfitted PCA configured from the current parameters."""
        return PCA(n_components=self.n_components, random_state=0)

    def fit(self, X, y=None):
        """Fit the decomposition on ``X``; ``y`` is ignored. Returns ``self``."""
        # Rebuild here rather than reuse the instance created in __init__:
        # set_params(n_components=...) only updates the attribute, so a PCA
        # built once at construction time would silently keep the old value.
        self.pca = self._build_pca()
        self.pca.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` as an array with the projected components appended as trailing columns."""
        cols = self.pca.transform(X)
        return np.c_[X, cols]

In [7]:
# Sanity check of the PCA augmenter: fit a two-component instance on the
# full feature matrix (the fitted transformer is echoed as the cell output).
d = PCA_Transformer(n_components=2)
d.fit(X)

In [10]:
# Display the feature matrix (pandas truncates the rendering of large frames).
X

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,584,590,909972,909977,16,8,5,2274,113,140,...,0.0059,1.0000,1.0000,0.0,1.2041,0.9031,0.6990,-0.5000,-0.0104,0.1417
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.0044,0.2500,1.0000,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,0.1077,0.2363,0.3857,0.0,4.0564,2.1790,2.2095,-0.0105,-0.0944,1.0000
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.0044,0.3750,0.9310,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.0192,0.2105,0.9861,1.0,2.7694,1.4150,1.8808,0.9158,-0.2455,0.9998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19214,749,757,143210,143219,17,4,4,2193,122,140,...,0.0044,1.0000,0.8000,0.0,1.2305,0.7782,0.6021,-0.1429,0.0044,0.2901
19215,723,735,2488529,2488541,231,17,26,27135,104,133,...,0.0065,0.7333,0.9216,1.0,2.3636,1.0414,1.4150,0.7222,-0.0989,0.5378
19216,6,31,1578055,1578129,780,114,98,71112,41,94,...,0.0199,0.1862,0.9554,1.0,2.8921,1.4314,1.8692,0.7719,-0.4283,0.9997
19217,9,18,1713172,1713184,126,13,26,14808,88,132,...,0.0068,0.7692,1.0000,1.0,2.1004,1.0414,1.4150,0.9610,-0.1162,0.3509


In [9]:
# Show the PCA-augmented features with readable column labels.
augmented = d.transform(X)
# Label the appended columns from how many were actually added instead of
# hard-coding two names, so the cell keeps working if n_components changes.
n_added = augmented.shape[1] - X.shape[1]
pca_columns = [f'pca{k}' for k in range(1, n_added + 1)]
pd.DataFrame(augmented, columns=list(X.columns) + pca_columns, index=X.index)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,pca1,pca2
0,584.0,590.0,909972.0,909977.0,16.0,8.0,5.0,2274.0,113.0,140.0,...,1.0000,0.0,1.2041,0.9031,0.6990,-0.5000,-0.0104,0.1417,-1.326564e+06,-1.914030e+05
1,808.0,816.0,728350.0,728372.0,433.0,20.0,54.0,44478.0,70.0,111.0,...,1.0000,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,-1.583462e+06,-1.495518e+05
2,39.0,192.0,2212076.0,2212144.0,11388.0,705.0,420.0,1311391.0,29.0,141.0,...,0.3857,0.0,4.0564,2.1790,2.2095,-0.0105,-0.0944,1.0000,5.131292e+05,1.120296e+06
3,781.0,789.0,3353146.0,3353173.0,210.0,16.0,29.0,3202.0,114.0,134.0,...,0.9310,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,2.128610e+06,-1.857077e+05
4,1540.0,1560.0,618457.0,618502.0,521.0,72.0,67.0,48231.0,82.0,111.0,...,0.9861,1.0,2.7694,1.4150,1.8808,0.9158,-0.2455,0.9998,-1.738862e+06,-1.460133e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19214,749.0,757.0,143210.0,143219.0,17.0,4.0,4.0,2193.0,122.0,140.0,...,0.8000,0.0,1.2305,0.7782,0.6021,-0.1429,0.0044,0.2901,-2.410923e+06,-1.929800e+05
19215,723.0,735.0,2488529.0,2488541.0,231.0,17.0,26.0,27135.0,104.0,133.0,...,0.9216,1.0,2.3636,1.0414,1.4150,0.7222,-0.0989,0.5378,9.058171e+05,-1.634619e+05
19216,6.0,31.0,1578055.0,1578129.0,780.0,114.0,98.0,71112.0,41.0,94.0,...,0.9554,1.0,2.8921,1.4314,1.8692,0.7719,-0.4283,0.9997,-3.818006e+05,-1.212577e+05
19217,9.0,18.0,1713172.0,1713184.0,126.0,13.0,26.0,14808.0,88.0,132.0,...,1.0000,1.0,2.1004,1.0414,1.4150,0.9610,-0.1162,0.3509,-1.906832e+05,-1.773012e+05


In [2]:
# One empty slot per (target, configuration). The tuning loop stores the best
# hyper-parameters under key 0 for the pipeline without KMeans and under keys
# 2..8 for the matching n_clusters value; key 1 is deliberately absent.
y_params_dict = {
    target_name: {config: None for config in range(9) if config != 1}
    for target_name in y_names
}

In [3]:
# Echo the freshly initialised (all-None) parameter store.
y_params_dict

{'pastry': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'z_scratch': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'k_scatch': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'stains': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'dirtiness': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'bumps': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None},
 'other_faults': {0: None,
  2: None,
  3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None}}

In [4]:
# Accumulates one (target name, untuned-score text, tuned-score text) tuple
# per evaluated configuration; filled by the tuning loop.
y_scores = []

In [5]:
# Single splitter shared by every evaluation so that all scores (untuned and
# tuned, with and without KMeans) are computed on the same five folds.
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


def build_lgb_pipeline(n_clusters=None, **lgb_params):
    """Build the scaler -> [KMeans label] -> LightGBM pipeline.

    Parameters
    ----------
    n_clusters : int or None
        When given, a ``KMeansTransformer`` with that many clusters is inserted
        between the scaler and the model; when ``None`` the step is omitted.
    **lgb_params
        Extra keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    steps = [('scaler', StandardScaler())]
    if n_clusters is not None:
        steps.append(('kmeans', KMeansTransformer(n_clusters=n_clusters)))
    model = lgb.LGBMClassifier(random_state=0, objective='binary', verbose=-1, **lgb_params)
    steps.append(('model', model))
    return Pipeline(steps=steps)


def mean_cv_auc(pipe, features, target):
    """Return the mean ROC-AUC of ``pipe`` over the shared ``cvo`` folds."""
    return np.mean(cross_val_score(pipe, features, target, scoring='roc_auc', cv=cvo))


def make_objective_lgb(features, target, n_clusters=None):
    """Create the Optuna objective for one target / cluster configuration.

    The returned callable samples LightGBM hyper-parameters from the trial and
    returns the mean cross-validated ROC-AUC of the resulting pipeline.
    """
    def objective_lgb(trial):
        params = dict(
            n_estimators = trial.suggest_int('n_estimators',100,500),
            max_depth = trial.suggest_int('max_depth',2,64),
            num_leaves = trial.suggest_int('num_leaves',2,128),
            learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
            min_child_samples = trial.suggest_int('min_child_samples',2,500),
            min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
            # NOTE(review): LightGBM appears to apply `subsample` only when
            # `subsample_freq` > 0 (default 0), so this dimension is likely a
            # no-op as written - confirm and add subsample_freq if row
            # subsampling is intended.
            subsample = trial.suggest_float('subsample', 0.33,0.85),
            # The trial name must match the keyword it feeds. It used to be
            # 'colsample_bylevel', so study.best_params could not be passed
            # back as **best_params to rebuild the tuned model.
            colsample_bytree = trial.suggest_float('colsample_bytree',0.33,0.7),
            reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
            reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
        )
        return mean_cv_auc(build_lgb_pipeline(n_clusters, **params), features, target)

    return objective_lgb


def save_scores(scores, path='results.txt'):
    """Rewrite ``path`` with one comma-separated line per score tuple."""
    with open(path, 'w') as file:
        for tup in scores:
            file.write(f'{tup[0]}, {tup[1]}, {tup[2]}\n')


for y,yn in zip(ys,y_names):
    for i in range(1,9):
        # i == 1 is the baseline without a KMeans step; its parameters are
        # stored under key 0. Every other i is used directly as n_clusters.
        is_baseline = (i == 1)
        n_clusters = None if is_baseline else i
        params_key = 0 if is_baseline else i
        label_suffix = '' if is_baseline else f'(n_clusters={i})'

        # Reference score with LightGBM defaults.
        pipe = build_lgb_pipeline(n_clusters)
        score_without_hpt = mean_cv_auc(pipe, X, y)

        # Hyper-parameter search for this target / configuration.
        objective_lgb = make_objective_lgb(X, y, n_clusters)
        study_lgb = optuna.create_study(direction='maximize')
        study_lgb.optimize(objective_lgb, n_trials=100, n_jobs=-1, show_progress_bar=True)

        best_params = study_lgb.best_params
        y_params_dict[yn][params_key] = best_params

        best_score = study_lgb.best_value
        y_scores.append((yn,
                         f'score_without_hpt{label_suffix}: {score_without_hpt}',
                         f'score_after_hpt{label_suffix}: {best_score}'))

        # Persist after every configuration (baseline included) so partial
        # results survive an interrupted run.
        save_scores(y_scores)

        time.sleep(5)
        clear_output()