In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

col_names = []
with open('../data/Faults27x7_var','r') as f:
    for line in f:
        col_names.append(line.strip())
        
train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
org_data = pd.read_csv('../data/Faults.NNA', delimiter='\s', engine='python', names=col_names)

X = train_org.drop(['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps','Other_Faults'], axis=1)
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scatch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

ys = [pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults]
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        
    def fit(self,X, y=None):
        self.kmeans.fit(X)
        return self
        
    def transform(self,X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]
    
class PCA_Transformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_components):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components, random_state=0)
        
    def fit(self, X, y=None):
        self.pca.fit(X)
        return self
    
    def transform(self,X):
        cols = self.pca.transform(X)
        return np.c_[X, cols]

In [2]:
y_scores = []

In [3]:
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for y,yn in zip(ys,y_names):
    
    print(f'trial: {yn} without pca\n')
    
    def objective_lgb(trial):
    
        cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        params = dict(
            n_estimators = trial.suggest_int('n_estimators',100,500),
            max_depth = trial.suggest_int('max_depth',2,64),
            num_leaves = trial.suggest_int('num_leaves',2,128),
            learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
            min_child_samples = trial.suggest_int('min_child_samples',2,500),
            min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
            subsample = trial.suggest_float('subsample', 0.33,0.85),
            colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
            reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
            reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
        )
        
        lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
        
        pipe = Pipeline(
            steps = [
                ('scaler', StandardScaler()),
                ('model', lgbc)
            ]
        )
        
        score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
        return score
    
    study_lgb = optuna.create_study(direction='maximize')
    study_lgb.optimize(objective_lgb, n_trials=500, n_jobs=-1, show_progress_bar=True)
    
    best_params = study_lgb.best_params
    best_score_without_pca = study_lgb.best_value
    
    time.sleep(2)
    clear_output()
    
    print(f'trial: {yn} with pca\n')
    def objective_lgb(trial):
    
        cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        params = dict(
            n_estimators = trial.suggest_int('n_estimators',100,500),
            max_depth = trial.suggest_int('max_depth',2,64),
            num_leaves = trial.suggest_int('num_leaves',2,128),
            learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
            min_child_samples = trial.suggest_int('min_child_samples',2,500),
            min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
            subsample = trial.suggest_float('subsample', 0.33,0.85),
            colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
            reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
            reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
        )
        
        lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
        
        pipe = Pipeline(
            steps = [
                ('scaler', StandardScaler()),
                ('pca', PCA_Transformer(n_components=2)),
                ('model', lgbc)
            ]
        )
        
        score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
        return score
    
    study_lgb = optuna.create_study(direction='maximize')
    study_lgb.optimize(objective_lgb, n_trials=500, n_jobs=-1, show_progress_bar=True)
    
    best_params = study_lgb.best_params
    best_score_after_pca = study_lgb.best_value
    
    time.sleep(2)
    clear_output()
    
    y_scores.append((yn,f'score_without_pca: {best_score_without_pca}', f'score_after_pca: {best_score_after_pca}'))
    
    with open('pca_results.txt', 'w') as file:
        for tup in y_scores:
            file.write(f'{tup[0]}, {tup[1]}, {tup[2]}\n')

[I 2024-03-07 22:21:39,772] A new study created in memory with name: no-name-6e5f03aa-14fa-4cea-917a-d2d87cb524d1


trial: bumps with pca



  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-07 22:21:43,608] Trial 1 finished with value: 0.805071608547758 and parameters: {'n_estimators': 242, 'max_depth': 54, 'num_leaves': 4, 'learning_rate': 0.06147197450551581, 'min_child_samples': 448, 'min_child_weight': 9.529324683326816, 'subsample': 0.4149652568056992, 'colsample_bylevel': 0.6179260362073213, 'reg_alpha': 0.06630823981759523, 'reg_lambda': 0.05109566241690084}. Best is trial 1 with value: 0.805071608547758.
[I 2024-03-07 22:21:47,392] Trial 9 finished with value: 0.7836157971769688 and parameters: {'n_estimators': 109, 'max_depth': 58, 'num_leaves': 102, 'learning_rate': 0.2689594550619745, 'min_child_samples': 282, 'min_child_weight': 6.751518102488198, 'subsample': 0.695822136868645, 'colsample_bylevel': 0.4825492620800803, 'reg_alpha': 0.0636562292875197, 'reg_lambda': 0.0443760986775374}. Best is trial 1 with value: 0.805071608547758.
[I 2024-03-07 22:21:48,514] Trial 10 finished with value: 0.8075936123948481 and parameters: {'n_estimators': 122, 'max

KeyboardInterrupt: 