In [3]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel
import optuna
import xgboost as xgb
import lightgbm as lgbm
import catboost as cat

In [4]:
class paths:
    """Namespace holding the locations of the CSV inputs used by the notebook."""

    # Labelled data: read into `train_df` further down.
    TRAIN: str = './data/train.csv'
    # Companion test file.
    TEST: str = './data/test.csv'

In [5]:
# Read the training CSV from the location configured in `paths.TRAIN`.
train_df = pd.read_csv(paths.TRAIN)

In [6]:
from sklearn.decomposition import PCA

def feature_engineering(df):
    """Derive model features and append standardised PCA components.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must provide ``id``, ``TypeOfSteel_A300``, the
        luminosity columns (``Maximum_of_Luminosity``,
        ``Minimum_of_Luminosity``, ``Sum_of_Luminosity``,
        ``Luminosity_Index``), ``Orientation_Index`` and the remaining
        columns named in ``X_cols`` below. The seven fault-label columns
        are optional: they are carried through when present and skipped
        otherwise, so an unlabelled frame can be processed as well.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the selected
        columns followed by ``feature_1`` .. ``feature_k``, where ``k`` is
        the number of columns fed to the PCA.

    Notes
    -----
    The standardisation statistics and the PCA are fitted on whatever frame
    is passed in. NOTE(review): calling this separately on two frames gives
    components expressed in two different bases - confirm that is intended
    before comparing them.
    """
    label_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
                  'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

    new_df = df.copy()

    # Log-scale luminosity features.
    new_df['Log_Range'] = (np.log1p(new_df['Maximum_of_Luminosity'])
                           - np.log1p(new_df['Minimum_of_Luminosity']))
    new_df['Log_Lum'] = np.log(new_df['Sum_of_Luminosity'])
    new_df['Log_Avg_Lum'] = new_df['Log_Lum'] - 2 * new_df['LogOfAreas']
    new_df['Norm_Range'] = new_df['Log_Range'] - new_df['LogOfAreas'] / 2

    new_df['Abs_Orientation'] = np.abs(new_df['Orientation_Index'])

    # Output columns, in the order downstream cells receive them.
    cols = ['LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'TypeOfSteel_A300',
            'Edges_Index', 'Outside_Global_Index', 'Abs_Orientation',
            'Steel_Plate_Thickness', 'Luminosity_Index',
            'Log_Avg_Lum', 'Length_of_Conveyer', 'Empty_Index', 'Norm_Range',
            'Pastry', 'Z_Scratch', 'K_Scatch', 'id',
            'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    # Only labels may be absent; a missing feature column must still raise.
    cols = [c for c in cols if c not in label_cols or c in new_df.columns]

    # Continuous inputs to the PCA.
    X_cols = ['LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
              'Edges_Index', 'Outside_Global_Index', 'Abs_Orientation',
              'Steel_Plate_Thickness', 'Luminosity_Index',
              'Log_Avg_Lum', 'Length_of_Conveyer', 'Empty_Index', 'Norm_Range']

    # One output column per PCA input (PCA() keeps all components by default).
    pca_feats = [f'feature_{i}' for i in range(1, len(X_cols) + 1)]

    # Z-score each input (sample standard deviation) before the PCA.
    X_feats = new_df[X_cols]
    X_normed = (X_feats - X_feats.mean()) / X_feats.std()

    new_df[pca_feats] = PCA().fit_transform(X_normed)

    return new_df[cols + pca_feats]

In [9]:
# Engineered version of the training frame.
df = feature_engineering(train_df)

# The labels are taken to be the last seven columns of the raw training
# frame - this relies on the column order of the CSV.
y_cols = train_df.columns[-7:]

ids = df.loc[:, 'id']
y = df.loc[:, y_cols]
# Model inputs: every engineered column that is neither the id nor a label.
X = df.drop(columns=['id'] + list(y_cols))

In [10]:
from sklearn.model_selection import GroupKFold
# Build the cross-validation folds once so every model is scored on the
# same splits. Rows are grouped by `ids`.
gkf = GroupKFold(n_splits=5)

trains = []  # one (train_X, train_y) pair per fold
valids = []  # one (valid_X, valid_y) pair per fold
for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')
    # `split` yields positional row numbers, so select with `.iloc`;
    # `.loc` would only be correct while the index is a default RangeIndex.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]
    trains.append((train_X, train_y))
    valids.append((valid_X, valid_y))

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


## XGBoost

In [17]:
from sklearn.metrics import roc_auc_score

def cv_score(col, params):
    """Mean validation ROC AUC of an XGBoost classifier for one label column.

    A fresh ``xgb.XGBClassifier(**params)`` is fitted on every pair stored in
    the module-level ``trains`` / ``valids`` lists and scored on the matching
    validation part.

    Parameters
    ----------
    col : str
        Name of the label column to model; looked up in the label frame of
        every fold.
    params : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    float
        Unweighted mean of the per-fold ROC AUC values.

    Raises
    ------
    ValueError
        If no folds are available or ``trains`` and ``valids`` differ in
        length.
    """
    if not trains or len(trains) != len(valids):
        raise ValueError(
            'trains and valids must be non-empty lists of equal length')

    fold_aucs = []
    # Iterate over the prepared folds instead of assuming there are five.
    for (X_fit, y_fit), (X_val, y_val) in zip(trains, valids):
        y_fit_col = np.asarray(y_fit[col])
        y_val_col = np.asarray(y_val[col])

        # Fit model; the validation part is passed as eval_set with output muted.
        model = xgb.XGBClassifier(**params)
        model.fit(X_fit, y_fit_col, eval_set=[(X_val, y_val_col)], verbose=0)

        # Score the predicted probability of the positive class.
        preds = model.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val_col, preds))

    return sum(fold_aucs) / len(fold_aucs)

def xgb_objective(trial):
    """Optuna objective: cross-validated score on 'Pastry' for sampled settings.

    Draws ``lambda`` and ``alpha`` on a log scale from [1e-8, 1.0] and
    ``subsample`` uniformly from [0.1, 1.0] using ``trial``, then returns the
    value of ``cv_score`` for the ``'Pastry'`` column with those settings and
    ``eval_metric='auc'``.
    """
    # Sampling order is kept as lambda -> alpha -> subsample.
    reg_l2 = trial.suggest_float('lambda', 1e-8, 1.0, log=True)
    reg_l1 = trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    row_fraction = trial.suggest_float('subsample', 0.1, 1.0, log=False)

    params = dict([
        ('lambda', reg_l2),
        ('alpha', reg_l1),
        ('subsample', row_fraction),
        ('eval_metric', 'auc'),
    ])
    return cv_score('Pastry', params)


In [18]:
# Maximise the cross-validated AUC returned by `xgb_objective`.
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of hyperparameters instead of a different search every time.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(xgb_objective, n_trials=100)

[I 2024-03-30 13:17:39,634] A new study created in memory with name: no-name-f2307ff3-de3b-48db-8fe8-f1ac78d65ef2
[I 2024-03-30 13:17:42,069] Trial 0 finished with value: 0.7465263493090997 and parameters: {'lambda': 8.804200843118732e-07, 'alpha': 0.006244341711196542, 'subsample': 0.12551689859982285}. Best is trial 0 with value: 0.7465263493090997.
[I 2024-03-30 13:17:44,531] Trial 1 finished with value: 0.8097477526260631 and parameters: {'lambda': 0.00023761941226961716, 'alpha': 0.00016653324434492877, 'subsample': 0.3519288186099533}. Best is trial 1 with value: 0.8097477526260631.
[I 2024-03-30 13:17:46,902] Trial 2 finished with value: 0.7528098325040657 and parameters: {'lambda': 0.5063208523546503, 'alpha': 6.261045193342276e-06, 'subsample': 0.10079745518698413}. Best is trial 1 with value: 0.8097477526260631.
[I 2024-03-30 13:17:49,460] Trial 3 finished with value: 0.8170027557993083 and parameters: {'lambda': 0.040172774935453966, 'alpha': 0.0009602479011805545, 'subsampl

In [20]:
import matplotlib.pyplot as plt
# Only a cell's final expression is rendered automatically, so the first
# figure has to be shown explicitly or it is created and silently discarded.
optuna.visualization.plot_optimization_history(study).show()
# Last expression of the cell: displayed as the cell output.
optuna.visualization.plot_parallel_coordinate(study)

In [21]:
# Call Optuna's `plot_slice` on the study for the three hyperparameters that
# `xgb_objective` samples; as the cell's last expression, the returned object
# is what the cell displays.
optuna.visualization.plot_slice(study, params=['alpha', 'lambda', 'subsample'])