In [3]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel
import optuna
import xgboost as xgb
import lightgbm as lgbm
import catboost as cat

In [4]:
class paths:
    """Locations of the train and test CSV files, relative to the working directory."""
    # Training table; read into `train_df` in the next cell.
    TRAIN = './data/train.csv'
    # Test table; defined here but not read in the cells shown.
    TEST = './data/test.csv'

In [5]:
# Raw training table; feature_engineering() works on a copy, so this frame stays as loaded.
train_df = pd.read_csv(paths.TRAIN)

In [6]:
from sklearn.decomposition import PCA

def feature_engineering(df):
    """Build the modelling frame: selected raw/engineered columns plus PCA components.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table holding the measurement columns read below. ``id`` and the
        seven fault-label columns are carried through only when present, so the
        function also accepts a frame without labels.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) containing the selected
        feature columns, then whichever of ``id`` / label columns exist in
        ``df`` (in the fixed order listed in ``passthrough_cols``), then
        ``feature_1`` ... ``feature_k`` with the PCA projection of the
        standardised numeric features.

    Raises
    ------
    KeyError
        If a required measurement column is missing from ``df``.

    Notes
    -----
    The standardisation statistics and the PCA are fitted on the frame that is
    passed in. Calling this separately on two frames therefore gives
    projections fitted independently of each other.
    """
    new_df = df.copy()

    # Intermediate quantities are kept as locals; only the derived columns that
    # are actually returned get written onto the frame.
    log_range = (np.log(1 + new_df['Maximum_of_Luminosity'])
                 - np.log(1 + new_df['Minimum_of_Luminosity']))
    log_lum = np.log(new_df['Sum_of_Luminosity'])

    new_df['Log_Avg_Lum'] = log_lum - 2 * new_df['LogOfAreas']
    new_df['Abs_Orientation'] = np.abs(new_df['Orientation_Index'])
    new_df['Norm_Range'] = log_range - new_df['LogOfAreas'] / 2

    # Columns that must exist (raw or engineered above).
    feature_cols = ['LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'TypeOfSteel_A300',
                    'Edges_Index', 'Outside_Global_Index', 'Abs_Orientation',
                    'Steel_Plate_Thickness', 'Luminosity_Index',
                    'Log_Avg_Lum', 'Length_of_Conveyer', 'Empty_Index', 'Norm_Range']

    # Identifier and label columns are passed through when available; the order
    # is fixed so a labelled frame keeps the same column layout as before.
    passthrough_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'id',
                        'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    cols = feature_cols + [c for c in passthrough_cols if c in new_df.columns]

    # Numeric features that feed the PCA (the binary steel-type flag is excluded).
    X_cols = ['LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
              'Edges_Index', 'Outside_Global_Index', 'Abs_Orientation',
              'Steel_Plate_Thickness', 'Luminosity_Index',
              'Log_Avg_Lum', 'Length_of_Conveyer', 'Empty_Index', 'Norm_Range']

    # Standardise each column with its own mean / sample std before projecting.
    X_feats = new_df[X_cols]
    stats = X_feats.describe().T[['mean', 'std']]
    X_normed = (X_feats - stats['mean']) / stats['std']

    # Name the components from the projection's actual width rather than a
    # hard-coded count, so the names always match the number of columns.
    components = PCA().fit_transform(X_normed)
    pca_feats = [f'feature_{i}' for i in range(1, components.shape[1] + 1)]
    new_df[pca_feats] = components

    return new_df[cols + pca_feats]

In [7]:
# Engineered training frame. The label names are taken as the last seven
# columns of the raw table.
df = feature_engineering(train_df)
y_cols = train_df.columns[-7:]

# Split into identifiers, label matrix and feature matrix.
ids = df['id']
y = df.loc[:, y_cols]
X = df.drop(columns=['id'] + list(y_cols))

In [8]:
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)

# NOTE(review): the grouping key is the row `id`; if ids are unique every row
# is its own group — confirm that grouping by id is intended.
trains = []
valids = []
for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')
    # split() yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while X and y carry a default
    # 0..n-1 index.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]
    trains.append((train_X, train_y))
    valids.append((valid_X, valid_y))

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


## XGBoost

In [32]:
from sklearn.metrics import roc_auc_score

def cv_score(params):
    """Cross-validated mean ROC AUC of an XGBoost classifier.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    float
        Per-label AUCs averaged over the labels and over every fold stored in
        the module-level ``trains`` / ``valids`` lists.

    Raises
    ------
    ValueError
        If ``trains`` holds no folds.

    Notes
    -----
    Each validation fold is also handed to ``fit`` as ``eval_set``. When
    ``params`` enables early stopping, the stopping round is therefore chosen
    on the same rows that are scored afterwards.
    """
    # Take the fold count from the prepared folds instead of repeating the
    # splitter's n_splits here, so the average stays correct if it changes.
    n_folds = len(trains)
    if n_folds == 0:
        raise ValueError('no cross-validation folds available in `trains`')

    auc_sum = 0.0  # becomes a per-label array after the first fold
    for fold in range(n_folds):
        X_tr, y_tr = trains[fold]
        X_val, y_val = valids[fold]

        # Fit model
        model = xgb.XGBClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=0)

        # Per-label AUC on the held-out fold (average=None keeps one score per label)
        preds = model.predict_proba(X_val)
        fold_aucs = np.asarray(roc_auc_score(y_val, preds, multi_class='ovr', average=None))
        auc_sum = auc_sum + fold_aucs

    return auc_sum.mean() / n_folds

def xgb_objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and return their CV score.

    The tree count and all four float parameters are sampled on a log scale;
    the early-stopping patience is sampled as a plain integer in [3, 12].
    The sampled dictionary is scored with ``cv_score``.
    """
    def log_float(name, low, high):
        # Log-uniform float suggestion, shared by every float parameter below.
        return trial.suggest_float(name, low, high, log=True)

    params = {}
    params['n_estimators'] = trial.suggest_int('n_estimators', 100, 1000, log=True)
    params['early_stopping_rounds'] = trial.suggest_int('early_stopping_rounds', 3, 12)

    float_space = (
        ('learning_rate', 5e-3, 1e-1),
        ('lambda', 1e-6, 1e-2),
        ('alpha', 1e-2, 1e2),
        ('subsample', 0.2, 1.0),
    )
    for name, low, high in float_space:
        params[name] = log_float(name, low, high)

    return cv_score(params)

In [33]:
# Search for the XGBoost settings that maximise the cross-validated AUC
# returned by xgb_objective, over 100 trials.
# No sampler seed is passed, so the sampled trials can differ between runs.
study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=100)

[I 2024-03-30 17:07:27,998] A new study created in memory with name: no-name-be12cad1-330b-4254-a598-69d39b7d969d
[I 2024-03-30 17:08:54,762] Trial 0 finished with value: 0.8796971324735241 and parameters: {'n_estimators': 576, 'early_stopping_rounds': 10, 'learning_rate': 0.005480634560828661, 'lambda': 7.983231160315566e-05, 'alpha': 0.170402375660281, 'subsample': 0.26131091535074075}. Best is trial 0 with value: 0.8796971324735241.
[I 2024-03-30 17:09:55,074] Trial 1 finished with value: 0.8800449617850907 and parameters: {'n_estimators': 526, 'early_stopping_rounds': 10, 'learning_rate': 0.02914702943518748, 'lambda': 0.0004704971453976472, 'alpha': 18.65928684783534, 'subsample': 0.5552719441642882}. Best is trial 1 with value: 0.8800449617850907.
[I 2024-03-30 17:10:17,129] Trial 2 finished with value: 0.8754957773790328 and parameters: {'n_estimators': 156, 'early_stopping_rounds': 12, 'learning_rate': 0.009811660139869486, 'lambda': 6.986134853140149e-06, 'alpha': 0.9429542617

In [34]:
import matplotlib.pyplot as plt
# Optimization history of all trials in the XGBoost study.
optuna.visualization.plot_optimization_history(study)

In [35]:
# Parallel-coordinate view of the sampled hyperparameters against the objective.
optuna.visualization.plot_parallel_coordinate(study)

In [39]:
# Slice plots: objective value against each listed hyperparameter, one panel per parameter.
optuna.visualization.plot_slice(study, params=['alpha', 'lambda', 'subsample',
                                               'early_stopping_rounds',
                                               'n_estimators',
                                               'learning_rate'])

In [38]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [69]:
# Best XGBoost hyperparameters found; the combination section further down
# rebinds `study`, so this must run before that cell to show the XGBoost result.
print(study.best_params)

{'n_estimators': 539, 'early_stopping_rounds': 10, 'learning_rate': 0.010053949164614676, 'lambda': 0.008225508298397766, 'alpha': 0.2111238039606713, 'subsample': 0.7209550358948689}


## Random Forest

In [62]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

def cv_rf_score(params):
    """Cross-validated mean ROC AUC of a random-forest classifier.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``RandomForestClassifier``.

    Returns
    -------
    float
        Per-label AUCs averaged over the labels and over every fold stored in
        the module-level ``trains`` / ``valids`` lists.

    Raises
    ------
    ValueError
        If ``trains`` holds no folds.
    """
    # Take the fold count from the prepared folds instead of repeating the
    # splitter's n_splits here, so the average stays correct if it changes.
    n_folds = len(trains)
    if n_folds == 0:
        raise ValueError('no cross-validation folds available in `trains`')

    auc_sum = 0.0  # becomes a per-label array after the first fold
    for fold in range(n_folds):
        X_tr, y_tr = trains[fold]
        X_val, y_val = valids[fold]

        # Fit model
        model = RandomForestClassifier(**params)
        model.fit(X_tr, y_tr)

        # predict_proba output is stacked into a 3-D array; index 1 on the last
        # axis keeps one probability column per label, and the transpose puts
        # samples on rows so the shape lines up with y_val.
        preds = np.array(model.predict_proba(X_val))[:, :, 1].T
        fold_aucs = np.asarray(roc_auc_score(y_val, preds, multi_class='ovr', average=None))
        auc_sum = auc_sum + fold_aucs

    return auc_sum.mean() / n_folds

def rf_objective(trial):
    """Optuna objective: sample random-forest hyperparameters and return their CV score.

    Tree count, maximum depth and the per-tree sample fraction are all sampled
    on a log scale; ``class_weight`` is fixed to ``'balanced'``. The sampled
    dictionary is scored with ``cv_rf_score``.
    """
    n_trees = trial.suggest_int('n_estimators', 50, 1000, log=True)
    depth = trial.suggest_int('max_depth', 5, 50, log=True)
    sample_fraction = trial.suggest_float('max_samples', 1e-3, 1, log=True)

    return cv_rf_score({
        'n_estimators': n_trees,
        'max_depth': depth,
        'max_samples': sample_fraction,
        'class_weight': 'balanced',
    })

In [63]:
# Search for the random-forest settings that maximise the cross-validated AUC
# returned by rf_objective, over 50 trials.
# No sampler seed is passed, so the sampled trials can differ between runs.
rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(rf_objective, n_trials=50)

[I 2024-03-30 20:31:33,446] A new study created in memory with name: no-name-12623f6b-ea27-40aa-85a6-67f4a9e65862
[I 2024-03-30 20:31:55,861] Trial 0 finished with value: 0.8634212690146436 and parameters: {'n_estimators': 529, 'max_depth': 6, 'max_samples': 0.013801811087332538}. Best is trial 0 with value: 0.8634212690146436.
[I 2024-03-30 20:32:17,946] Trial 1 finished with value: 0.8730867647047053 and parameters: {'n_estimators': 232, 'max_depth': 10, 'max_samples': 0.11155986566957342}. Best is trial 1 with value: 0.8730867647047053.
[I 2024-03-30 20:32:56,055] Trial 2 finished with value: 0.8755434641646135 and parameters: {'n_estimators': 178, 'max_depth': 12, 'max_samples': 0.3175155932895404}. Best is trial 2 with value: 0.8755434641646135.
[I 2024-03-30 20:32:58,688] Trial 3 finished with value: 0.8431669111770509 and parameters: {'n_estimators': 59, 'max_depth': 16, 'max_samples': 0.00857076003713293}. Best is trial 2 with value: 0.8755434641646135.
[I 2024-03-30 20:33:27,0

In [65]:
# Optimization history of all trials in the random-forest study.
optuna.visualization.plot_optimization_history(rf_study)

In [66]:
# Parallel-coordinate view of the sampled random-forest hyperparameters against the objective.
optuna.visualization.plot_parallel_coordinate(rf_study)

In [67]:
# Slice plots: objective value against each random-forest hyperparameter, one panel per parameter.
optuna.visualization.plot_slice(rf_study, params=['n_estimators', 
                                                  'max_depth', 
                                                  'max_samples'])

In [68]:
# Estimated importance of each random-forest hyperparameter for the objective value.
optuna.visualization.plot_param_importances(rf_study)

## Combination

In [73]:
from sklearn.metrics import roc_auc_score

def cv_score(xgb_params, rf_params=None, xgb_weight=1.0):
    """Cross-validated mean ROC AUC of an XGBoost / random-forest blend.

    This definition replaces the earlier one-argument ``cv_score`` from the
    XGBoost section. ``rf_params`` and ``xgb_weight`` have defaults so that a
    one-argument call ``cv_score(params)`` still scores XGBoost alone, which
    keeps ``xgb_objective`` usable after this cell has run.

    Parameters
    ----------
    xgb_params : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.
    rf_params : dict or None, optional
        Keyword arguments forwarded unchanged to ``RandomForestClassifier``.
        When ``None`` no forest is fitted and the XGBoost predictions are
        scored as they are.
    xgb_weight : float, optional
        Weight of the XGBoost probabilities in the blend; the forest gets
        ``1 - xgb_weight``. Ignored when ``rf_params`` is ``None``.

    Returns
    -------
    float
        Per-label AUCs averaged over the labels and over every fold stored in
        the module-level ``trains`` / ``valids`` lists.

    Raises
    ------
    ValueError
        If ``trains`` holds no folds.

    Notes
    -----
    Each validation fold is also handed to the XGBoost ``fit`` as ``eval_set``.
    When ``xgb_params`` enables early stopping, the stopping round is chosen on
    the same rows that are scored afterwards.
    """
    # Take the fold count from the prepared folds instead of repeating the
    # splitter's n_splits here, so the average stays correct if it changes.
    n_folds = len(trains)
    if n_folds == 0:
        raise ValueError('no cross-validation folds available in `trains`')

    blend = rf_params is not None
    rf_weight = 1 - xgb_weight

    auc_sum = 0.0  # becomes a per-label array after the first fold
    for fold in range(n_folds):
        X_tr, y_tr = trains[fold]
        X_val, y_val = valids[fold]

        # Fit models
        xgb_model = xgb.XGBClassifier(**xgb_params)
        xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=0)

        if blend:
            rf_model = RandomForestClassifier(**rf_params)
            rf_model.fit(X_tr, y_tr)

        # Predict on the held-out fold
        preds = xgb_model.predict_proba(X_val)
        if blend:
            # The forest's predict_proba output is stacked into a 3-D array;
            # index 1 on the last axis keeps one probability column per label,
            # and the transpose lines the shape up with the XGBoost output.
            rf_preds = np.array(rf_model.predict_proba(X_val))[:, :, 1].T
            preds = xgb_weight * preds + rf_weight * rf_preds

        fold_aucs = np.asarray(roc_auc_score(y_val, preds, multi_class='ovr', average=None))
        auc_sum = auc_sum + fold_aucs

    return auc_sum.mean() / n_folds

def combo_objective(trial):
    """Optuna objective for the XGBoost / random-forest blend.

    Samples two XGBoost parameters (``alpha``, ``subsample``), three forest
    parameters (``n_estimators``, ``max_depth``, ``max_samples``) and the blend
    weight ``xgb_weight``; everything else is held fixed. The result is the
    score returned by ``cv_score`` for that configuration.
    """
    # Suggestions are drawn in a fixed order: XGBoost first, then the forest,
    # then the blend weight.
    alpha = trial.suggest_float('alpha', 1e-2, 1, log=True)
    subsample = trial.suggest_float('subsample', 0.4, 0.85, log=True)
    xgb_params = {
        'n_estimators': 1000,
        'early_stopping_rounds': 10,
        'learning_rate': 0.01,
        'lambda': 0.008,
        'alpha': alpha,
        'subsample': subsample,
        'n_jobs': -1,
    }

    n_trees = trial.suggest_int('n_estimators', 500, 2000, log=True)
    depth = trial.suggest_int('max_depth', 25, 75, log=True)
    sample_fraction = trial.suggest_float('max_samples', 6e-2, 6e-1, log=True)
    rf_params = {
        'n_estimators': n_trees,
        'max_depth': depth,
        'max_samples': sample_fraction,
        'class_weight': 'balanced',
        'n_jobs': -1,
    }

    blend_weight = trial.suggest_float('xgb_weight', 0, 1)

    return cv_score(xgb_params, rf_params, blend_weight)

In [74]:
# Joint search over the blend: two XGBoost parameters, three random-forest
# parameters and the blend weight, scored by combo_objective over 50 trials.
# NOTE: this rebinds `study`, replacing the XGBoost study object created in
# the XGBoost section; cells that inspect `study` now see this one.
study = optuna.create_study(direction='maximize')
study.optimize(combo_objective, n_trials=50)

[I 2024-03-30 23:16:39,226] A new study created in memory with name: no-name-9be5742b-93e0-4f54-a7bd-8bbdbcab0b63
[I 2024-03-30 23:22:42,766] Trial 0 finished with value: 0.883507049828793 and parameters: {'alpha': 0.010598727834987337, 'subsample': 0.8387836007628174, 'n_estimators': 1885, 'max_depth': 47, 'max_samples': 0.4134091892098071, 'xgb_weight': 0.5710150310178316}. Best is trial 0 with value: 0.883507049828793.
[I 2024-03-30 23:25:32,730] Trial 1 finished with value: 0.8794432244647765 and parameters: {'alpha': 0.014544523147008166, 'subsample': 0.44226098662273866, 'n_estimators': 715, 'max_depth': 26, 'max_samples': 0.20911816963122223, 'xgb_weight': 0.01702087738420177}. Best is trial 0 with value: 0.883507049828793.
[I 2024-03-30 23:32:43,587] Trial 2 finished with value: 0.880518517258319 and parameters: {'alpha': 0.07768541900694854, 'subsample': 0.5231241089639049, 'n_estimators': 1946, 'max_depth': 74, 'max_samples': 0.1692064997419163, 'xgb_weight': 0.09343423885345

KeyboardInterrupt: 