In [2]:
import os
import numpy as np
from pathlib import Path
import pandas as pd
import concurrent.futures
from functools import partial
import optuna
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, make_union, FeatureUnion, Pipeline
from sklearn.preprocessing import   StandardScaler,RobustScaler, OneHotEncoder, FunctionTransformer, PowerTransformer, QuantileTransformer, \
                                    PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, RFE, RFECV, SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
class Config:
    CONTEST_TAG = 'playground-series-s3e24'
    ROOT_DIR = '../'
    FILE_NAME = 'rf-v1'
    
    def __init__(self):
        self.iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
        self.seed = 42
        self.n_jobs = -1
        self.splits = 5
        self.target_col = 'smoking'
        self.index_col = 'id'
        self.optuna_trials = 30
        if self.iskaggle:
            self.train_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/train.csv')
            self.test_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/test.csv')
            self.submission_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/sample_submission.csv')
        else:
            self.train_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/train.csv')
            self.test_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/test.csv')
            self.submission_path = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}/sample_submission.csv')

        self.default_rf_params =  {   
            'criterion': 'gini',
            'n_jobs': -1, 
            'random_state': self.seed
        }

CFG = Config()

In [4]:
cat_cols = ['Urine protein', 'hearing(left)', 'hearing(right)', 'dental caries']
num_cols = ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp']

In [5]:
train_df = pd.read_csv(CFG.train_path, index_col=CFG.index_col)
test_df = pd.read_csv(CFG.test_path, index_col=CFG.index_col)
X_train  = train_df.drop(CFG.target_col, axis=1).iloc[:10000, :]
y_train  = train_df[CFG.target_col].iloc[:10000]

In [6]:
transformers = [
    ('robust', RobustScaler()),
    ('log', FunctionTransformer(func=np.log1p, feature_names_out=lambda x, features: [f'log_{feature}' for feature in features])), 
    ('power', PowerTransformer()), 
    ('quant', QuantileTransformer()), 
    ('poly', PolynomialFeatures())
]
union = FeatureUnion(transformer_list=transformers)

In [7]:
numerical_transformer = make_pipeline(
    union
)

categorical_transformer = make_pipeline(
    OneHotEncoder()
)

preprocessor  = ColumnTransformer([
    ('cat', categorical_transformer, cat_cols),
    ('num', numerical_transformer, num_cols)
], n_jobs=-1)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(RandomForestClassifier(**CFG.default_rf_params))),
    # ('classifier', RandomForestClassifier(**CFG.default_rf_params))
])

selected_X_train = pipeline.fit_transform(X_train, y_train)

In [8]:
def objective(trial: optuna.Trial, X, y):
    optuna_rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 700, step=100),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini"]),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2", 1]),
    }
    
    rf_params = {**CFG.default_rf_params, **optuna_rf_params}
    pipeline = RandomForestClassifier(**rf_params)
    # pipeline.named_steps['classifier'].set_params(**rf_params)
    scores = cross_val_score(pipeline, X, y, cv=CFG.splits, scoring='roc_auc')
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(partial(objective, X=selected_X_train, y=y_train), n_trials=CFG.optuna_trials, show_progress_bar=True, n_jobs=-1)

[I 2023-11-08 07:57:23,439] A new study created in memory with name: no-name-3ab43d40-7203-4529-9cfc-88184386020c


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2023-11-08 07:58:18,312] Trial 0 finished with value: 0.8452685970984716 and parameters: {'n_estimators': 100, 'max_depth': 31, 'min_samples_split': 142, 'min_samples_leaf': 56, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 0.8452685970984716.
[I 2023-11-08 07:58:19,544] Trial 9 finished with value: 0.8440339490626758 and parameters: {'n_estimators': 100, 'max_depth': 26, 'min_samples_split': 62, 'min_samples_leaf': 2, 'criterion': 'gini', 'max_features': 1}. Best is trial 0 with value: 0.8452685970984716.
[I 2023-11-08 07:59:06,819] Trial 10 finished with value: 0.8481988343602819 and parameters: {'n_estimators': 200, 'max_depth': 44, 'min_samples_split': 108, 'min_samples_leaf': 26, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 10 with value: 0.8481988343602819.
[I 2023-11-08 07:59:11,047] Trial 11 finished with value: 0.8496583912922808 and parameters: {'n_estimators': 200, 'max_depth': 49, 'min_samples_split': 2, 'min_samples_leaf': 12, 

In [None]:
rf_params = {**CFG.default_rf_params, **study.best_params}
pipeline.named_steps['classifier'].set_params(**rf_params)
pipeline.fit(X_train, y_train)

In [None]:
predictions = pipeline.predict_proba(test_df)[:, 1]
sample_submission = pd.read_csv(CFG.submission_path, index_col=CFG.index_col)
sample_submission[CFG.target_col] = predictions
sample_submission.to_csv(f'{CFG.FILE_NAME}_submission.csv')