This notebook presents my complete end-to-end solution for the competition.
While the model names may be confusing at first, but the workflow is structured and easy to follow.

I document each step of the process, including model training, feature engineering, parameter tuning, and the reasoning behind transitioning from Random Forest to XGBoost.

Multiple models were trained and evaluated, and only those that showed a consistent improvement in performance were retained.

The final and best-performing models are presented at the bottom of the notebook (Model 13 and Model 14).
Readers who are mainly interested in the final solution can jump directly to these sections.

Parameter tuning was guided by the official  
[XGBoost documentation](https://xgboost.readthedocs.io/en/stable/parameter.html).

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV



In [2]:
# reading the training data
df = pd.read_csv('train.csv')

In [4]:
df

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,30.0,175.0,75.0,85.0,1.0,1.2,1.0,1.0,127.0,...,55.0,125.0,15.8,1.0,0.9,18.0,17.0,53.0,0.0,1.0
1,1,50.0,155.0,55.0,73.0,1.0,1.2,1.0,1.0,118.0,...,55.0,101.0,13.3,1.0,0.6,16.0,9.0,16.0,0.0,0.0
2,2,30.0,175.0,60.0,72.0,0.8,0.8,1.0,1.0,119.0,...,74.0,93.0,14.7,1.0,0.9,18.0,13.0,22.0,0.0,0.0
3,3,45.0,155.0,55.0,75.5,1.0,1.2,1.0,1.0,90.0,...,55.0,95.0,13.5,1.0,0.8,17.0,8.0,12.0,0.0,0.0
4,4,60.0,155.0,60.0,81.0,0.7,0.7,1.0,1.0,130.0,...,52.0,88.0,13.4,1.0,0.9,22.0,15.0,21.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,55.0,165.0,70.0,89.0,0.4,1.0,1.0,1.0,124.0,...,51.0,123.0,15.8,1.0,0.8,29.0,26.0,25.0,1.0,0.0
14996,14996,40.0,155.0,55.0,82.0,1.5,1.2,1.0,1.0,108.0,...,57.0,138.0,13.1,1.0,0.8,19.0,12.0,14.0,0.0,0.0
14997,14997,60.0,165.0,70.0,87.0,0.7,0.7,1.0,1.0,120.0,...,66.0,133.0,13.8,1.0,0.7,21.0,19.0,21.0,0.0,0.0
14998,14998,40.0,165.0,55.0,71.0,0.6,0.7,1.0,1.0,121.0,...,77.0,107.0,13.1,1.0,0.6,20.0,13.0,12.0,0.0,0.0


In [None]:
# show the columns of the dataframe
df.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

In [None]:
# check for missing values
df.isna().sum()

id                     0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64

In [None]:
# show the shape of the dataframe
df.shape

(15000, 24)

In [None]:
# show the distribution of the target variable
df['smoking'].value_counts(normalize=True)

smoking
0.0    0.63
1.0    0.37
Name: proportion, dtype: float64

In [3]:
# separating features and target variable
y_train = df['smoking']
X_train = df.drop(['id','smoking'], axis=1)

In [20]:
# reading the test data
test = pd.read_csv('test.csv')

In [None]:
# show the shape of the test dataframe
test.shape

(10000, 23)

In [6]:
# define stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

In [None]:
# StratifiedKFold (skf) with 15 splits on a dataset of 15,000 samples.
# In each fold, a Random Forest model is trained on 19 splits and validated on 1 split,
# the AUC score is computed, and the final result is the average AUC across all folds.
aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_1 = RandomForestClassifier(
        n_estimators=1200,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=10,        
        max_features=0.6,           
        bootstrap=True,
        class_weight='balanced_subsample',
        random_state=42
    )

    model_1.fit(X_tr, y_tr)
    y_val_proba = model_1.predict_proba(X_val)[:, 1]

    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))

Mean CV AUC: 0.8835887315887316


In [None]:
# making predictions on the test set
y_val_proba = model_1.predict_proba(test)[:, 1]
len(y_val_proba)

10000

In [None]:
# preparing the first submission file
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})

In [None]:
# saving the submission file
test_df.to_csv('new_predictions.csv', index=False)

Randomforest takes long time to train, but is this the best model I can have?? <br>
let's try XGBoost

In [13]:
# using the same split, let's train and evaluate the XGBoost model using cross-validation
aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_2 = XGBClassifier(
        n_estimators=900,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.6,
        colsample_bytree=0.6,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=0.5,
        reg_lambda=0,
        min_child_weight=10,
        verbosity=0,
        
        
    )
    model_2.fit(X_tr, y_tr)
    y_val_proba = model_2.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8927103786457611


#### XGBoost gives better rusltus compared to RandomForest

In [None]:
y_val_proba = model_2.predict_proba(test)[:, 1]
len(y_val_proba)

10000

In [158]:
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})

let's try changing the parameters to reach 90+

In [None]:

aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_3 = XGBClassifier(
        n_estimators=1000,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.6,
        colsample_bytree=0.6,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=0.5,
        reg_lambda=0,
        min_child_weight=5,
        verbosity=0,
        
        
    )
    model_3.fit(X_tr, y_tr)
    y_val_proba = model_3.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8925848705848707


In [None]:
y_val_proba = model_3.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)

10000

In [102]:
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})

In [103]:
test_df.to_csv('submission_4.csv', index=False)

In [None]:
""" 
training XGBoost model with gradient_based sampling and cuda support using the same cross-validation setup
 """
aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_4 = XGBClassifier(
        n_estimators=1000,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.6,
        colsample_bytree=0.6,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=0.5,
        reg_lambda=0,
        min_child_weight=5,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',   
        
    )
    model_4.fit(X_tr, y_tr)
    y_val_proba = model_4.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8927218647218648


Changing the model parameters results in a slight improvement in the mean AUC. Since there cuda is used, let increase the folds and other parameter values

In [10]:
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [18]:

aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_5 = XGBClassifier(
        n_estimators=2000,
        max_depth=3,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.6,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=0.5,
        reg_lambda=0,
        min_child_weight=5,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',
        
        
        
        
    )
    model_5.fit(X_tr, y_tr)
    y_val_proba = model_5.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8931953286811041


In [21]:
y_val_proba = model_5.predict_proba(test.drop('id', axis=1))[:, 1]
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('5th_submission.csv', index=False)

let try simple feature engineering 

In [23]:
train_x = X_train.copy()
test_x = test.copy()
""" 
    creting new features: bmi, waist-to-height ratio, pulse pressure
    
    1. BMI (Body Mass Index): weight(kg) / (height(m)^2)
    2.Waist-to-height ratio:  waist circumference / height
    3. Pulse Pressure: systolic - diastolic

"""

train_x['bmi'] = train_x['weight(kg)'] / ((train_x['height(cm)'] / 100) ** 2)
train_x['waist_height_ratio'] = train_x['waist(cm)'] / train_x['height(cm)']
train_x['pulse_pressure'] = train_x['systolic'] - train_x['relaxation']






test_x['bmi'] = test_x['weight(kg)'] / ((test_x['height(cm)'] / 100) ** 2)
test_x['waist_height_ratio'] = test_x['waist(cm)'] / test_x['height(cm)']
test_x['pulse_pressure'] = test_x['systolic'] - test_x['relaxation']


In [None]:
# training the same XGBoost model with new features
aucs = []

for train_idx, val_idx in skf.split(train_x, y_train):
    X_tr, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_6 = XGBClassifier(
        n_estimators=2000,
        max_depth=3,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.6,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=5,
        reg_lambda=0,
        min_child_weight=5,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',        
        
    )
    model_6.fit(X_tr, y_tr)
    y_val_proba = model_6.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8925354405213783


The model with engineered features shows weaker performance. Using the default features and focusing on parameter tuning may lead to better results.

In [22]:
"""
let keep the same model structure but change max_depth to 5, min_child_weight to 7, gamma to 1, reg_lambda to 0.5
"""
aucs = []

for train_idx, val_idx in skf.split(train_x, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_7 = XGBClassifier(
        n_estimators=2000,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=1,
        reg_alpha=0,
        reg_lambda=0.5,
        min_child_weight=7,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',
        
        
        
        
    )
    model_7.fit(X_tr, y_tr)
    y_val_proba = model_7.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8943257184725322


In [None]:
y_val_proba = model_7.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('6th_submission.csv', index=False)

In [None]:
skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=42)
aucs = []

for train_idx, val_idx in skf.split(train_x, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_8 = XGBClassifier(
        n_estimators=2000,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=1,
        reg_lambda=0,
        min_child_weight=7,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',        
    )
    model_8.fit(X_tr, y_tr)
    y_val_proba = model_8.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))



Mean CV AUC: 0.8944291910958577


In [None]:
y_val_proba = model_8.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('9th_submission.csv', index=False)

#### <center> Experiment

let train 100 models with defined parameters and use CV_search to get the best parameters

In [74]:
param_space = {
    "n_estimators": np.arange(700, 2500, 200),
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.01, 0.03,0.05],
    "subsample": [0.3,0.5, 0.6],
    "colsample_bytree": [0.3, 0.5],
    "gamma": [0, 0.3, 0.5,5,7,9,11,15],
    "min_child_weight": [5, 7, 9,11,15],
    "reg_alpha": [0, 0.3, 0.5,5,7,9,11,15],
    "reg_lambda": [0, 0.3, 0.5,5,7,9,11,15]
}


In [75]:
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

results = []

N_MODELS = 100

for model_id in range(N_MODELS):
    print(f"\nTraining model {model_id+1}/{N_MODELS}")

    # Randomly sample parameters
    params = {
        "n_estimators": np.random.choice(param_space["n_estimators"]),
        "max_depth": np.random.choice(param_space["max_depth"]),
        "learning_rate": np.random.choice(param_space["learning_rate"]),
        "subsample": np.random.choice(param_space["subsample"]),
        "colsample_bytree": np.random.choice(param_space["colsample_bytree"]),
        "gamma": np.random.choice(param_space["gamma"]),
        "min_child_weight": np.random.choice(param_space["min_child_weight"]),
        "reg_alpha": np.random.choice(param_space["reg_alpha"]),
        "reg_lambda": np.random.choice(param_space["reg_lambda"]),
    }

    fold_aucs = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = XGBClassifier(
            **params,
            booster="gbtree",
            eval_metric="auc",
            use_label_encoder=False,
            random_state=42 + model_id,
            nthread=-1,
            verbosity=0,
            sampling_method="gradient_based",
            device="cuda",
        )

        model.fit(X_tr, y_tr)
        y_val_proba = model.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, y_val_proba))

    mean_auc = np.mean(fold_aucs)

    results.append({
        "model_id": model_id,
        "mean_cv_auc": mean_auc,
        **params
    })

    print(f"Mean CV AUC: {mean_auc:.6f}")



Training model 1/100
Mean CV AUC: 0.889411

Training model 2/100
Mean CV AUC: 0.891978

Training model 3/100
Mean CV AUC: 0.892846

Training model 4/100
Mean CV AUC: 0.891532

Training model 5/100
Mean CV AUC: 0.887576

Training model 6/100
Mean CV AUC: 0.886527

Training model 7/100
Mean CV AUC: 0.892073

Training model 8/100
Mean CV AUC: 0.884326

Training model 9/100
Mean CV AUC: 0.886349

Training model 10/100
Mean CV AUC: 0.891322

Training model 11/100
Mean CV AUC: 0.882465

Training model 12/100
Mean CV AUC: 0.882334

Training model 13/100
Mean CV AUC: 0.891126

Training model 14/100
Mean CV AUC: 0.886070

Training model 15/100
Mean CV AUC: 0.890259

Training model 16/100
Mean CV AUC: 0.886549

Training model 17/100
Mean CV AUC: 0.891082

Training model 18/100
Mean CV AUC: 0.891996

Training model 19/100
Mean CV AUC: 0.888771

Training model 20/100
Mean CV AUC: 0.881436

Training model 21/100
Mean CV AUC: 0.886082

Training model 22/100
Mean CV AUC: 0.893284

Training model 23/

In [76]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("mean_cv_auc", ascending=False)

best_model_params = results_df.iloc[0]
print("\nBEST MODEL PARAMETERS:")
print(best_model_params)



BEST MODEL PARAMETERS:
model_id              60.000000
mean_cv_auc            0.894084
n_estimators        2100.000000
max_depth              4.000000
learning_rate          0.030000
subsample              0.600000
colsample_bytree       0.300000
gamma                  0.500000
min_child_weight       9.000000
reg_alpha              0.000000
reg_lambda             5.000000
Name: 60, dtype: float64


In [None]:
""" """
aucs = []

for train_idx, val_idx in skf.split(train_x, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_10 = XGBClassifier(
        n_estimators=2100,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=0,
        reg_lambda=5,
        min_child_weight=9,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',
        
        
        
        
    )
    model_10.fit(X_tr, y_tr)
    y_val_proba = model_10.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))

print("Mean CV AUC:", sum(aucs) / len(aucs))

Mean CV AUC: 0.8944465678688738


In [None]:
y_val_proba = model_10.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('13th_submission.csv', index=False)

In [None]:
# model 12: current best model 
skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=42)
aucs = []

for train_idx, val_idx in skf.split(train_x, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_12= XGBClassifier(
        n_estimators=1900,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=1,
        reg_lambda=0,
        min_child_weight=7,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',   
        num_parallel_tree=3,
       # max_leaves=3    
    )
    model_12.fit(X_tr, y_tr)
    y_val_proba = model_12.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))
print("Is the new AUC great than Mean CV AUC: 0.8943729443729445? \n")
print("Mean CV AUC:", sum(aucs) / len(aucs))

Is the new AUC great than Mean CV AUC: 0.8943729443729445? 

Mean CV AUC: 0.8950641117307784


In [106]:
y_val_proba = model_12.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('13th_submission.csv', index=False)

In [9]:
# model 13 current best model
skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=42)
aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_13= XGBClassifier(
        n_estimators=1900,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=1,
        reg_lambda=0,
        min_child_weight=7,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',   
        num_parallel_tree=5,
    )
    model_13.fit(X_tr, y_tr)
    y_val_proba = model_13.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))
print("Is the new AUC great than Mean CV AUC: 0.8950641117307784? \n")
print("Mean CV AUC:", sum(aucs) / len(aucs))

Is the new AUC great than Mean CV AUC: 0.8950641117307784? 

Mean CV AUC: 0.8951422851422852


In [10]:
# Final model on full data
final_model_13 = XGBClassifier(
    n_estimators=1900,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.6,
    colsample_bytree=0.3,
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42,
    gamma=0.5,
    reg_alpha=1,
    reg_lambda=0,
    min_child_weight=7,
    booster="gbtree",
    nthread=-1,
    verbosity=0,
    sampling_method='gradient_based',
    device='cuda',   
    num_parallel_tree=5,
)

final_model_13.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.3
,device,'cuda'
,early_stopping_rounds,
,enable_categorical,False


In [11]:
y_val_proba = final_model_13.predict_proba(test.drop('id', axis=1))[:, 1]
len(y_val_proba)
test_df = pd.DataFrame(
    {'id': test['id'].values, 'smoking': y_val_proba})
test_df.to_csv('27th_submission.csv', index=False)

In [12]:
# model 14
skf = StratifiedKFold(n_splits=30, shuffle=True, random_state=42)
aucs = []

for train_idx, val_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model_14= XGBClassifier(
        n_estimators=1900,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.6,
        colsample_bytree=0.3,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        gamma=0.5,
        reg_alpha=1,
        reg_lambda=0,
        min_child_weight=7,
        booster="gbtree",
        nthread=-1,
        verbosity=0,
        sampling_method='gradient_based',
        device='cuda',   
        num_parallel_tree=5,
    )
    model_14.fit(X_tr, y_tr)
    y_val_proba = model_14.predict_proba(X_val)[:, 1]
    aucs.append(roc_auc_score(y_val, y_val_proba))
print("Is the new AUC great than Mean CV AUC: 0.8950641117307784? \n")
print("Mean CV AUC:", sum(aucs) / len(aucs))

Is the new AUC great than Mean CV AUC: 0.8950641117307784? 

Mean CV AUC: 0.8952930072930073
