In [151]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import joblib

import warnings
warnings.filterwarnings('ignore')

import time

seed = 0

/kaggle/input/all-data-for-obesity-risk-prediction/train.csv
/kaggle/input/all-data-for-obesity-risk-prediction/test.csv
/kaggle/input/all-data-for-obesity-risk-prediction/ObesityDataSet.csv


In [118]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
train = pd.read_csv('/kaggle/input/all-data-for-obesity-risk-prediction/train.csv')
test  = pd.read_csv('/kaggle/input/all-data-for-obesity-risk-prediction/test.csv')
extra = pd.read_csv('/kaggle/input/all-data-for-obesity-risk-prediction/ObesityDataSet.csv')

In [None]:
train.head()

In [None]:
train.shape

In [None]:
extra.head()

In [None]:
extra.shape

No 'id'column in the organic dataset.

In [None]:
test.head()

In [None]:
test.shape

In [None]:
train.isna().sum()

In [None]:
extra.isna().sum()

In [None]:
test.isna().sum()

In [None]:
data = pd.concat([extra, train], axis = 0, ignore_index = True)

In [None]:
data.isna().sum()

We get 2111 null values for 'id' column because 'id' column was missing from the organic dataset.

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.info()

In [None]:
data.drop(columns=['id'], inplace = True)
data

In [None]:
data.columns

In [77]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
data.tail()

In [None]:
num_cols = data._get_numeric_data()
num_cols

In [99]:
def CleanData(data, mode, fit = None):
    
    data['BMI'] =  (data['Weight'] / (data['Height'] ** 2)).astype('float64')
    num_cols_round = data._get_numeric_data().columns
    
    for col in num_cols_round:
        if col == 'id':
            data.drop(columns = col, inplace = True)
            
        elif col == 'Age':
            data[col] = round(data[col]).astype('int64')
            
        elif col == 'Height':
            data[col] = round(data[col], 2)
            
        elif col == 'Weight' or col == 'BMI':
            data[col] = round(data[col], 1)
            
        else:
            data[col] = round(data[col]).astype('int64')
    
    
    sc  = StandardScaler()
    ohe = OneHotEncoder(handle_unknown = 'ignore',
                       drop = 'first',
                       dtype = 'int64',
                       sparse_output = False).set_output(transform = 'pandas')
    
    
    cat_cols = make_column_selector(dtype_include = 'object')
    num_cols = make_column_selector(dtype_exclude = 'object')
    
    col_tx = make_column_transformer(
        (sc, num_cols),
        (ohe, cat_cols),
        remainder = 'passthrough'
    ).set_output(transform = 'pandas')
    
    if mode == 'train':
        fit_obj = col_tx.fit(data)
        data_tx = fit_obj.transform(data)
        
        return data_tx, fit_obj
    
    elif mode == 'test':
        data_tx = fit.transform(data)
        
        return data_tx
    else:
        raise('Mode must be \'train\' or \'test\'!')

In [120]:
le = LabelEncoder()

y = data['NObeyesdad']
X = data.drop(columns = 'NObeyesdad')

# le_fit = le.fit(y)
y = le.fit_transform(y)

In [121]:
x_train, x_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.25, 
                                                    random_state = seed)

In [122]:
x_train_clean, fitObj = CleanData(x_train, mode = 'train')
x_test_clean = CleanData(x_test, mode = 'test', fit = fitObj)

array(['Obesity_Type_III', 'Insufficient_Weight', 'Normal_Weight', ...,
       'Obesity_Type_II', 'Insufficient_Weight', 'Overweight_Level_II'],
      dtype=object)

13386       Obesity_Type_III
322      Insufficient_Weight
19301          Normal_Weight
4263      Overweight_Level_I
22145         Obesity_Type_I
                ...         
11537        Obesity_Type_II
21542         Obesity_Type_I
1644         Obesity_Type_II
11232    Insufficient_Weight
2155     Overweight_Level_II
Name: NObeyesdad, Length: 5718, dtype: object

In [223]:
import optuna

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

In [146]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 2000),
        'max_depth' : trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_smaples_leaf', 1, 10)
    }
    rf_opt = RandomForestClassifier(**params, 
                                    random_state = seed, 
                                    bootstrap = True, 
                                    oob_score = True)
    
    rf_opt.fit(x_train_clean, y_train)
    
    y_pred = rf_opt.predict(x_test_clean)
    score = accuracy_score(y_test, y_pred)
    
    return score

In [None]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 50, show_progress_bar = True)

In [149]:
print('Best trial:', study.best_trial)
print('Best hyperparameters:', study.best_params)

Best trial: FrozenTrial(number=4, state=TrialState.COMPLETE, values=[0.9038125218607905], datetime_start=datetime.datetime(2024, 2, 25, 11, 28, 48, 99956), datetime_complete=datetime.datetime(2024, 2, 25, 11, 29, 17, 863909), params={'n_estimators': 1189, 'max_depth': 22, 'min_samples_split': 5, 'min_smaples_leaf': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=2000, log=False, low=1000, step=1), 'max_depth': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=1), 'min_smaples_leaf': IntDistribution(high=10, log=False, low=1, step=1)}, trial_id=4, value=None)
Best hyperparameters: {'n_estimators': 1189, 'max_depth': 22, 'min_samples_split': 5, 'min_smaples_leaf': 1}


In [150]:
study.best_value

0.9038125218607905

In [154]:
best_rf = RandomForestClassifier(n_estimators = 1189, 
                                 max_depth = 22, 
                                 min_samples_split = 5, 
                                 min_samples_leaf = 1)

best_rf.fit(x_train_clean, y_train)

In [156]:
best_rf_pred = best_rf.predict(x_test_clean)

In [159]:
best_rf_op = le.inverse_transform(best_rf_pred)

In [161]:
test_clean = CleanData(test, mode = 'test', fit = fitObj)

In [162]:
best_rf_pred = best_rf.predict(test_clean)

In [163]:
best_rf_op = le.inverse_transform(best_rf_pred)

In [None]:
# ID = test['id']
test

In [171]:
test1 = pd.read_csv('/kaggle/input/all-data-for-obesity-risk-prediction/test.csv')
ID = test1['id']

In [170]:
rf_op = pd.DataFrame({
    'id': ID,
    'NObeyesdad': best_rf_op
})

rf_op.to_csv('RandomForestOptuna.csv', index=False)

In [176]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 40),
        'max_depth': trial.suggest_int('max_depth', 1, 40),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 40),
        'l2_regularization': trial.suggest_float('l2_regularization', 0, 2)
#         'max_features': trial.suggest_int('max_features', 1, 10)
#         'tol': trial.suggest_float('tol', 1e-7, 0.0001)
    }
    
    hgb_opt = HistGradientBoostingClassifier(**params, random_state = seed)
    hgb_opt.fit(x_train_clean, y_train)
    pred = hgb_opt.predict(x_test_clean)
    
    score = accuracy_score(y_test, pred)
    
    return score

In [187]:
study = optuna.create_study(direction = 'maximize')

start = time.time()
study.optimize(objective, n_trials = 500, show_progress_bar = True)
end = time.time()

print(f'\nFinding best parameters took {round((end-start)/60, 2)} minutes.')

[I 2024-02-25 13:20:58,258] A new study created in memory with name: no-name-6fc9d659-6144-42ab-b4ef-a596b80125b2


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-02-25 13:20:59,327] Trial 0 finished with value: 0.897691500524659 and parameters: {'learning_rate': 0.9059067122226611, 'max_leaf_nodes': 2, 'max_depth': 12, 'min_samples_leaf': 19, 'l2_regularization': 1.2707945396418818}. Best is trial 0 with value: 0.897691500524659.
[I 2024-02-25 13:20:59,890] Trial 1 finished with value: 0.8319342427422176 and parameters: {'learning_rate': 0.9680831039174209, 'max_leaf_nodes': 14, 'max_depth': 38, 'min_samples_leaf': 38, 'l2_regularization': 0.9702616107725968}. Best is trial 0 with value: 0.897691500524659.
[I 2024-02-25 13:21:00,218] Trial 2 finished with value: 0.7670514165792235 and parameters: {'learning_rate': 0.7812497127668075, 'max_leaf_nodes': 8, 'max_depth': 35, 'min_samples_leaf': 3, 'l2_regularization': 0.9011412908501926}. Best is trial 0 with value: 0.897691500524659.
[I 2024-02-25 13:21:00,913] Trial 3 finished with value: 0.8525708289611752 and parameters: {'learning_rate': 0.8190432947589861, 'max_leaf_nodes': 33, 'max_d

In [188]:
print('Best score:', study.best_value)
print('Best params:', study.best_params)

Best score: 0.9078349073102483
Best params: {'learning_rate': 0.18741142617629775, 'max_leaf_nodes': 9, 'max_depth': 9, 'min_samples_leaf': 37, 'l2_regularization': 1.31577084793702}


In [190]:
best_params_hgb = study.best_params
best_hgb = HistGradientBoostingClassifier(**best_params_hgb)

In [193]:
best_hgb.fit(x_train_clean, y_train)
hgb_pred = best_hgb.predict(test_clean)

In [194]:
df_hgb = pd.DataFrame({
    'id': ID,
    'NObeyesdad': le.inverse_transform(hgb_pred)
})

df_hgb.to_csv('HistGradBoostOptuna.csv')

In [195]:
joblib.dump(best_rf, 'RandomForestOptuna.sav')
joblib.dump(best_hgb, 'HistGradBoostOptuna.sav')

['HistGradBoostOptuna.sav']

In [205]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'eta': trial.suggest_float('eta', 1e-4, 1),
        'gamma': trial.suggest_float('gamma', 1e-4, 4),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_float('subsample', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
#         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
#         'colsample_bynode': trial.suggest_float('colsample_bylevel', 0, 1),
        'lambda': trial.suggest_float('lambda', 0, 3)
    }
    
    xgb_opt = XGBClassifier(**params, objective = 'multi:softmax')
    xgb_opt.fit(x_train_clean, y_train)
    
    pred = xgb_opt.predict(x_test_clean)
    score = accuracy_score(y_test, pred)
    
    return score

In [206]:
study = optuna.create_study(direction = 'maximize')
start = time.time()
study.optimize(objective, n_trials = 500, n_jobs = -1, show_progress_bar = True)
end = time.time()

print(f'Tuning finished in {round((end-start)/60, 2)} minutes')

params_xgb = study.best_params

best_xgb = XGBClassifier(**params_xgb, objective = 'multi:softmax')
joblib.dump(best_xgb, 'XGBoostClassifierOptuna.sav')

[I 2024-02-25 14:14:43,778] A new study created in memory with name: no-name-d1783050-6a64-454f-a7aa-eef315532d22


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-02-25 14:15:14,877] Trial 3 finished with value: 0.8992654774396642 and parameters: {'n_estimators': 933, 'eta': 0.1762936401830223, 'gamma': 0.24008337666812352, 'max_depth': 1, 'min_child_weight': 3, 'subsample': 0.8948629786569934, 'colsample_bytree': 0.5595853200787015, 'lambda': 2.973882861888567}. Best is trial 3 with value: 0.8992654774396642.
[I 2024-02-25 14:15:14,908] Trial 0 finished with value: 0.9090591115774747 and parameters: {'n_estimators': 783, 'eta': 0.15510235235703804, 'gamma': 0.24326618658960641, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.9955758556769184, 'colsample_bytree': 0.2735176963293663, 'lambda': 2.398934899129327}. Best is trial 0 with value: 0.9090591115774747.
[I 2024-02-25 14:15:24,296] Trial 2 finished with value: 0.8686603707590066 and parameters: {'n_estimators': 1308, 'eta': 0.8720827293072594, 'gamma': 3.0104946393472902, 'max_depth': 1, 'min_child_weight': 3, 'subsample': 0.01561870977288815, 'colsample_bytree': 0.45211531864

['XGBoostClassifierOptuna.sav']

In [212]:
print('Best score:', study.best_value)
print('Best params:', study.best_params)

Best score: 0.9118572927597062
Best params: {'n_estimators': 1555, 'eta': 0.061457628377228235, 'gamma': 0.5888759353970527, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.7327372979866329, 'colsample_bytree': 0.1844237958173927, 'lambda': 0.3231747649713702}


In [None]:
import optuna.visualization as vis

# Plot optimization history
# vis.plot_optimization_history(study)

# Plot parameter importance
# vis.plot_param_importances(study)

# Plot slice plot
# vis.plot_slice(study, params=["learning_rate", "max_depth"])

# Plot contour plot
vis.plot_contour(study, params=["eta", "subsample"])

# Plot parallel_coordinate
# vis.plot_parallel_coordinate(study)

In [220]:
best_params_xgb = study.best_params
best_xgb = XGBClassifier(**best_params_xgb, objective = 'multi:softmax')
best_xgb.fit(x_train_clean, y_train)

In [221]:
pred_xgb = best_xgb.predict(test_clean)

In [222]:
df_xgb = pd.DataFrame({
    'id': ID,
    'NObeyesdad': le.inverse_transform(pred_xgb)
})

df_xgb.to_csv('XGBoostOptuna.csv')

In [225]:
vclf = VotingClassifier(estimators = [('xgb', best_xgb), ('hgb', best_hgb), ('RandomForest', best_rf)], 
                        voting = 'hard', 
                        n_jobs = -1)

vclf.fit(x_train_clean, y_train)

In [226]:
vclf_pred = vclf.predict(test_clean)

In [228]:
df_vclf = pd.DataFrame({
    'id': ID,
    'NObeyesdad': le.inverse_transform(vclf_pred)
})

df_vclf.to_csv('VotingClassifierOptuna.csv')