In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_validate

In [3]:
# Load the train and test CSV files; the first CSV column becomes the row index.
df, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', "data/test.csv")
)
df.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Separate the target column from the predictors; the target stays a one-column frame.
target_column = 'NObeyesdad'
features = df.drop(columns=target_column)
labels = df[target_column].to_frame()

# Columns with a float dtype are treated as the numerical features.
mask_numeric = features.dtypes == float
df_numerical = features.loc[:, mask_numeric]

df_numerical

Unnamed: 0_level_0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473
1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000
2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584
3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721
...,...,...,...,...,...,...,...,...
20753,25.137087,1.766626,114.187096,2.919584,3.000000,2.151809,1.330519,0.196680
20754,18.000000,1.710000,50.000000,3.000000,4.000000,1.000000,2.000000,1.000000
20755,20.101026,1.819557,105.580491,2.407817,3.000000,2.000000,1.158040,1.198439
20756,33.852953,1.700000,83.520113,2.671238,1.971472,2.144838,0.000000,0.973834


In [5]:
# Every column that is not float-typed is treated as categorical.
mask_categorical = features.dtypes != float
df_categorical = features.loc[:, mask_categorical]

# Print the frequency of each level, one categorical column at a time.
for column_name, column_values in df_categorical.items():
    print(column_values.value_counts())

Gender
Female    10422
Male      10336
Name: count, dtype: int64
family_history_with_overweight
yes    17014
no      3744
Name: count, dtype: int64
FAVC
yes    18982
no      1776
Name: count, dtype: int64
CAEC
Sometimes     17529
Frequently     2472
Always          478
no              279
Name: count, dtype: int64
SMOKE
no     20513
yes      245
Name: count, dtype: int64
SCC
no     20071
yes      687
Name: count, dtype: int64
CALC
Sometimes     15066
no             5163
Frequently      529
Name: count, dtype: int64
MTRANS
Public_Transportation    16687
Automobile                3534
Walking                    467
Motorbike                   38
Bike                        32
Name: count, dtype: int64


In [6]:
# Value -> integer code tables, keyed by the column they apply to.
yes_no_codes = {'no':0, 'yes':1}
frequency_codes = {'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3}
label_maps = {
    'Gender': {'Male':0, 'Female':1},
    'family_history_with_overweight': yes_no_codes,
    'FAVC': yes_no_codes,
    'CAEC': frequency_codes,
    'SMOKE': yes_no_codes,
    'SCC': yes_no_codes,
    'CALC': frequency_codes,
}

# label encoding
df_encoded = df_categorical.copy(deep=True)
for column_name, codes in label_maps.items():
    df_encoded[column_name] = df_categorical[column_name].map(codes)

# one-hot encoding of the transport mode, which replaces the raw MTRANS column
df_onehot = pd.get_dummies(df_categorical['MTRANS']).astype(int)
df_encoded = df_encoded.drop(columns='MTRANS')

# concatenate; the last dummy column is implied by the others, so it is left out
df_encoded = pd.concat([df_encoded, df_onehot.iloc[:, :-1]], axis=1)

# numerical columns first, then the encoded categorical ones
df_all_features = pd.concat([df_numerical, df_encoded], axis=1)


In [7]:
def apply_preprocessing(data):
    """Encode a raw feature frame into the numeric layout used for modelling.

    Float-typed columns are passed through unchanged. The columns Gender,
    family_history_with_overweight, FAVC, CAEC, SMOKE, SCC and CALC are
    replaced by integer codes, and MTRANS is replaced by 0/1 indicator
    columns. Any other non-float column is passed through as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. Must contain the eight columns named above; a missing
        one raises ``KeyError``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The float columns followed by the encoded columns, in their original
        order, followed by the indicator columns ``Automobile``, ``Bike``,
        ``Motorbike`` and ``Public_Transportation``.

    Notes
    -----
    The indicator columns are a fixed list rather than "whatever levels occur
    in ``data``, minus the last one". Deriving them from the data gives a
    different column set whenever a split happens not to contain every
    transport mode, which makes train and test frames incompatible.
    ``Walking`` is the reference level and is represented by all four
    indicators being 0; an MTRANS value outside the five known modes is
    encoded the same way.

    Values that are absent from a code table become NaN (``Series.map``).
    """
    features = data.copy(deep=True)

    # numerical vs. categorical split, decided purely by dtype
    mask_numeric = features.dtypes == float
    df_numerical = features.loc[:, mask_numeric]
    df_categorical = features.loc[:, ~mask_numeric]

    # label encoding
    yes_no_codes = {'no': 0, 'yes': 1}
    frequency_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    label_maps = {
        'Gender': {'Male': 0, 'Female': 1},
        'family_history_with_overweight': yes_no_codes,
        'FAVC': yes_no_codes,
        'CAEC': frequency_codes,
        'SMOKE': yes_no_codes,
        'SCC': yes_no_codes,
        'CALC': frequency_codes,
    }
    df_encoded = df_categorical.drop(columns='MTRANS')
    for column_name, codes in label_maps.items():
        df_encoded[column_name] = df_categorical[column_name].map(codes)

    # one-hot encoding with a fixed column set (see Notes); levels that do not
    # occur in this frame get an all-zero column instead of being left out
    mtrans_columns = ['Automobile', 'Bike', 'Motorbike', 'Public_Transportation']
    df_onehot = (
        pd.get_dummies(df_categorical['MTRANS'])
        .astype(int)
        .reindex(columns=mtrans_columns, fill_value=0)
    )

    # concatenate: numerical, label-encoded, then one-hot columns
    df_encoded = pd.concat([df_encoded, df_onehot], axis=1)
    df_all_features = pd.concat([df_numerical, df_encoded], axis=1)

    return df_all_features


In [8]:
# Encode the test features with the same function used to describe the training encoding.
# NOTE: this rebinds df_test to the encoded frame, which no longer has an MTRANS
# column, so re-running this cell without reloading the test CSV raises KeyError.
df_test = apply_preprocessing(df_test)

print('Train columns', df_all_features.columns)
print('Test columns', df_test.columns)

# Index.equals compares both indexes as a whole. The element-wise `==` raises
# ValueError when the two frames have a different number of columns, so the
# assertion itself would never get the chance to fail.
assert df_test.columns.equals(df_all_features.columns), \
    "Encoded train and test frames must have identical columns"


Train columns Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'Automobile', 'Bike', 'Motorbike',
       'Public_Transportation'],
      dtype='object')
Test columns Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'Automobile', 'Bike', 'Motorbike',
       'Public_Transportation'],
      dtype='object')


In [9]:
# Integer code for every target class name (codes follow the listed order, starting at 0).
dict_conversion = {
    class_name: code
    for code, class_name in enumerate([
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ])
}

# Encoded copy of the labels; the original `labels` frame keeps the class names.
labels_encoded = labels.copy(deep=True)
labels_encoded['NObeyesdad'] = labels['NObeyesdad'].map(dict_conversion)
#labels_encoded


In [10]:
from sklearn.model_selection import train_test_split

# Model inputs: the encoded training features and the integer labels as a flat array.
X = df_all_features
y = np.ravel(labels_encoded)

# compare train and test data
X_test = df_test
assert all(X_test.columns == X.columns), "Columns of training and test data must be the same"

n_samples, n_features = X.shape
print('Number of samples', n_samples)
print('Number of featires', n_features)

# Hold out 20% of the rows for validation, stratified by class, with a fixed seed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Number of samples 20758
Number of featires 19


In [11]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights computed from the full label vector `y`
# (this includes the rows held out for validation), stored as {class: weight}.
class_labels = np.unique(y)
class_weights = dict(zip(
    class_labels,
    compute_class_weight('balanced', classes=class_labels, y=y),
))

In [12]:
# Display the {class: weight} mapping.
class_weights

{0: 1.1753581337410113,
 1: 0.962176694168907,
 2: 1.2218494319854023,
 3: 1.1758241758241759,
 4: 1.019047619047619,
 5: 0.9130014074595355,
 6: 0.7329284655038486}

In [16]:
# Baseline models with default hyper-parameters. Each one is seeded so the
# scores reported below can be reproduced on a re-run; the train/validation
# split is already seeded with the same value.
clf1 = GradientBoostingClassifier(random_state=42)
clf2 = RandomForestClassifier(class_weight='balanced', random_state=42)
clf3 = XGBClassifier(random_state=42)

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [17]:
def evaluate_model(clf):
    """Print the accuracy of a fitted classifier on the train and validation splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_validation`` and
    ``y_validation``. The validation accuracy is printed under the label
    'Test score'. When the training accuracy exceeds the validation accuracy
    by more than 0.05, 'Overfitting detected' is printed as well.
    Returns nothing.
    """
    def accuracy(inputs, targets):
        # share of rows whose predicted class equals the true class
        return np.mean(clf.predict(inputs) == targets)

    train_score = accuracy(X_train, y_train)
    validation_score = accuracy(X_validation, y_validation)

    print('Train score', round(train_score, 3))
    print('Test score', round(validation_score, 3))

    accuracy_gap = train_score - validation_score
    if accuracy_gap > 0.05:
        print('Overfitting detected')

In [18]:
# Report each baseline under its display name, in the order they were fitted.
for model_name, model in (
    ('Gradient Boosting', clf1),
    ('Random Forest', clf2),
    ('XGBoost', clf3),
):
    print(model_name)
    evaluate_model(model)


Gradient Boosting
Train score 0.923
Test score 0.903
Random Forest
Train score 1.0
Test score 0.895
Overfitting detected
XGBoost
Train score 0.987
Test score 0.904
Overfitting detected


In [19]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold accuracies on the notebook-level ``X`` and ``y``.

    Notes
    -----
    XGBoost documents ``subsample`` and ``colsample_bytree`` as valid in
    (0, 1], so they are sampled from [0.1, 1] instead of [0, 1]. A learning
    rate of 0 cannot fit anything, so it is sampled from [1e-3, 1] on a log
    scale. The parameter names are unchanged, so ``study.best_trial.params``
    can still be passed straight to ``XGBClassifier``.
    """
    # The suggest_* calls are kept in this order because it determines the
    # key order of trial.params.
    max_depth = trial.suggest_int('max_depth', 4, 10)
    n_estimators = trial.suggest_int('n_estimators', 500, 2000)
    gamma = trial.suggest_float('gamma', 0, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 0, 10)
    # lower bounds are strictly positive: 0 is outside the valid range
    subsample = trial.suggest_float('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1.0)
    # log scale spreads the trials evenly across orders of magnitude
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)

    print('Training the model with', X.shape[1], 'features')

    params = {'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'gamma': gamma,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_depth': max_depth,
              'min_child_weight': min_child_weight,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'eval_metric':'mlogloss'}

    clf = XGBClassifier(**params)

    # 5-fold cross-validation on all labelled rows, scored by accuracy
    cv_results = cross_validate(clf, X, y, cv=5, scoring='accuracy')

    validation_score = np.mean(cv_results['test_score'])

    return validation_score

In [21]:
# Search for the hyper-parameters that maximise the objective, with a budget
# of n_trials=50 and timeout=600 passed to Optuna.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

# Score of the best trial, followed by one line per sampled hyper-parameter.
summary_lines = ["  Value: {}".format(trial.value), "  Params: "]
summary_lines += ["    {}: {}".format(key, value) for key, value in trial.params.items()]
for line in summary_lines:
    print(line)

[I 2024-02-06 15:45:59,508] A new study created in memory with name: no-name-e655aedd-1e36-4a46-9b4d-d1fb07b5f220


Training the model with 19 features


[I 2024-02-06 15:47:44,046] Trial 0 finished with value: 0.8909338186726601 and parameters: {'max_depth': 6, 'n_estimators': 1443, 'gamma': 0.35318659439437683, 'reg_alpha': 0.07371791734032251, 'reg_lambda': 0.3371574568039213, 'min_child_weight': 6, 'subsample': 0.46662012683810106, 'colsample_bytree': 0.20667440703849504, 'learning_rate': 0.7531822705432965}. Best is trial 0 with value: 0.8909338186726601.


Training the model with 19 features


[I 2024-02-06 15:49:17,305] Trial 1 finished with value: 0.9102516676576762 and parameters: {'max_depth': 5, 'n_estimators': 1326, 'gamma': 0.8882688425419403, 'reg_alpha': 0.47416998437788693, 'reg_lambda': 0.7956118668278194, 'min_child_weight': 10, 'subsample': 0.8722254223041621, 'colsample_bytree': 0.18466018987247323, 'learning_rate': 0.10287164868796095}. Best is trial 1 with value: 0.9102516676576762.


Training the model with 19 features


[I 2024-02-06 15:51:05,726] Trial 2 finished with value: 0.9022065857798733 and parameters: {'max_depth': 8, 'n_estimators': 978, 'gamma': 0.7972524142703985, 'reg_alpha': 0.9359361027505799, 'reg_lambda': 0.22751028104564175, 'min_child_weight': 0, 'subsample': 0.4248593185876649, 'colsample_bytree': 0.23292634525275147, 'learning_rate': 0.463323919626892}. Best is trial 1 with value: 0.9102516676576762.


Training the model with 19 features


[I 2024-02-06 16:00:29,610] Trial 3 finished with value: 0.911166958863593 and parameters: {'max_depth': 5, 'n_estimators': 1434, 'gamma': 0.6735796582050424, 'reg_alpha': 0.08819527284734396, 'reg_lambda': 0.2529041594431367, 'min_child_weight': 8, 'subsample': 0.9149142958790631, 'colsample_bytree': 0.3842410623695345, 'learning_rate': 0.024143528275160087}. Best is trial 3 with value: 0.911166958863593.


Number of finished trials:  4
Best trial:
  Value: 0.911166958863593
  Params: 
    max_depth: 5
    n_estimators: 1434
    gamma: 0.6735796582050424
    reg_alpha: 0.08819527284734396
    reg_lambda: 0.2529041594431367
    min_child_weight: 8
    subsample: 0.9149142958790631
    colsample_bytree: 0.3842410623695345
    learning_rate: 0.024143528275160087


In [22]:
# Best sampled hyper-parameters plus the two fixed settings, in that key order.
best_params = {
    **study.best_trial.params,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
}

# Refit on every labelled row, then predict the encoded test features.
clf = XGBClassifier(**best_params)
clf.fit(X, y)

y_pred = clf.predict(X_test)

In [26]:
# Display the parameters the final model was built with.
best_params

{'max_depth': 5,
 'n_estimators': 1434,
 'gamma': 0.6735796582050424,
 'reg_alpha': 0.08819527284734396,
 'reg_lambda': 0.2529041594431367,
 'min_child_weight': 8,
 'subsample': 0.9149142958790631,
 'colsample_bytree': 0.3842410623695345,
 'learning_rate': 0.024143528275160087,
 'eval_metric': 'mlogloss',
 'use_label_encoder': False}

In [24]:
# Invert the label encoding: integer code -> class name.
reverse_dict_conversion = {code: class_name for class_name, code in dict_conversion.items()}

# Fill the sample submission, row by row in file order, with the predicted class names.
df_submission = pd.read_csv("data/sample_submission.csv")
predicted_names = pd.Series(y_pred).map(reverse_dict_conversion)
df_submission['NObeyesdad'] = predicted_names.to_numpy()

df_submission.to_csv('data/submission_03.csv', index=False)
df_submission.head(5)

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
