In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_validate, train_test_split

# STATISTICS
from statsmodels.graphics.gofplots import qqplot
from sklearn.utils.class_weight import compute_class_weight

# turn of warning
import warnings
warnings.filterwarnings("ignore")

In [40]:
df_train = pd.read_csv('data/train.csv')
df_original = pd.read_csv('data/ObesityDataSet.csv')
df_test = pd.read_csv('data/test.csv', index_col='id')

df = pd.concat((df_train, df_original), axis=0).drop(['id'], axis=1)

In [41]:
features = df.drop('NObeyesdad', axis=1)
labels = pd.DataFrame(df['NObeyesdad'])

mask_numeric = features.dtypes == float
df_numerical = features.loc[:, mask_numeric]

df_numerical

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,24.443011,1.699998,81.669950,2.000000,2.983297,2.763573,0.000000,0.976473
1,18.000000,1.560000,57.000000,2.000000,3.000000,2.000000,1.000000,1.000000
2,18.000000,1.711460,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584
3,20.952737,1.710730,131.274851,3.000000,3.000000,1.674061,1.467863,0.780199
4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721
...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.000000,3.000000,1.728139,1.676269,0.906247
2107,21.982942,1.748584,133.742943,3.000000,3.000000,2.005130,1.341390,0.599270
2108,22.524036,1.752206,133.689352,3.000000,3.000000,2.054193,1.414209,0.646288
2109,24.361936,1.739450,133.346641,3.000000,3.000000,2.852339,1.139107,0.586035


In [42]:
mask_categorical = features.dtypes != float
df_categorical = features.loc[:, mask_categorical]

for i in range(df_categorical.shape[1]):
    print(df_categorical.iloc[:, i].value_counts())

Gender
Female    11465
Male      11404
Name: count, dtype: int64
family_history_with_overweight
yes    18740
no      4129
Name: count, dtype: int64
FAVC
yes    20848
no      2021
Name: count, dtype: int64
CAEC
Sometimes     19294
Frequently     2714
Always          531
no              330
Name: count, dtype: int64
SMOKE
no     22580
yes      289
Name: count, dtype: int64
SCC
no     22086
yes      783
Name: count, dtype: int64
CALC
Sometimes     16467
no             5802
Frequently      599
Always            1
Name: count, dtype: int64
MTRANS
Public_Transportation    18267
Automobile                3991
Walking                    523
Motorbike                   49
Bike                        39
Name: count, dtype: int64


In [43]:
df_encoded = df_categorical.copy(deep=True)

# label encoding
df_encoded['Gender'] = df_categorical['Gender'].map({'Male':0, 'Female':1})
df_encoded['family_history_with_overweight'] = df_categorical['family_history_with_overweight'].map({'no':0, 'yes':1})
df_encoded['FAVC'] = df_categorical['FAVC'].map({'no':0, 'yes':1})
df_encoded['CAEC'] = df_categorical['CAEC'].map({'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3})
df_encoded['SMOKE'] = df_categorical['SMOKE'].map({'no':0, 'yes':1})
df_encoded['SCC'] = df_categorical['SCC'].map({'no':0, 'yes':1})
df_encoded['CALC'] = df_categorical['CALC'].map({'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3})

# one-hot encoding
df_onehot = pd.get_dummies(df_categorical['MTRANS']).astype(int)
df_encoded.drop('MTRANS', axis=1, inplace=True)

# concatenate
# one feature of df_encoded is redundant; we can remove it
df_encoded = pd.concat([df_encoded, df_onehot.iloc[:, 0:-1]], axis=1)

#df_encoded
df_all_features = pd.concat([df_numerical, df_encoded], axis=1)


In [44]:
def apply_preprocessing(data):
    features = data.copy(deep=True)

    # numerical dataframe
    mask_numeric = features.dtypes == float
    df_numerical = features.loc[:, mask_numeric]

    # categorical dataframe
    mask_categorical = features.dtypes != float
    df_categorical = features.loc[:, mask_categorical]

    # label encoding
    df_encoded = df_categorical.copy(deep=True)
    df_encoded['Gender'] = df_categorical['Gender'].map({'Male':0, 'Female':1})
    df_encoded['family_history_with_overweight'] = df_categorical['family_history_with_overweight'].map({'no':0, 'yes':1})
    df_encoded['FAVC'] = df_categorical['FAVC'].map({'no':0, 'yes':1})
    df_encoded['CAEC'] = df_categorical['CAEC'].map({'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3})
    df_encoded['SMOKE'] = df_categorical['SMOKE'].map({'no':0, 'yes':1})
    df_encoded['SCC'] = df_categorical['SCC'].map({'no':0, 'yes':1})
    df_encoded['CALC'] = df_categorical['CALC'].map({'no':0, 'Sometimes':1, 'Frequently':2, 'Always':3})

    # one-hot encoding
    df_onehot = pd.get_dummies(df_categorical['MTRANS']).astype(int)
    df_encoded.drop('MTRANS', axis=1, inplace=True)

    # concatenate
    # one feature of df_encoded is redundant; we can remove it
    df_encoded = pd.concat([df_encoded, df_onehot.iloc[:, 0:-1]], axis=1)

    df_all_features = pd.concat([df_numerical, df_encoded], axis=1)

    return df_all_features


In [45]:
df_test = apply_preprocessing(df_test)

print('Train columns', df_all_features.columns)
print('Test columns', df_test.columns)

assert all(df_test.columns ==  df_all_features.columns)


Train columns Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'Automobile', 'Bike', 'Motorbike',
       'Public_Transportation'],
      dtype='object')
Test columns Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'Automobile', 'Bike', 'Motorbike',
       'Public_Transportation'],
      dtype='object')


In [46]:
labels['NObeyesdad'].unique()

labels_encoded = labels.copy(deep=True)

dict_conversion = {'Insufficient_Weight':0,
                   'Normal_Weight':1,
                   'Overweight_Level_I':2,
                   'Overweight_Level_II':3,
                   'Obesity_Type_I':4,
                   'Obesity_Type_II':5,
                   'Obesity_Type_III':6}

labels_encoded['NObeyesdad'] = labels_encoded['NObeyesdad'].map(dict_conversion)
#labels_encoded


In [47]:
X = df_all_features
y = np.ravel(labels_encoded)

# compare train and test data
X_test = df_test
assert all(X_test.columns == X.columns), "Columns of training and test data must be the same"

print('Number of samples', len(X))
print('Number of featires', X.shape[1])

X_train, X_validation, y_train, y_validation = train_test_split(X, y,
                                                                test_size=0.2,
                                                                random_state=42,
                                                                stratify=y)

Number of samples 22869
Number of featires 19


In [48]:


# calculate class weights based on the training data
class_weights = compute_class_weight('balanced',
                                     classes=np.unique(y),
                                     y=y)

class_weights = dict(zip(np.unique(y), class_weights))

In [49]:
class_weights

{0: 1.1688729874776387,
 1: 0.9697239536954586,
 2: 1.2024291497975708,
 3: 1.1618065433854907,
 4: 1.0018399264029438,
 5: 0.9215796897038082,
 6: 0.7475972540045767}

In [50]:
clf1 = GradientBoostingClassifier()
clf2 = RandomForestClassifier(class_weight='balanced')
clf3 = XGBClassifier()

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [51]:
def evaluate_model(clf):
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_validation)
    train_score = np.mean(y_pred_train == y_train)
    validation_score = np.mean(y_pred_test == y_validation)

    print('Train score', round(train_score, 3))
    print('Test score', round(validation_score, 3))

    if train_score - validation_score > 0.05:
        print('Overfitting detected')

In [52]:
print('Gradient Boosting')
evaluate_model(clf1)

print('Random Forest')
evaluate_model(clf2)

print('XGBoost')
evaluate_model(clf3)


Gradient Boosting
Train score 0.925
Test score 0.912
Random Forest
Train score 1.0
Test score 0.908
Overfitting detected
XGBoost
Train score 0.988
Test score 0.915
Overfitting detected


In [53]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 4, 10)
    n_estimators = trial.suggest_int('n_estimators', 500, 2000)
    gamma = trial.suggest_float('gamma', 0, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 0, 10)
    subsample = trial.suggest_float('subsample', 0, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
    learning_rate = trial.suggest_float('learning_rate', 0, 1)

    print('Training the model with', X.shape[1], 'features')

    params = {'n_estimators': n_estimators,
              'learning_rate': learning_rate,
              'gamma': gamma,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_depth': max_depth,
              'min_child_weight': min_child_weight,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'eval_metric':'mlogloss'}

    clf = XGBClassifier(**params)

    cv_results = cross_validate(clf, X, y, cv=5, scoring='accuracy')

    validation_score = np.mean(cv_results['test_score'])

    return validation_score

In [54]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-02-07 14:21:13,191] A new study created in memory with name: no-name-043a5544-b723-4829-84ce-85d51e71f2da


Training the model with 19 features


[I 2024-02-07 14:22:19,809] Trial 0 finished with value: 0.9031008416064672 and parameters: {'max_depth': 9, 'n_estimators': 593, 'gamma': 0.8440523497390293, 'reg_alpha': 0.7807865231039206, 'reg_lambda': 0.38470239671907536, 'min_child_weight': 10, 'subsample': 0.5142767549260333, 'colsample_bytree': 0.35515170869620516, 'learning_rate': 0.9017722054372702}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:22:49,236] Trial 1 finished with value: 0.8597229264639668 and parameters: {'max_depth': 4, 'n_estimators': 501, 'gamma': 0.7022943357667949, 'reg_alpha': 0.9435489316422072, 'reg_lambda': 0.43784011596067496, 'min_child_weight': 1, 'subsample': 0.02215628848882878, 'colsample_bytree': 0.7365001660165138, 'learning_rate': 0.4667520726849561}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:25:58,911] Trial 2 finished with value: 0.8760335063003115 and parameters: {'max_depth': 8, 'n_estimators': 1047, 'gamma': 0.9293849562331639, 'reg_alpha': 0.19840982997098489, 'reg_lambda': 0.843108490952849, 'min_child_weight': 5, 'subsample': 0.0974695961481501, 'colsample_bytree': 0.5331861514769544, 'learning_rate': 0.6536108718438252}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:26:50,008] Trial 3 finished with value: 0.8916438772816356 and parameters: {'max_depth': 5, 'n_estimators': 1609, 'gamma': 0.24269498288430624, 'reg_alpha': 0.4929508968829117, 'reg_lambda': 0.2759483198928736, 'min_child_weight': 10, 'subsample': 0.11336744181551273, 'colsample_bytree': 0.005978537719118271, 'learning_rate': 0.5206159000706477}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:28:10,719] Trial 4 finished with value: 0.8932617363699462 and parameters: {'max_depth': 8, 'n_estimators': 1875, 'gamma': 0.9005475195931154, 'reg_alpha': 0.3198647235132195, 'reg_lambda': 0.7482187838975902, 'min_child_weight': 6, 'subsample': 0.22676624678791446, 'colsample_bytree': 0.06113026342459282, 'learning_rate': 0.6651292989488645}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:31:08,300] Trial 5 finished with value: 0.892387801979471 and parameters: {'max_depth': 9, 'n_estimators': 1834, 'gamma': 0.34113488591745655, 'reg_alpha': 0.07454577724370515, 'reg_lambda': 0.6275650338643323, 'min_child_weight': 2, 'subsample': 0.32008072432418977, 'colsample_bytree': 0.27741958143115486, 'learning_rate': 0.7505638224296701}. Best is trial 0 with value: 0.9031008416064672.


Training the model with 19 features


[I 2024-02-07 14:36:34,960] Trial 6 finished with value: 0.9006959443611677 and parameters: {'max_depth': 8, 'n_estimators': 1415, 'gamma': 0.671908884565877, 'reg_alpha': 0.5523817368838536, 'reg_lambda': 0.4016793421392031, 'min_child_weight': 0, 'subsample': 0.30941298201193435, 'colsample_bytree': 0.728864528133102, 'learning_rate': 0.4480377489820614}. Best is trial 0 with value: 0.9031008416064672.


Number of finished trials:  7
Best trial:
  Value: 0.9031008416064672
  Params: 
    max_depth: 9
    n_estimators: 593
    gamma: 0.8440523497390293
    reg_alpha: 0.7807865231039206
    reg_lambda: 0.38470239671907536
    min_child_weight: 10
    subsample: 0.5142767549260333
    colsample_bytree: 0.35515170869620516
    learning_rate: 0.9017722054372702


In [55]:
best_params = study.best_trial.params
best_params['eval_metric'] = 'mlogloss'
best_params['use_label_encoder'] = False

clf = XGBClassifier(**best_params)

clf.fit(X, y)

y_pred = clf.predict(X_test)

In [56]:
best_params

{'max_depth': 9,
 'n_estimators': 593,
 'gamma': 0.8440523497390293,
 'reg_alpha': 0.7807865231039206,
 'reg_lambda': 0.38470239671907536,
 'min_child_weight': 10,
 'subsample': 0.5142767549260333,
 'colsample_bytree': 0.35515170869620516,
 'learning_rate': 0.9017722054372702,
 'eval_metric': 'mlogloss',
 'use_label_encoder': False}

In [57]:
reverse_dict_conversion = dict(zip(dict_conversion.values(), dict_conversion.keys()))

df_submission = pd.read_csv("data/sample_submission.csv")
df_submission['NObeyesdad'] = y_pred
df_submission['NObeyesdad'] = df_submission['NObeyesdad'].map(reverse_dict_conversion)

df_submission.to_csv('data/submission4.csv', index=False)
df_submission.head(5)

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
