In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
logging.getLogger('lightgbm').setLevel(logging.INFO)
logging.getLogger('lightgbm').setLevel(logging.ERROR)

In [2]:
# Load the data
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
# Load the original data
original_data = pd.read_csv("/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv")

In [3]:
original_data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Feature Engineering

# Calculate BMI (Body Mass Index)
train_data['BMI'] = train_data['Weight'] / ((train_data['Height'] / 100) ** 2)
test_data['BMI'] = test_data['Weight'] / ((test_data['Height'] / 100) ** 2)

# Calculate the ratio of Frequency of consumption of vegetables to Number of main meals per day
train_data['FCVC_NCP_Ratio'] = train_data['FCVC'] / train_data['NCP']
test_data['FCVC_NCP_Ratio'] = test_data['FCVC'] / test_data['NCP']

# Family history with overweight: Convert to binary indicator (1 for having family history, 0 otherwise)
train_data['family_history_with_overweight'] = train_data['family_history_with_overweight'].apply(lambda x: 1 if x == 'yes' else 0)
test_data['family_history_with_overweight'] = test_data['family_history_with_overweight'].apply(lambda x: 1 if x == 'yes' else 0)



In [5]:
# Interaction features
train_data['Age_BMI'] = train_data['Age'] * train_data['BMI']
train_data['FCVC_NCP'] = train_data['FCVC'] * train_data['NCP']
train_data['Physical_Activity_Sedentary'] = train_data['FAF'] * (1 - train_data['TUE'] / (train_data['TUE'] + 1))

test_data['Age_BMI'] = test_data['Age'] * test_data['BMI']
test_data['FCVC_NCP'] = test_data['FCVC'] * test_data['NCP']
test_data['Physical_Activity_Sedentary'] = test_data['FAF'] * (1 - test_data['TUE'] / (test_data['TUE'] + 1))

# Incorporating domain-specific features (example)
train_data['Protein_Intake'] = np.random.randint(50, 150, size=len(train_data))
test_data['Protein_Intake'] = np.random.randint(50, 150, size=len(test_data))



In [6]:
# Height-Weight Ratio
train_data['Height_Weight_Ratio'] = train_data['Height'] / train_data['Weight']
test_data['Height_Weight_Ratio'] = test_data['Height'] / test_data['Weight']

# Physical Activity Level
train_data['Physical_Activity_Level'] = train_data['FAF'] / (train_data['TUE'] + 1)
test_data['Physical_Activity_Level'] = test_data['FAF'] / (test_data['TUE'] + 1)

# Dietary Patterns
train_data['Dietary_Patterns'] = train_data['FCVC'] * train_data['NCP']
test_data['Dietary_Patterns'] = test_data['FCVC'] * test_data['NCP']

# Hydration Index
train_data['Hydration_Index'] = train_data['CH2O'] - (train_data['TUE'] * 0.5)
test_data['Hydration_Index'] = test_data['CH2O'] - (test_data['TUE'] * 0.5)

# Lifestyle Habits
train_data['Lifestyle_Habits'] = (train_data['SMOKE'] + train_data['CALC']) * (train_data['MTRANS'].astype(str).apply(lambda x: len(x)) + 1)
test_data['Lifestyle_Habits'] = (test_data['SMOKE'] + test_data['CALC']) * (test_data['MTRANS'].astype(str).apply(lambda x: len(x)) + 1)



In [7]:
# Combine train and test data for preprocessing
combined_data = pd.concat([train_data.drop(columns=['NObeyesdad']), test_data])

# Age Group: Categorize age into different groups
bins = [0, 18, 30, 45, 60, np.inf]
labels = ['Child', 'Young Adult', 'Adult', 'Middle-aged', 'Senior']
combined_data['Age_Group'] = pd.cut(combined_data['Age'], bins=bins, labels=labels)

# BMI Category: Categorize BMI into different weight status groups
bmi_bins = [0, 18.5, 25, 30, np.inf]
bmi_labels = ['Underweight', 'Normal Weight', 'Overweight', 'Obese']
combined_data['BMI_Category'] = pd.cut(combined_data['BMI'], bins=bmi_bins, labels=bmi_labels)



In [8]:
# Preprocessing

# Convert categorical variables to one-hot encoding
combined_data = pd.get_dummies(combined_data, drop_first=True)

In [9]:
# Split back into train and test data
X_train = combined_data[:len(train_data)]
X_test = combined_data[len(train_data):]
y_train = train_data['NObeyesdad']



In [10]:
# Mapping target labels to integers
label_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
y_train = y_train.map(label_mapping)



In [11]:
# Train-test split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [12]:
# Define LightGBM dataset
lgbm_train_dataset = X_train_split
lgbm_val_dataset = X_val_split


In [13]:
# Objective function for Optuna for LightGBM
def lgbm_objective(trial):
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 7,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }

    model = LGBMClassifier(**params)
    model.fit(lgbm_train_dataset, y_train_split, eval_set=[(lgbm_val_dataset, y_val_split)])
    preds = model.predict(lgbm_val_dataset)
    accuracy = accuracy_score(y_val_split, preds)
    return 1.0 - accuracy

In [14]:
# Optimize hyperparameters for LightGBM
lgbm_study = optuna.create_study(direction='minimize')
lgbm_study.optimize(lgbm_objective, n_trials=20)

[I 2024-02-23 16:08:49,652] A new study created in memory with name: no-name-d9c303ed-6d75-464c-8c1c-166f9fc51ef7
[I 2024-02-23 16:08:53,525] Trial 0 finished with value: 0.10814065510597304 and parameters: {'lambda_l1': 1.0518246262181696, 'lambda_l2': 3.631000339062156e-07, 'num_leaves': 195, 'learning_rate': 0.014361678271443037, 'feature_fraction': 0.7662604711604587, 'bagging_fraction': 0.7728786007757364, 'bagging_freq': 6, 'min_child_samples': 100}. Best is trial 0 with value: 0.10814065510597304.
[I 2024-02-23 16:08:54,819] Trial 1 finished with value: 0.33501926782273606 and parameters: {'lambda_l1': 1.6539971832211708, 'lambda_l2': 0.00663817479827543, 'num_leaves': 3, 'learning_rate': 0.0013482271081730393, 'feature_fraction': 0.31218040071349407, 'bagging_fraction': 0.2417781364918458, 'bagging_freq': 5, 'min_child_samples': 77}. Best is trial 0 with value: 0.10814065510597304.
[I 2024-02-23 16:08:57,083] Trial 2 finished with value: 0.12018304431599225 and parameters: {'la

In [15]:
# Get best hyperparameters for LightGBM
best_lgbm_params = lgbm_study.best_params

# Train final LightGBM model with best hyperparameters
final_lgbm_model = LGBMClassifier(**best_lgbm_params)
final_lgbm_model.fit(lgbm_train_dataset, y_train_split, eval_set=[(lgbm_val_dataset, y_val_split)])



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4738
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 52
[LightGBM] [Info] Start training from score -2.117117
[LightGBM] [Info] Start training from score -1.911230
[LightGBM] [Info] Start training from score -2.145531
[LightGBM] [Info] Start training from score -2.112625
[LightGBM] [Info] Start training from score -1.948141
[LightGBM] [Info] Start training from score -1.857720
[LightGBM] [Info] Start training from score -1.633574


In [16]:
# Make predictions using LightGBM
lgbm_test_preds = final_lgbm_model.predict(X_test)





In [17]:
# Mapping integer predictions back to original labels
inverse_label_mapping = {v: k for k, v in label_mapping.items()}
lgbm_test_pred_labels = [inverse_label_mapping[int(label)] for label in lgbm_test_preds]



In [18]:
# Create submission file for LightGBM
lgbm_submission = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': lgbm_test_pred_labels})
lgbm_submission.to_csv('lgbm_submission.csv', index=False)