In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb

In [2]:
# Load Data
# Reads train.csv and test.csv from the playground-series-s4e2 input and
# ObesityDataSet.csv from the second input, unpacked in that order.
train_data, test_data, original_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        "/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv",
    )
)


In [3]:
# Merge Data
# The id column is removed from the training frame, the rows of
# original_data are appended below it, and duplicated rows are discarded.
train_data = pd.concat(
    [train_data.drop(columns="id"), original_data],
    ignore_index=True,
).drop_duplicates()


In [4]:
# Feature Scaling
# num_cols is every non-object column of the training frame. The scaler is
# fit on the training rows only; that single fit is then used to transform
# both the training and the test frame.
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])



In [5]:
# Encoding Categorical Variables
# Every object-typed column of the training frame except the NObeyesdad
# target is replaced by integer codes. The one encoder is refit on each
# training column and that fit is reused to encode the same test column.
labelencoder = LabelEncoder()
object_columns = (
    train_data
    .select_dtypes(include='object')
    .columns
    .difference(['NObeyesdad'])
)

for col_name in object_columns:
    # Skip anything that is not (or is no longer) an object column.
    if train_data[col_name].dtypes != 'object':
        continue
    train_data[col_name] = labelencoder.fit_transform(train_data[col_name])
    test_data[col_name] = labelencoder.transform(test_data[col_name])



In [6]:
# Define Features and Target
# labelencoder is refit on the target column here, so after this cell it
# maps between NObeyesdad values and the integer labels stored in y.
y = labelencoder.fit_transform(train_data['NObeyesdad'])
X = train_data.drop(columns='NObeyesdad')
X_test = test_data.drop(columns="id")


In [7]:
# Define LightGBM model parameters
# Keyword arguments later unpacked into lgb.LGBMClassifier.
# NOTE(review): LightGBM documents that bagging (subsample) only takes effect
# together with a positive bagging frequency; none is set here -- confirm
# whether subsample is meant to be active.
lgb_param = dict(
    # task definition and logging
    objective="multiclass",
    metric="multi_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    random_state=42,
    num_class=7,
    # boosting schedule
    learning_rate=0.030962211546832760,
    n_estimators=500,
    # regularisation
    lambda_l1=0.009667446568254372,
    lambda_l2=0.04018641437301800,
    # tree shape and sampling
    max_depth=10,
    colsample_bytree=0.40977129346872643,
    subsample=0.9535797422450176,
    min_child_samples=26,
)




In [8]:
# Define XGBoost model parameters
# Passed as the params dict to xgb.train. With the multi:softmax objective
# XGBoost's predict returns one class label per row rather than per-class
# probabilities (see the XGBoost parameter reference).
xgb_param = dict(
    objective='multi:softmax',
    num_class=7,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    seed=42,
)

In [9]:
# Multi-Level Cross-Validation
# Stratified 5-fold CV. Each fold trains one LightGBM and one XGBoost model on
# the training part, scores both on the held-out part, and stores each
# model's predictions for the full test set.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store validation accuracies and predictions
val_accuracies_lgb = []
val_accuracies_xgb = []
test_preds_lgb = []  # one predict_proba matrix per fold
test_preds_xgb = []  # one Booster.predict output per fold (class labels under multi:softmax)

# Loop-invariant XGBoost inputs: the test matrix and the number of boosting
# rounds are the same for every fold, so they are built once up front.
num_round = 100
dtest = xgb.DMatrix(X_test)

for train_index, val_index in outer_cv.split(X, y):
    # X is a DataFrame (positional .iloc); y is the encoded label array.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Train LightGBM model
    # Logging level comes from lgb_param["verbosity"]. The extra verbose=100
    # that used to be passed here is an alias of the same LightGBM parameter
    # and contradicted it, so it has been dropped.
    model_lgb = lgb.LGBMClassifier(**lgb_param)
    model_lgb.fit(X_train, y_train)

    # Make predictions on validation set using LightGBM
    pred_proba_lgb = model_lgb.predict_proba(X_val)
    y_pred_lgb = np.argmax(pred_proba_lgb, axis=1)
    val_accuracy_lgb = accuracy_score(y_val, y_pred_lgb)
    val_accuracies_lgb.append(val_accuracy_lgb)

    # Train XGBoost model
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val)
    model_xgb = xgb.train(xgb_param, dtrain, num_round)

    # Make predictions on validation set using XGBoost
    # Despite its name, pred_proba_xgb holds float-typed class labels here
    # (multi:softmax objective), hence the plain cast to int.
    pred_proba_xgb = model_xgb.predict(dval)
    y_pred_xgb = np.array(pred_proba_xgb, dtype=int)
    val_accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
    val_accuracies_xgb.append(val_accuracy_xgb)

    # Make predictions on test set using both models
    test_preds_lgb.append(model_lgb.predict_proba(X_test))
    test_preds_xgb.append(model_xgb.predict(dtest))


In [10]:
# Average validation accuracies
# The per-fold scores of each model are reduced to a single mean and printed.
mean_val_accuracy_lgb = np.asarray(val_accuracies_lgb).mean()
mean_val_accuracy_xgb = np.asarray(val_accuracies_xgb).mean()
print("Mean Validation Accuracy (LightGBM):", mean_val_accuracy_lgb)
print("Mean Validation Accuracy (XGBoost):", mean_val_accuracy_xgb)

Mean Validation Accuracy (LightGBM): 0.9141606478441672
Mean Validation Accuracy (XGBoost): 0.9140731013350842


In [11]:
# Average predictions on test set
# LightGBM: every fold contributed a class-probability matrix, so the mean
# over folds is again a probability matrix and argmax selects the class.
mean_test_preds_lgb = np.mean(test_preds_lgb, axis=0)
final_test_preds_lgb = np.argmax(mean_test_preds_lgb, axis=1)

# XGBoost: every fold contributed hard class labels (the booster is trained
# with the multi:softmax objective). Labels are categorical codes, so taking
# their arithmetic mean and truncating to int is meaningless: folds voting
# 0, 6, 6, 6, 6 would average to 4.8 and yield class 4, which no fold
# predicted. A per-sample majority vote across folds is used instead.
fold_labels_xgb = np.asarray(test_preds_xgb).astype(int)  # (n_folds, n_samples)
n_classes = mean_test_preds_lgb.shape[1]
# vote_counts_xgb[i, c] = number of folds that predicted class c for sample i
vote_counts_xgb = (fold_labels_xgb[:, :, None] == np.arange(n_classes)).sum(axis=0)
# argmax returns the first maximum, so ties go to the lowest class label.
final_test_preds_xgb = np.argmax(vote_counts_xgb, axis=1)

# Still defined for any later cell that reads it; it is the raw mean of the
# label codes and is no longer used to derive the final predictions.
mean_test_preds_xgb = np.mean(test_preds_xgb, axis=0)



In [12]:
# Decode predictions
# Both label arrays are passed through labelencoder.inverse_transform.
# NOTE(review): this relies on labelencoder's most recent fit being the one
# on the NObeyesdad target -- confirm no cell refits it in between.
pred_lgb, pred_xgb = (
    labelencoder.inverse_transform(encoded_labels)
    for encoded_labels in (final_test_preds_lgb, final_test_preds_xgb)
)



In [13]:
# Create submission files for both models
# Each file pairs the test ids with one model's decoded predictions and is
# written to the working directory without the DataFrame index.

# LightGBM
submission_lgb = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': pred_lgb})
submission_lgb.to_csv('submission_lgb.csv', index=False)

# XGBoost
submission_xgb = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': pred_xgb})
submission_xgb.to_csv('submission_xgb.csv', index=False)