In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb

In [2]:
# Load Data
# Read the four CSV inputs (train, test, ObesityDataSet, sample submission) in
# that order and bind each resulting DataFrame to its own name.
train_data, test_data, original_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        "/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv",
        "/kaggle/input/playground-series-s4e2/sample_submission.csv",
    )
)


In [3]:
# Merge Data
# Drop the "id" column from the training frame, stack the ObesityDataSet rows
# underneath it, and remove exact duplicate rows from the combined frame.
train_data = pd.concat(
    [train_data.drop(columns=["id"]), original_data],
    ignore_index=True,
).drop_duplicates()


In [4]:
# Feature Scaling
# Every non-object column of the training frame is standardised; the scaler is
# fitted on the training rows only and then applied to both frames.
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])


In [5]:
# Encoding Categorical Variables
# Integer-encode every object-typed feature column (the NObeyesdad target is
# excluded here). The single encoder is re-fitted on each training column and
# the same fitted mapping is then applied to the matching test column.
labelencoder = LabelEncoder()
object_columns = (
    train_data
    .select_dtypes(include='object')
    .columns
    .difference(['NObeyesdad'])
)

for col_name in object_columns:
    # Guard: leave any column that is not object-typed untouched.
    if train_data[col_name].dtypes != 'object':
        continue
    encoded_train = labelencoder.fit_transform(train_data[col_name])
    train_data[col_name] = encoded_train
    test_data[col_name] = labelencoder.transform(test_data[col_name])


In [6]:
# Define Features and Target
# X: training features (everything except the NObeyesdad column).
# y: NObeyesdad encoded as integer class codes; this re-fits `labelencoder`.
# X_test: test features (everything except the "id" column).
X_test = test_data.drop(columns=["id"])
X = train_data.drop(columns=['NObeyesdad'])
y = labelencoder.fit_transform(train_data['NObeyesdad'])



In [7]:
# Define LightGBM model parameters
# Keyword arguments that are unpacked into lgb.LGBMClassifier for every split.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero, and that
# is not set here — confirm whether row subsampling was intended.
lgb_param = dict(
    objective="multiclass",
    metric="multi_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    random_state=42,
    num_class=7,
    learning_rate=0.030962211546832760,
    n_estimators=500,
    lambda_l1=0.009667446568254372,
    lambda_l2=0.04018641437301800,
    max_depth=10,
    colsample_bytree=0.40977129346872643,
    subsample=0.9535797422450176,
    min_child_samples=26,
)


In [8]:
# Define XGBoost model parameters
# Keyword arguments that are unpacked into xgb.XGBClassifier for every split.
# The objective is "multi:softprob" (not "multi:softmax") because the
# cross-validation loop only ever calls predict_proba() on this model, and
# softprob is the objective that outputs per-class probabilities directly.
# NOTE(review): the reg_lambda (L2) / reg_alpha (L1) values below equal
# lgb_param's lambda_l1 / lambda_l2 respectively, so L1 and L2 look swapped
# between the two models — confirm whether that is intentional.
xgb_param = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "verbosity": 0,
    "random_state": 42,
    "num_class": 7,
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    'reg_lambda': 0.009667446568254372,
    'reg_alpha': 0.04018641437301800,
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    'subsample': 0.9535797422450176,
    'min_child_weight': 26
}


In [9]:
# Define Shuffle-Split Cross-Validation
# 10 random 90/10 train/validation splits with a fixed seed.
num_splits = 10
ss = ShuffleSplit(n_splits=num_splits, test_size=0.1, random_state=42)

# Initialize lists to store validation accuracies and predictions
# One entry is appended per split: a scalar accuracy and the model's
# predict_proba output for X_test.
val_accuracies_lgb = []
test_preds_lgb = []
val_accuracies_xgb = []
test_preds_xgb = []


def _fit_and_score(model, X_train, y_train, X_val, y_val, X_test):
    """Fit ``model`` on one split and evaluate it.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train : training features and integer class codes for the split.
    X_val, y_val : held-out features and class codes used for the accuracy.
    X_test : features to produce class probabilities for.

    Returns
    -------
    (val_accuracy, test_proba) : accuracy of the arg-max class on the
        validation rows, and the ``predict_proba`` output for ``X_test``.
    """
    model.fit(X_train, y_train)
    val_labels = np.argmax(model.predict_proba(X_val), axis=1)
    return accuracy_score(y_val, val_labels), model.predict_proba(X_test)


# Perform Shuffle-Split Cross-Validation
for train_index, val_index in ss.split(X):
    # X is a DataFrame (positional .iloc); y is indexed directly by position.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # LightGBM: train, score on the validation rows, predict the test set.
    # No extra `verbose=` keyword is passed: it is an alias of the "verbosity"
    # key already present in lgb_param, so supplying both was contradictory.
    model_lgb = lgb.LGBMClassifier(**lgb_param)
    val_accuracy_lgb, test_pred_proba_lgb = _fit_and_score(
        model_lgb, X_train, y_train, X_val, y_val, X_test
    )
    val_accuracies_lgb.append(val_accuracy_lgb)
    test_preds_lgb.append(test_pred_proba_lgb)

    # XGBoost: same procedure on the same split.
    model_xgb = xgb.XGBClassifier(**xgb_param)
    val_accuracy_xgb, test_pred_proba_xgb = _fit_and_score(
        model_xgb, X_train, y_train, X_val, y_val, X_test
    )
    val_accuracies_xgb.append(val_accuracy_xgb)
    test_preds_xgb.append(test_pred_proba_xgb)


In [10]:
# Average validation accuracy (LightGBM)
# Mean of the per-split validation accuracies collected in the CV loop.
mean_val_accuracy_lgb = np.asarray(val_accuracies_lgb).mean()
print("Mean Validation Accuracy (LightGBM):", mean_val_accuracy_lgb)



Mean Validation Accuracy (LightGBM): 0.915886214442013


In [11]:
# Average predictions on test set (LightGBM)
# Average the per-split probability matrices element-wise (axis 0 runs over
# splits), then take the highest-probability class code for each test row.
mean_test_preds_lgb = np.asarray(test_preds_lgb).mean(axis=0)
final_test_preds_lgb = mean_test_preds_lgb.argmax(axis=1)



In [12]:
# Average validation accuracy (XGBoost)
# Mean of the per-split validation accuracies collected in the CV loop.
mean_val_accuracy_xgb = np.asarray(val_accuracies_xgb).mean()
print("Mean Validation Accuracy (XGBoost):", mean_val_accuracy_xgb)



Mean Validation Accuracy (XGBoost): 0.9173741794310721


In [13]:
# Average predictions on test set (XGBoost)
# Average the per-split probability matrices element-wise (axis 0 runs over
# splits), then take the highest-probability class code for each test row.
mean_test_preds_xgb = np.asarray(test_preds_xgb).mean(axis=0)
final_test_preds_xgb = mean_test_preds_xgb.argmax(axis=1)



In [14]:
# Ensemble predictions
# Hard labels of both models side by side (one row per test sample, column 0 =
# LightGBM, column 1 = XGBoost); kept for inspection of disagreements.
ensemble_preds = np.vstack((final_test_preds_lgb, final_test_preds_xgb)).T

# Soft voting: average the two models' mean class-probability matrices and take
# the most probable class per row. A hard majority vote over only two voters
# ties on every disagreement, and argmax(bincount) would then always pick the
# lower class code regardless of how confident either model was. When the two
# models agree, the soft vote returns that same class.
ensemble_proba = (mean_test_preds_lgb + mean_test_preds_xgb) / 2
final_preds = np.argmax(ensemble_proba, axis=1)



In [15]:
# Decode predictions
# Map the integer class codes in `final_preds` back to label values. This
# relies on `labelencoder` having been fitted most recently on the NObeyesdad
# target (when `y` was built), not on one of the feature columns.
pred = labelencoder.inverse_transform(final_preds)



In [16]:
# Create submission file
# Two-column frame: the test "id" values next to the decoded predictions,
# written to CSV without the index column.
submission = test_data[['id']].assign(NObeyesdad=pred)
submission.to_csv('submission_ensemble.csv', index=False)