In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier


In [2]:
# Load Data
# Read, in order: the competition training split, the competition test split,
# the original obesity dataset (merged into the training rows further down),
# and the sample submission template.
train_data, test_data, original_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        "/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv",
        "/kaggle/input/playground-series-s4e2/sample_submission.csv",
    )
)


In [3]:
# Merge Data
# Append the original dataset to the competition training rows and drop exact
# duplicate rows.
#  - errors="ignore": the "id" column is removed on the first run; without this
#    flag, re-running the cell raises KeyError because "id" is already gone.
#    Re-running right after the first run re-appends original_data, and
#    drop_duplicates removes those rows again, so the result is the same.
#  - reset_index(drop=True): drop_duplicates keeps the surviving rows' old
#    labels, leaving gaps; this restores a contiguous 0..n-1 index.
train_data = (
    pd.concat(
        [train_data.drop(columns="id", errors="ignore"), original_data],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)


In [4]:
# Feature Scaling
# Standardise every non-object column. The scaler's statistics are learned from
# the training frame only and the same fitted scaler is then applied to both the
# training and the test frame.
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()
scaler = StandardScaler()
scaler.fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [5]:
# Encoding Categorical Variables
# Integer-encode every object-typed feature column of the training frame and
# apply the same mapping to the test frame. The target 'NObeyesdad' is excluded
# here; it is encoded separately when the target vector is built.
labelencoder = LabelEncoder()  # kept bound: later cells refit this instance on the target
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

# One fitted encoder per feature column, so each column's category -> code
# mapping stays available instead of being overwritten by the next column's fit.
# (object_columns already contains only object-typed columns, so no per-column
# dtype check is needed inside the loop.)
feature_encoders = {}
for col_name in object_columns:
    col_encoder = LabelEncoder()
    train_data[col_name] = col_encoder.fit_transform(train_data[col_name])
    # Test rows reuse the mapping learned from the training column.
    test_data[col_name] = col_encoder.transform(test_data[col_name])
    feature_encoders[col_name] = col_encoder



In [6]:
# Define Features and Target
# X: every training column except the label.
# y: the label column converted to integer class ids by refitting `labelencoder`
#    on it (so `labelencoder` holds the target mapping from here on).
# X_test: the test frame without its "id" column.
target_column = train_data['NObeyesdad']
X = train_data.drop(columns=['NObeyesdad'])
y = labelencoder.fit_transform(target_column)
X_test = test_data.drop(columns=["id"])


In [7]:
# Define LightGBM model parameters
# Keyword arguments that are unpacked into lgb.LGBMClassifier(...) in the
# cross-validation cell.
# NOTE(review): `subsample` is set but no `subsample_freq` / `bagging_freq` is
# given; LightGBM's parameter docs describe bagging as active only when the
# frequency is non-zero -- confirm that row subsampling has the intended effect.
lgb_param = dict(
    # Task definition and logging
    objective="multiclass",
    metric="multi_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    random_state=42,
    num_class=7,
    # Boosting schedule
    learning_rate=0.030962211546832760,
    n_estimators=500,
    # L1 / L2 regularisation
    lambda_l1=0.009667446568254372,
    lambda_l2=0.04018641437301800,
    # Tree shape and sampling
    max_depth=10,
    colsample_bytree=0.40977129346872643,
    subsample=0.9535797422450176,
    min_child_samples=26,
)



In [8]:
# Define XGBoost model parameters
# Keyword arguments that are unpacked into xgb.XGBClassifier(...) in the
# cross-validation cell.
xgb_param = {
    # The pipeline only ever calls predict_proba() on this model and averages the
    # class probabilities across splits, so use the probability-producing
    # multiclass objective. "multi:softmax" is the label-producing variant, and
    # getting probabilities from it relies on version-specific handling in the
    # scikit-learn wrapper.
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "verbosity": 0,
    "random_state": 42,
    "num_class": 7,
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    # NOTE(review): these two values mirror lambda_l1 / lambda_l2 from the
    # LightGBM parameters but attached to the opposite penalty (reg_lambda is the
    # L2 term, reg_alpha the L1 term) -- confirm this is intended.
    'reg_lambda': 0.009667446568254372,
    'reg_alpha': 0.04018641437301800,
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    'subsample': 0.9535797422450176,
    'min_child_weight': 26
}

In [9]:
# Define CatBoost model parameters
# Keyword arguments that are unpacked into CatBoostClassifier(...) in the
# cross-validation cell.
catboost_param = dict(
    # Task definition
    objective='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    # Boosting schedule
    learning_rate=0.030962211546832760,
    n_estimators=500,
    # Regularisation and feature binning
    l2_leaf_reg=0.009667446568254372,
    border_count=128,
    # Tree shape and sampling
    depth=10,
    colsample_bylevel=0.40977129346872643,
    subsample=0.9535797422450176,
    min_child_samples=26,
    # Bootstrap scheme is chosen explicitly rather than left to the default
    bootstrap_type='MVS',
)

In [10]:
# Define Shuffle-Split Cross-Validation
# Five random splits, each holding out 10% of the rows for validation, with a
# fixed seed so the splits are reproducible.
num_splits = 5
ss = ShuffleSplit(n_splits=num_splits, test_size=0.1, random_state=42)

# Per-model accumulators: one validation accuracy and one test-set probability
# matrix are appended for every split.
val_accuracies_lgb = []
test_preds_lgb = []
val_accuracies_xgb = []
test_preds_xgb = []
val_accuracies_catboost = []
test_preds_catboost = []


def _fit_and_score(model, X_train, y_train, X_val, y_val, X_test):
    """Fit ``model`` on one split, score it on the hold-out rows, predict the test set.

    Parameters
    ----------
    model
        Unfitted classifier exposing ``fit`` and ``predict_proba``.
    X_train, y_train
        Features and encoded labels used for fitting.
    X_val, y_val
        Hold-out features and encoded labels used for the accuracy estimate.
    X_test
        Test features to predict class probabilities for.

    Returns
    -------
    tuple
        ``(val_accuracy, test_proba)``: accuracy of the arg-max class on the
        hold-out rows, and the ``predict_proba`` output for ``X_test``.
    """
    model.fit(X_train, y_train)
    # The predicted class is the column index with the highest probability.
    val_labels = np.argmax(model.predict_proba(X_val), axis=1)
    return accuracy_score(y_val, val_labels), model.predict_proba(X_test)


# Perform Shuffle-Split Cross-Validation
for train_index, val_index in ss.split(X):
    # X is indexed positionally; y is a NumPy array aligned with X's row order.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # LightGBM. No extra `verbose=` is passed: lgb_param already sets
    # "verbosity", and LightGBM's parameter docs list `verbose` as an alias of
    # it, so supplying both gives two conflicting values for one setting.
    model_lgb = lgb.LGBMClassifier(**lgb_param)
    val_accuracy_lgb, test_pred_proba_lgb = _fit_and_score(
        model_lgb, X_train, y_train, X_val, y_val, X_test
    )
    val_accuracies_lgb.append(val_accuracy_lgb)
    test_preds_lgb.append(test_pred_proba_lgb)

    # XGBoost
    model_xgb = xgb.XGBClassifier(**xgb_param)
    val_accuracy_xgb, test_pred_proba_xgb = _fit_and_score(
        model_xgb, X_train, y_train, X_val, y_val, X_test
    )
    val_accuracies_xgb.append(val_accuracy_xgb)
    test_preds_xgb.append(test_pred_proba_xgb)

    # CatBoost (verbose=100 is passed here in addition to catboost_param)
    model_catboost = CatBoostClassifier(**catboost_param, verbose=100)
    val_accuracy_catboost, test_pred_proba_catboost = _fit_and_score(
        model_catboost, X_train, y_train, X_val, y_val, X_test
    )
    val_accuracies_catboost.append(val_accuracy_catboost)
    test_preds_catboost.append(test_pred_proba_catboost)


0:	learn: 1.8248771	total: 148ms	remaining: 1m 14s
100:	learn: 0.3195379	total: 5.68s	remaining: 22.4s
200:	learn: 0.2111214	total: 11.2s	remaining: 16.7s
300:	learn: 0.1598129	total: 16.4s	remaining: 10.8s
400:	learn: 0.1258316	total: 21.6s	remaining: 5.32s
499:	learn: 0.1014154	total: 26.8s	remaining: 0us
0:	learn: 1.8256615	total: 51ms	remaining: 25.4s
100:	learn: 0.3206345	total: 5.34s	remaining: 21.1s
200:	learn: 0.2089068	total: 10.5s	remaining: 15.7s
300:	learn: 0.1587370	total: 15.6s	remaining: 10.3s
400:	learn: 0.1249904	total: 20.9s	remaining: 5.15s
499:	learn: 0.1012619	total: 26s	remaining: 0us
0:	learn: 1.8249242	total: 50.4ms	remaining: 25.2s
100:	learn: 0.3188575	total: 5.21s	remaining: 20.6s
200:	learn: 0.2106385	total: 10.3s	remaining: 15.4s
300:	learn: 0.1587905	total: 15.3s	remaining: 10.1s
400:	learn: 0.1253567	total: 20.7s	remaining: 5.1s
499:	learn: 0.1015750	total: 26.1s	remaining: 0us
0:	learn: 1.8270952	total: 54ms	remaining: 27s
100:	learn: 0.3199735	total: 5.

In [11]:
# Average validation accuracy (LightGBM)
# Mean of the per-split validation accuracies collected during cross-validation.
mean_val_accuracy_lgb = np.asarray(val_accuracies_lgb).mean()
print("Mean Validation Accuracy (LightGBM):", mean_val_accuracy_lgb)



Mean Validation Accuracy (LightGBM): 0.9141356673960612


In [12]:
# Average predictions on test set (LightGBM)
# Average the per-split class-probability matrices element-wise, then pick the
# most probable class index for every test row.
mean_test_preds_lgb = np.asarray(test_preds_lgb).mean(axis=0)
final_test_preds_lgb = mean_test_preds_lgb.argmax(axis=1)



In [13]:
# Average validation accuracy (XGBoost)
# Mean of the per-split validation accuracies collected during cross-validation.
mean_val_accuracy_xgb = np.asarray(val_accuracies_xgb).mean()
print("Mean Validation Accuracy (XGBoost):", mean_val_accuracy_xgb)



Mean Validation Accuracy (XGBoost): 0.9149234135667396


In [14]:
# Average predictions on test set (XGBoost)
# Average the per-split class-probability matrices element-wise, then pick the
# most probable class index for every test row.
mean_test_preds_xgb = np.asarray(test_preds_xgb).mean(axis=0)
final_test_preds_xgb = mean_test_preds_xgb.argmax(axis=1)



In [15]:
# Decode predictions (LightGBM)
# Map the predicted class indices back to the original NObeyesdad labels.
# `labelencoder` was last fitted on the target column when `y` was built, so at
# this point its mapping is the target's, not that of any feature column.
pred_lgb = labelencoder.inverse_transform(final_test_preds_lgb)



In [16]:
# Create submission file (LightGBM)
# Two-column frame (test id, decoded label) written to CSV without the index.
submission_lgb = test_data[['id']].assign(NObeyesdad=pred_lgb)
submission_lgb.to_csv('submission_lgb.csv', index=False)



In [17]:
# Decode predictions (XGBoost)
# Map the predicted class indices back to the original NObeyesdad labels.
# `labelencoder` was last fitted on the target column when `y` was built, so at
# this point its mapping is the target's, not that of any feature column.
pred_xgb = labelencoder.inverse_transform(final_test_preds_xgb)



In [18]:
# Create submission file (XGBoost)
# Two-column frame (test id, decoded label) written to CSV without the index.
submission_xgb = test_data[['id']].assign(NObeyesdad=pred_xgb)
submission_xgb.to_csv('submission_xgb.csv', index=False)

In [19]:
# Average validation accuracy (CatBoost)
# Mean of the per-split validation accuracies collected during cross-validation.
mean_val_accuracy_catboost = np.asarray(val_accuracies_catboost).mean()
print("Mean Validation Accuracy (CatBoost):", mean_val_accuracy_catboost)



Mean Validation Accuracy (CatBoost): 0.9051203501094092


In [20]:
# Average predictions on test set (CatBoost)
# Average the per-split class-probability matrices element-wise, then pick the
# most probable class index for every test row.
mean_test_preds_catboost = np.asarray(test_preds_catboost).mean(axis=0)
final_test_preds_catboost = mean_test_preds_catboost.argmax(axis=1)



In [21]:
# Decode predictions (CatBoost)
# Map the predicted class indices back to the original NObeyesdad labels.
# `labelencoder` was last fitted on the target column when `y` was built, so at
# this point its mapping is the target's, not that of any feature column.
pred_catboost = labelencoder.inverse_transform(final_test_preds_catboost)



In [22]:
# Create submission file (CatBoost)
# Two-column frame (test id, decoded label) written to CSV without the index.
submission_catboost = test_data[['id']].assign(NObeyesdad=pred_catboost)
submission_catboost.to_csv('submission_catboost.csv', index=False)