In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb


In [2]:
# Load Data
# The competition train/test splits live in the same input folder; the
# original dataset is read from its own folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
    )
)
original_data = pd.read_csv(
    "/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv"
)



In [3]:
# Merge Data
# Drop the "id" column from the training frame, append the rows of
# original_data, renumber the index, then discard exact duplicate rows.
train_data = pd.concat(
    [train_data.drop(columns="id"), original_data],
    ignore_index=True,
).drop_duplicates()



In [4]:
# Feature Scaling
# Every non-object column of the training frame is standardised. The scaler
# is fitted on the training rows only and then applied to both frames.
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])



In [5]:
# Encoding Categorical Variables
# One encoder instance is refitted for each object-typed feature column
# (the target 'NObeyesdad' is excluded here). The mapping learned from the
# training column is reused for the matching test column.
labelencoder = LabelEncoder()
object_columns = (
    train_data
    .select_dtypes(include='object')
    .columns
    .difference(['NObeyesdad'])
)

for col_name in object_columns:
    train_column = train_data[col_name]
    if train_column.dtype != 'object':
        # Only object-typed columns are encoded.
        continue
    train_data[col_name] = labelencoder.fit_transform(train_column)
    test_data[col_name] = labelencoder.transform(test_data[col_name])



In [6]:
# Define Features and Target
# Test features: everything except the row identifier.
X_test = test_data.drop(columns=["id"])
# Training features: everything except the target column.
X = train_data.drop(columns=['NObeyesdad'])
# Refit the shared encoder on the target so that it holds the
# class-label <-> integer-code mapping from here on.
y = labelencoder.fit_transform(train_data['NObeyesdad'])



In [7]:
# Define LightGBM model parameters
# Keyword arguments unpacked into the LightGBM classifier constructor.
lgb_param = dict(
    # Task definition, logging level and seed
    objective="multiclass",
    metric="multi_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    random_state=42,
    num_class=7,
    # Boosting schedule
    learning_rate=0.030962211546832760,
    n_estimators=500,
    # Regularisation
    lambda_l1=0.009667446568254372,
    lambda_l2=0.04018641437301800,
    # Tree shape and sampling
    max_depth=10,
    colsample_bytree=0.40977129346872643,
    # NOTE(review): LightGBM documents row subsampling as active only when a
    # bagging frequency (subsample_freq / bagging_freq) is non-zero, and none
    # is set here. Confirm whether `subsample` is meant to take effect.
    subsample=0.9535797422450176,
    min_child_samples=26,
)



In [8]:
# Define Stratified K-Fold Cross-Validation
# The splitter is stratified on y, and shuffling is seeded so the fold
# assignment is reproducible.
num_splits = 19
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold validation accuracies and per-fold test-set class probabilities
val_accuracies_lgb = []
test_preds_lgb = []

# Perform Stratified K-Fold Cross-Validation
for train_index, val_index in skf.split(X, y):
    # The splitter yields positional indices: use .iloc on the feature frame
    # and plain indexing on the encoded target.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Train LightGBM model
    # lgb_param already sets "verbosity". `verbose` is an alias of that same
    # LightGBM parameter, so passing verbose=150 as well gave two conflicting
    # values for one setting. Only the value from lgb_param is kept.
    model_lgb = lgb.LGBMClassifier(**lgb_param)
    model_lgb.fit(X_train, y_train)

    # Make predictions on validation set using LightGBM
    # The predicted class is the column with the highest probability.
    pred_proba_lgb = model_lgb.predict_proba(X_val)
    y_pred_lgb = np.argmax(pred_proba_lgb, axis=1)
    val_accuracy_lgb = accuracy_score(y_val, y_pred_lgb)
    val_accuracies_lgb.append(val_accuracy_lgb)

    # Make predictions on test set using LightGBM
    # Probabilities (not labels) are stored for the test set.
    test_preds_lgb.append(model_lgb.predict_proba(X_test))

In [9]:
# Average validation accuracies
# Arithmetic mean of the per-fold accuracy scores.
mean_val_accuracy_lgb = np.asarray(val_accuracies_lgb).mean()
print(
    "Mean Validation Accuracy (LightGBM):",
    mean_val_accuracy_lgb,
)



Mean Validation Accuracy (LightGBM): 0.9155614262938562


In [10]:
# Average predictions on test set
# Average the per-fold probability arrays element-wise (axis 0 runs over
# folds), then take the highest-probability column index for each row.
mean_test_preds_lgb = np.asarray(test_preds_lgb).mean(axis=0)
final_test_preds_lgb = mean_test_preds_lgb.argmax(axis=1)



In [11]:
# Decode predictions
# Map the integer class codes back to the original target labels.
# This relies on `labelencoder` having been fitted most recently on the
# 'NObeyesdad' target column (the same instance was fitted on feature
# columns before that).
pred_lgb = labelencoder.inverse_transform(final_test_preds_lgb)



In [12]:
# Create submission file for LightGBM
# Two columns: the test-set row identifier and the decoded class label.
# Written without the DataFrame index.
submission_lgb = pd.DataFrame(
    {
        'id': test_data['id'],
        'NObeyesdad': pred_lgb,
    }
)
submission_lgb.to_csv('submission_lgb.csv', index=False)