In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
# Load Data
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
original_data = pd.read_csv("/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv")
sample_submission_data = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")


In [3]:
# Merge Data
train_data = train_data.drop("id", axis=1)
train_data = pd.concat([train_data, original_data], ignore_index=True).drop_duplicates()


In [4]:
# Feature Scaling
scaler = StandardScaler()
num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])



In [5]:
# Encoding Categorical Variables
labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

for col_name in object_columns:
    if train_data[col_name].dtypes == 'object':
        train_data[col_name] = labelencoder.fit_transform(train_data[col_name])
        test_data[col_name] = labelencoder.transform(test_data[col_name])


In [6]:
# Define Features and Target
X = train_data.drop(['NObeyesdad'], axis=1)
y = labelencoder.fit_transform(train_data['NObeyesdad'])
X_test = test_data.drop(["id"], axis=1)


In [7]:
# Define GBM model parameters
gbm_param = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 6,
    'subsample': 0.8,
    'verbose': 100,
    'random_state': 42
}



In [8]:
# Define Spatial Cross-Validation
num_splits = 14
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Initialize lists to store validation accuracies and predictions
val_accuracies_gbm = []
test_preds_gbm = []

# Perform Spatial Cross-Validation
for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Train GBM model
    model_gbm = GradientBoostingClassifier(**gbm_param)
    model_gbm.fit(X_train, y_train)
    
    # Make predictions on validation set using GBM
    pred_proba_gbm = model_gbm.predict_proba(X_val)
    y_pred_gbm = np.argmax(pred_proba_gbm, axis=1)
    val_accuracy_gbm = accuracy_score(y_val, y_pred_gbm)
    val_accuracies_gbm.append(val_accuracy_gbm)
    
    # Make predictions on test set using GBM
    test_preds_gbm.append(model_gbm.predict_proba(X_test))

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.5108           0.4166           41.68s
         2           1.2558           0.2493           41.82s
         3           1.0729           0.1791           42.45s
         4           0.9343           0.1354           42.70s
         5           0.8235           0.1066           42.14s
         6           0.7370           0.0856           41.59s
         7           0.6625           0.0718           41.23s
         8           0.5985           0.0592           40.90s
         9           0.5447           0.0479           40.37s
        10           0.5007           0.0404           39.75s
        11           0.4640           0.0336           39.16s
        12           0.4312           0.0274           38.88s
        13           0.4082           0.0249           38.47s
        14           0.3829           0.0198           38.13s
        15           0.3606           0.0178           37.71s
       

In [9]:
# Average validation accuracy
mean_val_accuracy_gbm = np.mean(val_accuracies_gbm)
print("Mean Validation Accuracy (GBM):", mean_val_accuracy_gbm)


Mean Validation Accuracy (GBM): 0.9121910298454147


In [10]:
# Average predictions on test set
mean_test_preds_gbm = np.mean(test_preds_gbm, axis=0)
final_test_preds_gbm = np.argmax(mean_test_preds_gbm, axis=1)



In [11]:
# Decode predictions
pred_gbm = labelencoder.inverse_transform(final_test_preds_gbm)



In [12]:
# Create submission file for GBM predictions
submission_gbm = pd.DataFrame({'id': test_data.id, 'NObeyesdad': pred_gbm})
submission_gbm.to_csv('submission_gbm.csv', index=False)