In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier


In [2]:
# Load Data
# Read the competition train/test CSVs, the external ObesityDataSet.csv and
# the sample submission, in that order, into four DataFrames.
train_data, test_data, original_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
        "/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv",
        "/kaggle/input/playground-series-s4e2/sample_submission.csv",
    )
)


In [3]:
# Merge Data
# Drop the "id" column from the competition training frame, append the rows
# of the external dataset underneath it (columns are aligned by name), and
# remove exact duplicate rows. The index is renumbered by the concat but is
# NOT reset again after drop_duplicates, so it may contain gaps.
train_data = pd.concat(
    [train_data.drop(columns="id"), original_data],
    ignore_index=True,
).drop_duplicates()


In [4]:
# Feature Scaling
# Every non-object column of the training frame is standardised. "id" was
# dropped from train_data before this point, so it is not part of num_cols
# and test_data's "id" column is left untouched.
num_cols = train_data.select_dtypes(exclude=['object']).columns.tolist()

# Statistics are learned from the training rows only and then applied to
# both frames.
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])



In [5]:
# Encoding Categorical Variables
# Integer-encode every object-typed feature column; the target column
# 'NObeyesdad' is excluded here and encoded separately.
labelencoder = LabelEncoder()
object_columns = (
    train_data
    .select_dtypes(include='object')
    .columns
    .difference(['NObeyesdad'])
)

for col_name in object_columns:
    # Skip anything that is not (or is no longer) an object column.
    if train_data[col_name].dtypes != 'object':
        continue

    # The single encoder is refit for each column, so its learned classes
    # always reflect the most recently processed column. The mapping is
    # learned from training values only; transform() on the test column
    # raises if it contains a value not seen during fit.
    labelencoder.fit(train_data[col_name])
    train_data[col_name] = labelencoder.transform(train_data[col_name])
    test_data[col_name] = labelencoder.transform(test_data[col_name])



In [6]:
# Define Features and Target
# Refit the shared encoder on the target column. From here on `labelencoder`
# holds the NObeyesdad classes, so it can decode predictions later.
y = labelencoder.fit_transform(train_data['NObeyesdad'])

# Training features: every column except the target.
X = train_data.drop(columns=['NObeyesdad'])

# Test features: every column except the row identifier.
X_test = test_data.drop(columns=["id"])

In [7]:
# Define HistGradientBoostingClassifier model parameters
# Keyword arguments unpacked into HistGradientBoostingClassifier(**hgb_param).
hgb_param = {
    # "log_loss" replaces the former "categorical_crossentropy" spelling,
    # which was deprecated in scikit-learn 1.1 and removed in 1.3 (where it
    # is rejected by parameter validation). Requires scikit-learn >= 1.1.
    "loss": "log_loss",
    "learning_rate": 0.1,
    "max_iter": 100,
    "max_depth": 6,
    # Fixed seed so repeated runs build the same models.
    "random_state": 42,
}



In [8]:
# Define stratified K-fold cross-validation (shuffled, fixed seed).
# Note: this is StratifiedKFold, not a spatial CV scheme.
num_splits = 14
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# One validation accuracy and one test-set probability matrix per fold
val_accuracies_hgb = []
test_preds_hgb = []

# Train one model per fold, score it on the held-out part, predict the test set
for train_index, val_index in skf.split(X, y):
    # X is a DataFrame (positional .iloc); y is indexed directly with the
    # fold's integer positions.
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_val, y_val = X.iloc[val_index], y[val_index]

    # Fresh HistGradientBoostingClassifier for this fold
    model_hgb = HistGradientBoostingClassifier(**hgb_param).fit(X_train, y_train)

    # Validation accuracy: the column index of the highest probability is
    # compared directly against y_val (y holds LabelEncoder integer codes).
    pred_proba_hgb = model_hgb.predict_proba(X_val)
    y_pred_hgb = pred_proba_hgb.argmax(axis=1)
    val_accuracy_hgb = accuracy_score(y_val, y_pred_hgb)
    val_accuracies_hgb.append(val_accuracy_hgb)

    # Keep this fold's class probabilities for the test set
    test_preds_hgb.append(model_hgb.predict_proba(X_test))




In [9]:
# Average validation accuracy over all cross-validation folds
mean_val_accuracy_hgb = np.asarray(val_accuracies_hgb).mean()
print(
    "Mean Validation Accuracy (HistGradientBoostingClassifier):",
    mean_val_accuracy_hgb,
)


Mean Validation Accuracy (HistGradientBoostingClassifier): 0.9120159329826352


In [10]:
# Average predictions on test set
# Stack the per-fold probability matrices and average over the fold axis,
# then pick the most probable class index for each test row.
fold_probabilities = np.asarray(test_preds_hgb)
mean_test_preds_hgb = fold_probabilities.mean(axis=0)
final_test_preds_hgb = mean_test_preds_hgb.argmax(axis=1)


In [11]:
# Decode predictions
# Map the integer class codes back to the original NObeyesdad labels.
# NOTE: `labelencoder` is the same object that was refit on each feature
# column earlier; this call is only correct because its most recent fit was
# on the target column.
pred_hgb = labelencoder.inverse_transform(final_test_preds_hgb)



In [12]:
# Create submission file for HistGradientBoostingClassifier predictions
# Pair each test row's original id with its decoded predicted label and
# write the result without the DataFrame index.
submission_hgb = pd.DataFrame(
    {
        'id': test_data['id'],
        'NObeyesdad': pred_hgb,
    }
)
submission_hgb.to_csv('submission_hgb.csv', index=False)