In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna

In [2]:
# Load Data
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
original_data = pd.read_csv("/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv")
sample_submission_data = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")

In [3]:
# Merge Data
train_data = train_data.drop("id", axis=1)
train_data = pd.concat([train_data, original_data], ignore_index=True).drop_duplicates()

In [4]:
# Feature Scaling
scaler = StandardScaler()
num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [5]:
# Encoding Categorical Variables
labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

for col_name in object_columns:
    if train_data[col_name].dtypes == 'object':
        train_data[col_name] = labelencoder.fit_transform(train_data[col_name])
        test_data[col_name] = labelencoder.transform(test_data[col_name])

In [6]:
# Define Features and Target
X = train_data.drop(['NObeyesdad'], axis=1)
y = labelencoder.fit_transform(train_data['NObeyesdad'])
X_test = test_data.drop(["id"], axis=1)

# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define LightGBM model parameters with L1/L2 regularization
param = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 7,
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    'lambda_l1': 0.1,  # Adjust lambda_l1 for L1 regularization
    'lambda_l2': 0.1,  # Adjust lambda_l2 for L2 regularization
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    'subsample': 0.9535797422450176,
    'min_child_samples': 26
}

# Train LightGBM model with L1/L2 regularization
model_lgb = lgb.LGBMClassifier(**param, verbose=100)
model_lgb.fit(X_train, y_train)
pred_proba = model_lgb.predict_proba(X_val)

In [8]:
# Optimize thresholds using Optuna
def objective(trial):
    thresholds = {}
    for i in range(num_classes):
        thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
    y_pred = apply_thresholds(pred_proba, thresholds)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

def apply_thresholds(y_proba, thresholds):
    y_pred_labels = np.argmax(y_proba, axis=1)
    for i in range(y_proba.shape[1]):
        y_pred_labels[y_proba[:, i] > thresholds[f'threshold_{i}']] = i
    return y_pred_labels

num_classes = 7
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150)

[I 2024-02-24 21:05:39,342] A new study created in memory with name: no-name-3136d3ab-3687-49ce-902a-c5ef2321f662
  thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
[I 2024-02-24 21:05:39,348] Trial 0 finished with value: 0.8737141606478441 and parameters: {'threshold_0': 0.23141592629964014, 'threshold_1': 0.9231462926742676, 'threshold_2': 0.17502646226629515, 'threshold_3': 0.06583138230555863, 'threshold_4': 0.15445037920259197, 'threshold_5': 0.09377894377431162, 'threshold_6': 0.9776444410696525}. Best is trial 0 with value: 0.8737141606478441.
  thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
[I 2024-02-24 21:05:39,354] Trial 1 finished with value: 0.8975705843729481 and parameters: {'threshold_0': 0.10347505108289445, 'threshold_1': 0.9931418876060929, 'threshold_2': 0.8836253063983743, 'threshold_3': 0.6826114162464682, 'threshold_4': 0.9599899180302264, 'threshold_5': 0.3420137244535165, 'threshold_6': 0.1871137

In [9]:
# Get the best thresholds
best_thresholds = study.best_params
print("Best Thresholds:", best_thresholds)



Best Thresholds: {'threshold_0': 0.11667926611439128, 'threshold_1': 0.42804528418417065, 'threshold_2': 0.7390741239033471, 'threshold_3': 0.3371063815720261, 'threshold_4': 0.9334140397880553, 'threshold_5': 0.7622116314583457, 'threshold_6': 0.7250740655993669}


In [10]:
# Apply thresholds to test data and make predictions
threshold = {'threshold_0':  0.023135470125971107, 'threshold_1':  0.4306148457546867, 'threshold_2': 0.9095773167034931,
             'threshold_3': 0.34810041437566724, 'threshold_4':  0.26358184022861064, 'threshold_5': 0.6755795763539134,
             'threshold_6': 0.4235223651605575}
test_label = model_lgb.predict_proba(X_test)
test_label = apply_thresholds(test_label, threshold)
pred = labelencoder.inverse_transform(test_label)


In [11]:
# Create submission file
submission = pd.DataFrame({'id': test_data.id, 'NObeyesdad': pred})
submission.to_csv('submission.csv', index=False)