In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna


In [2]:
# Load Data
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
original_data = pd.read_csv("/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv")
sample_submission_data = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")


In [3]:
# Merge Data
train_data = train_data.drop("id", axis=1)
train_data = pd.concat([train_data, original_data], ignore_index=True).drop_duplicates()


In [4]:
# Feature Scaling
scaler = StandardScaler()
num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [5]:
# Encoding Categorical Variables
labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

for col_name in object_columns:
    if train_data[col_name].dtypes == 'object':
        train_data[col_name] = labelencoder.fit_transform(train_data[col_name])
        test_data[col_name] = labelencoder.transform(test_data[col_name])


In [6]:
# Define Features and Target
X = train_data.drop(['NObeyesdad'], axis=1)
y = labelencoder.fit_transform(train_data['NObeyesdad'])
X_test = test_data.drop(["id"], axis=1)

# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [7]:
# Define LightGBM model parameters
param = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 7,
    'learning_rate': 0.030962211546832760,
    'n_estimators': 500,
    'lambda_l1': 0.009667446568254372,
    'lambda_l2': 0.04018641437301800,
    'max_depth': 10,
    'colsample_bytree': 0.40977129346872643,
    'subsample': 0.9535797422450176,
    'min_child_samples': 26
}

# Train LightGBM model
model_lgb = lgb.LGBMClassifier(**param, verbose=70)
model_lgb.fit(X_train, y_train)
pred_proba = model_lgb.predict_proba(X_val)

In [8]:
# Optimize thresholds using Optuna
def objective(trial):
    thresholds = {}
    for i in range(num_classes):
        thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
    y_pred = apply_thresholds(pred_proba, thresholds)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

def apply_thresholds(y_proba, thresholds):
    y_pred_labels = np.argmax(y_proba, axis=1)
    for i in range(y_proba.shape[1]):
        y_pred_labels[y_proba[:, i] > thresholds[f'threshold_{i}']] = i
    return y_pred_labels

num_classes = 7
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=70)


[I 2024-02-23 16:32:27,665] A new study created in memory with name: no-name-c3b3c58f-08cf-4238-9aea-5c248b2cbe7c
  thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
[I 2024-02-23 16:32:27,672] Trial 0 finished with value: 0.8826876778288466 and parameters: {'threshold_0': 0.42675972539156937, 'threshold_1': 0.5568182033419732, 'threshold_2': 0.45211497869202955, 'threshold_3': 0.6064516279646746, 'threshold_4': 0.6592183299898215, 'threshold_5': 0.6031889920214176, 'threshold_6': 0.07638226594765951}. Best is trial 0 with value: 0.8826876778288466.
  thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)
[I 2024-02-23 16:32:27,675] Trial 1 finished with value: 0.8956007879185818 and parameters: {'threshold_0': 0.36693225184928624, 'threshold_1': 0.18493380342230592, 'threshold_2': 0.8496325891482901, 'threshold_3': 0.7886914070460295, 'threshold_4': 0.09547001130150301, 'threshold_5': 0.9901837386388512, 'threshold_6': 0.1497331

In [9]:
# Get the best thresholds
best_thresholds = study.best_params
print("Best Thresholds:", best_thresholds)


Best Thresholds: {'threshold_0': 0.099891329764591, 'threshold_1': 0.5083326860377968, 'threshold_2': 0.3511376738569743, 'threshold_3': 0.6896781007508572, 'threshold_4': 0.3514299087723463, 'threshold_5': 0.6389793233939873, 'threshold_6': 0.7133069667551435}


In [10]:
# Apply thresholds to test data and make predictions
threshold = {'threshold_0': 0.17884231879780574, 'threshold_1':  0.48561839962867365, 'threshold_2': 0.9219939474024544,
             'threshold_3': 0.35636557998085017, 'threshold_4': 0.6660333089412345, 'threshold_5': 0.7809118861672213,
             'threshold_6': 0.9152343943891742}
test_label = model_lgb.predict_proba(X_test)
test_label = apply_thresholds(test_label, threshold)
pred = labelencoder.inverse_transform(test_label)

In [11]:
# Create submission file
submission = pd.DataFrame({'id': test_data.id, 'NObeyesdad': pred})
submission.to_csv('submission.csv', index=False)