# In [None]:
import pandas as pd
import numpy as np
import warnings
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats import rankdata

warnings.filterwarnings('ignore')

# Announce the load step, then read the three competition CSVs in a fixed
# order: training table, test table, and the sample-submission template.
print("Loading data...")
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s6e2/train.csv',
        '/kaggle/input/playground-series-s6e2/test.csv',
        '/kaggle/input/playground-series-s6e2/sample_submission.csv',
    )
)


# Encode the string target as integers: 'Absence' -> 0, 'Presence' -> 1.
target_map = {'Absence': 0, 'Presence': 1}
raw_target = train['Heart Disease']
encoded_target = raw_target.map(target_map)

# Series.map() turns every value that is not a key of target_map into NaN
# without raising (this includes already-encoded 0/1 values if this step is
# run twice). Fail here, naming the offending labels, instead of letting NaN
# targets flow into the cross-validation split and the model fits.
unmapped_mask = encoded_target.isna()
if unmapped_mask.any():
    unexpected_labels = raw_target[unmapped_mask].unique().tolist()
    raise ValueError(
        "Unexpected 'Heart Disease' labels (expected "
        f"{list(target_map)}): {unexpected_labels}"
    )

train['Heart Disease'] = encoded_target

# Show the distinct encoded values as a quick sanity check.
print("Target values after fix:", train['Heart Disease'].unique()) # Should be [0 1]


print("Preprocessing features...")

# Separate the target from the training features; the test table only needs
# its identifier column removed.
y = train['Heart Disease']
X = train.drop(columns=['id', 'Heart Disease'])
X_test = test.drop(columns=['id'])

# Feature columns stored with object dtype are treated as categorical.
cat_cols = list(X.select_dtypes(include=['object']).columns)

# Integer-encode each categorical column. The encoder is fitted on the string
# form of the train and test values together, so a category that appears only
# in the test table still receives a code.
for col in cat_cols:
    train_labels = X[col].astype(str)
    test_labels = X_test[col].astype(str)
    le = LabelEncoder().fit(
        pd.concat([X[col], X_test[col]], axis=0).astype(str)
    )
    X[col] = le.transform(train_labels)
    X_test[col] = le.transform(test_labels)


# Keyword arguments for the XGBoost classifier (passed as **xgb_params).
# NOTE(review): enable_categorical is switched on although object-dtype
# features are label-encoded to integers beforehand -- confirm it is needed.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    enable_categorical=True,
    eval_metric='logloss',
)

# Keyword arguments for the LightGBM classifier (passed as **lgb_params).
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 8,
    'num_leaves': 32,
    'subsample': 0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero. Its default of 0 disables
    # bagging, so without this key the 0.8 above is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': -1
}

# Keyword arguments for the CatBoost classifier (passed as **cat_params).
cat_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    verbose=0,
    random_seed=42,
)



# Shuffled, stratified 5-fold split with a fixed seed.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = folds.get_n_splits()  # constant, so look it up once

# One accumulator per model. Each fold adds its test-set predictions divided
# by the fold count, so after the loop each array holds the fold average.
test_preds_xgb, test_preds_lgb, test_preds_cat = (
    np.zeros(len(X_test)) for _ in range(3)
)

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    holdout = (X_val, y_val)

    # NOTE(review): no early-stopping option is set for XGBoost or LightGBM
    # here, so their eval_set presumably only drives evaluation -- confirm.

    # --- XGBoost ---
    model_xgb = XGBClassifier(**xgb_params)
    model_xgb.fit(X_train, y_train, eval_set=[holdout], verbose=False)
    xgb_proba = model_xgb.predict_proba(X_test)
    test_preds_xgb += xgb_proba[:, 1] / n_folds

    # --- LightGBM ---
    model_lgb = LGBMClassifier(**lgb_params)
    model_lgb.fit(X_train, y_train, eval_set=[holdout])
    lgb_proba = model_lgb.predict_proba(X_test)
    test_preds_lgb += lgb_proba[:, 1] / n_folds

    # --- CatBoost ---
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X_train, y_train, eval_set=holdout)
    cat_proba = model_cat.predict_proba(X_test)
    test_preds_cat += cat_proba[:, 1] / n_folds

    print(f"Fold {fold+1} completed.")

print("Applying Rank Blending...")

# 1. Weighted average of the three models' fold-averaged predictions.
linear_blend = (test_preds_xgb * 0.34) + (test_preds_lgb * 0.33) + (test_preds_cat * 0.33)

# 2. The same weights applied to each model's ranks instead of its raw scores.
rank_blend = (rankdata(test_preds_xgb) * 0.34 +
              rankdata(test_preds_lgb) * 0.33 +
              rankdata(test_preds_cat) * 0.33)

# Min-max scale the blended ranks to 0-1. When every blended rank is equal
# (a single test row, or all models predicting a constant) the range is 0 and
# the division would give NaN, which np.clip below would pass straight into
# the submission. Dividing by 1 instead maps that degenerate case to all 0.0.
rank_min = rank_blend.min()
rank_span = rank_blend.max() - rank_min
if rank_span == 0:
    rank_span = 1.0
rank_blend = (rank_blend - rank_min) / rank_span

# 3. Final ensemble: 90% linear blend + 10% normalised rank blend.
final_preds = (linear_blend * 0.90) + (rank_blend * 0.10)

# 4. Keep every prediction within [0.001, 0.999].
final_preds = np.clip(final_preds, 0.001, 0.999)


# Put the blended predictions into the submission template's target column
# and write the table to disk without the DataFrame index.
submission['Heart Disease'] = final_preds
submission.to_csv(path_or_buf='submission.csv', index=False)

# Confirm the write and show the first rows of what was saved.
preview = submission.head()
print("Success! 'submission.csv' saved.")
print(preview)