In [3]:
import pandas as pd
import numpy as np
import warnings
import xgboost as xgb
import catboost as ctb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Silence library warnings so the cell output stays readable.
# NOTE(review): this blanket filter also hides pandas/xgboost deprecation
# warnings (e.g. chained-assignment warnings); consider narrowing it.
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

# --- 1. Load Data ---
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# --- 2. Simple Imputation ---
print("Applying simple and robust preprocessing...")
# Columns that are never model inputs: the row identifier and the target.
non_feature_cols = ['id', 'Personality']
candidate_cols = [col for col in train.columns if col not in non_feature_cols]
# Split on "numeric vs. not numeric" instead of comparing against the literal
# 'object' dtype, so string columns are still treated as categorical when
# pandas stores them with a dedicated string dtype.
num_features = [col for col in candidate_cols if pd.api.types.is_numeric_dtype(train[col])]
cat_features = [col for col in candidate_cols if not pd.api.types.is_numeric_dtype(train[col])]

# Snapshot of the test frame taken BEFORE imputation, kept for later cells
# that reference the original rows (e.g. the submission `id` column).
X_test_final = test.copy()

# Medians are computed on train only and reused for test, so no test-set
# statistics leak into the preprocessing.
# Assign the filled column back explicitly: `df[col].fillna(..., inplace=True)`
# operates on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
for col in num_features:
    median_value = train[col].median()
    train[col] = train[col].fillna(median_value)
    test[col] = test[col].fillna(median_value)

# Missing categoricals become their own explicit level.
for col in cat_features:
    train[col] = train[col].fillna('missing')
    test[col] = test[col].fillna('missing')

# --- 3. Prepare Data ---
features = [col for col in train.columns if col not in non_feature_cols]
X = train[features]
X_test = test[features]

# Encode the string target as integers; the fitted encoder is kept so that
# predictions can be mapped back to the original labels.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train['Personality'])
print("Preprocessing complete.")

Libraries imported successfully.
Applying simple and robust preprocessing...
Preprocessing complete.


In [4]:
def get_oof_predictions(X, y, X_test, cat_features, n_splits=5):
    """Generate out-of-fold (OOF) and test predictions from the two base models.

    For every stratified fold an XGBoost booster and a CatBoost classifier are
    trained on the training part, early-stopped on the validation part, and
    used to predict both the validation rows and the whole test set. Test
    predictions are averaged over folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (positional indexing via ``.iloc`` is used).
    y : array-like of int
        Binary target aligned row-by-row with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    cat_features : list of str
        Names of the categorical columns. They are label-encoded for XGBoost
        and passed as native categoricals to CatBoost.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    (X_meta_train, X_meta_test) : tuple of numpy.ndarray
        Arrays of shape ``(len(X), 2)`` and ``(len(X_test), 2)`` whose columns
        are the positive-class probabilities from XGBoost and CatBoost.
    """
    N_SPLITS = n_splits
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    # Positional fancy-indexing below requires an ndarray, not a Series.
    y = np.asarray(y)
    oof_xgb, test_preds_xgb = np.zeros(len(X)), np.zeros(len(X_test))
    oof_ctb, test_preds_ctb = np.zeros(len(X)), np.zeros(len(X_test))

    # Label-encode the categoricals once, on every value present in X and
    # X_test. The mapping does not depend on the target, so this leaks nothing,
    # and it guarantees that a category occurring only in a validation fold
    # cannot raise "unseen label" at transform time.
    X_enc, X_test_enc = X.copy(), X_test.copy()
    for col in cat_features:
        le = LabelEncoder()
        le.fit(pd.concat([X[col], X_test[col]]).astype(str))
        X_enc[col] = le.transform(X[col].astype(str))
        X_test_enc[col] = le.transform(X_test[col].astype(str))
    # The encoded test matrix is identical for every fold; build it once.
    dtest_xgb = xgb.DMatrix(X_test_enc)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"===== Fold {fold+1}/{N_SPLITS} =====")
        X_train, y_train = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        # ---- XGBoost on the label-encoded frame ----
        params_xgb = {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42+fold}
        dtrain_xgb = xgb.DMatrix(X_enc.iloc[train_idx], label=y_train)
        dval_xgb = xgb.DMatrix(X_enc.iloc[val_idx], label=y_val)
        xgb_model = xgb.train(params_xgb, dtrain_xgb, 300, evals=[(dval_xgb, 'eval')], early_stopping_rounds=30, verbose_eval=False)
        # `best_iteration` is 0-based while `iteration_range` has an exclusive
        # end, so "+ 1" is needed to include the best tree. Without it the best
        # round is dropped, and best_iteration == 0 degenerates to (0, 0),
        # which XGBoost interprets as "use all trees".
        best_range = (0, xgb_model.best_iteration + 1)
        oof_xgb[val_idx] = xgb_model.predict(dval_xgb, iteration_range=best_range)
        test_preds_xgb += xgb_model.predict(dtest_xgb, iteration_range=best_range) / N_SPLITS

        # ---- CatBoost on the raw frame (handles categoricals natively) ----
        ctb_model = ctb.CatBoostClassifier(iterations=300, random_seed=42+fold, verbose=0, cat_features=cat_features, eval_metric='Logloss')
        ctb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=30)
        oof_ctb[val_idx] = ctb_model.predict_proba(X_val)[:, 1]
        test_preds_ctb += ctb_model.predict_proba(X_test)[:, 1] / N_SPLITS

    # One column per base model; these become the meta-model's features.
    X_meta_train = np.column_stack((oof_xgb, oof_ctb))
    X_meta_test = np.column_stack((test_preds_xgb, test_preds_ctb))
    return X_meta_train, X_meta_test

def _make_meta_model():
    """Return a fresh, unfitted XGBoost meta-learner with the shared settings."""
    return xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss', random_state=42)


# --- Phase 1 & 2 ---
# Base-model OOF predictions become the features of a first meta-model.
print("--- Phase 1 & 2: Initial Stacking ---")
X_meta_train, X_meta_test = get_oof_predictions(X, y, X_test, cat_features)
meta_model = _make_meta_model()
meta_model.fit(X_meta_train, y)

# --- Phase 3 ---
# Test rows the first stack is very sure about are given its predicted label.
print("\n--- Phase 3: Applying Pseudo-Labeling ---")
test_probs = meta_model.predict_proba(X_meta_test)[:, 1]
CONFIDENCE_THRESHOLD = 0.98
confident_indices = np.where((test_probs > CONFIDENCE_THRESHOLD) | (test_probs < 1 - CONFIDENCE_THRESHOLD))[0]
pseudo_labels = (test_probs[confident_indices] > 0.5).astype(int)
# Take the pseudo-labelled rows from the *imputed* test frame so they are
# preprocessed exactly like the real training rows and like the test rows the
# models score at inference time (the raw snapshot still carries numeric NaNs).
pseudo_df = test.iloc[confident_indices].copy()
pseudo_df['Personality'] = target_encoder.inverse_transform(pseudo_labels)
print(f"Found {len(pseudo_df)} high-confidence samples to use as pseudo-labels.")

# --- Phase 4 ---
# Real rows come first, pseudo-labelled rows are appended after them.
n_real_rows = len(train)
train_augmented = pd.concat([train, pseudo_df], ignore_index=True)
print(f"New augmented training set size: {len(train_augmented)}")
print("\n--- Phase 4: Retraining the full stack on augmented data ---")
# Explicit copy + column assignment: an in-place fillna on a column of a
# sliced frame acts on a temporary and is not guaranteed to update the frame.
X_aug = train_augmented[features].copy()
y_aug = target_encoder.transform(train_augmented['Personality'])
for col in cat_features:
    X_aug[col] = X_aug[col].fillna('missing')
X_meta_train_aug, X_meta_test_final = get_oof_predictions(X_aug, y_aug, X_test, cat_features)

# ***************************************************************
# *** THE FINAL EVALUATION STEP IS HERE ***
# ***************************************************************
print("\n--- Evaluating Final Meta-Model Performance via Cross-Validation ---")
final_meta_model = _make_meta_model()
cv_scores_meta = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_accuracies = []

# Cross-validate the meta-model on the augmented OOF predictions.
# Each fold trains on real + pseudo-labelled rows but is scored ONLY on rows
# with real labels: pseudo-labels were produced by the stack itself, so
# scoring against them is circular and would inflate the reported accuracy.
for train_idx, val_idx in cv_scores_meta.split(X_meta_train_aug, y_aug):
    X_tr, y_tr = X_meta_train_aug[train_idx], y_aug[train_idx]
    real_val_idx = val_idx[val_idx < n_real_rows]
    X_val, y_val = X_meta_train_aug[real_val_idx], y_aug[real_val_idx]

    fold_meta_model = _make_meta_model()
    fold_meta_model.fit(X_tr, y_tr)
    preds = fold_meta_model.predict(X_val)
    meta_accuracies.append(accuracy_score(y_val, preds))

print("\n--- Leaderboard Hunter Performance ---")
print(f"Final Stacking CV Accuracy: {np.mean(meta_accuracies):.6f} (+/- {np.std(meta_accuracies):.6f})")


# --- Final fit on all augmented data for submission ---
print("\n--- Training the FINAL meta-model on all data ---")
final_meta_model.fit(X_meta_train_aug, y_aug)
print("Final meta-model training complete.")

# --- Phase 5: Final Submission ---
print("\n--- Creating Final Leaderboard Hunter Submission ---")
final_predictions_encoded = final_meta_model.predict(X_meta_test_final)
# Map the 0/1 predictions back to the original string labels.
final_predictions = target_encoder.inverse_transform(final_predictions_encoded)
submission_df = pd.DataFrame({'id': X_test_final['id'], 'Personality': final_predictions})
submission_df.to_csv('submission_leaderboard_hunter.csv', index=False)
print("\nSubmission file 'submission_leaderboard_hunter.csv' created successfully!")
display(submission_df.head())

--- Phase 1: Generating OOF predictions from diverse base models ---
===== Fold 1/5 =====
===== Fold 2/5 =====
===== Fold 3/5 =====
===== Fold 4/5 =====
===== Fold 5/5 =====

--- Phase 2: Training a powerful XGBoost meta-model ---
Meta-model trained.

--- Phase 3: Applying Pseudo-Labeling ---
Found 2502 high-confidence samples to use as pseudo-labels.
New augmented training set size: 21026

--- Phase 4: Retraining the full stack on augmented data ---
===== Fold 1/5 =====
===== Fold 2/5 =====
===== Fold 3/5 =====
===== Fold 4/5 =====
===== Fold 5/5 =====

--- Training the FINAL meta-model ---
Final meta-model training complete.

--- Creating Final Leaderboard Hunter Submission ---

Submission file 'submission_leaderboard_hunter.csv' created successfully!


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
