In [1]:
import pandas as pd
import numpy as np
import warnings
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Install a blanket "ignore" filter so no warning of any category is shown
# in later cell output, then confirm that the import cell ran to the end.
warnings.simplefilter('ignore')
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Load Data ---
# Both CSVs are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# --- Simple Imputation & Encoding ---
print("Applying simple and robust preprocessing...")

# Model inputs are every training column except the row id and the target.
# They are split by dtype (object vs. everything else), keeping column order.
features = [name for name in train.columns if name not in ('id', 'Personality')]
num_features = []
cat_features = []
for name in features:
    if train[name].dtype == 'object':
        cat_features.append(name)
    else:
        num_features.append(name)

# Numeric gaps: the fill value is the *training* median, applied to both
# frames so that train and test are imputed with the same number.
for name in num_features:
    fill_value = train[name].median()
    for frame in (train, test):
        frame[name] = frame[name].fillna(fill_value)

# Object columns are cast to plain strings (missing values become the
# literal text produced by str(), e.g. 'nan').
for frame in (train, test):
    for name in cat_features:
        frame[name] = frame[name].astype(str)

# --- Prepare data ---
X, X_test = train[features], test[features]

# Map the target labels to integer codes; the fitted encoder is kept so the
# codes can be turned back into labels later.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train['Personality'])

print("Preprocessing complete.")

Applying simple and robust preprocessing...
Preprocessing complete.


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import warnings

# Blanket-ignore all warnings for the remainder of the session.
warnings.simplefilter('ignore')
print("Libraries imported successfully.")

# --- 1. Load Data ---
# Fresh copies of both CSVs are read from the working directory, replacing
# any `train` / `test` frames left over from earlier cells.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

# --- 2. Simple Preprocessing ---
print("Applying simple and robust preprocessing...")

# Walk the training columns once, skipping the id and the target, and bucket
# each remaining column by dtype: object -> categorical, anything else -> numeric.
num_features, cat_features = [], []
for name in train.columns:
    if name in ('id', 'Personality'):
        continue
    bucket = cat_features if train[name].dtype == 'object' else num_features
    bucket.append(name)

# Numeric columns: impute with the median of the training column, and reuse
# that same value for the test column.
for name in num_features:
    fill_value = train[name].median()
    train[name] = train[name].fillna(fill_value)
    test[name] = test[name].fillna(fill_value)

# Categorical columns: integer-encode. The encoder is fitted on the string
# form of train and test values together, so every value seen in either
# frame has a code and transform() cannot hit an unseen label.
for name in cat_features:
    le = LabelEncoder().fit(pd.concat([train[name], test[name]]).astype(str))
    train_codes = le.transform(train[name].astype(str))
    test_codes = le.transform(test[name].astype(str))
    train[name] = train_codes
    test[name] = test_codes

# --- 3. Prepare Data for Modeling ---
# Everything except the row id and the target column is a model input.
non_feature_columns = ('id', 'Personality')
features = list(filter(lambda name: name not in non_feature_columns, train.columns))
X, X_test = train[features], test[features]

# Encode the target labels as integers; the fitted encoder is kept so the
# predictions can be mapped back to label strings for the submission.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train['Personality'])
print("Preprocessing complete.")

# --- 4. 5-Fold Stratified Cross-Validation using Core API ---
# Trains one XGBoost booster per stratified fold, scores it on the held-out
# fold, and stores that booster's hard 0/1 predictions for the test set.
N_SPLITS = 5
NUM_BOOST_ROUND = 300        # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 30   # stop when eval logloss has not improved for this many rounds

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# One column of hard 0/1 test predictions per fold (rows = test samples).
test_predictions_per_fold = np.zeros((test.shape[0], N_SPLITS))
cv_scores = []

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

print(f"\n--- Starting {N_SPLITS}-Fold XGBoost Training (Core API) ---")
# The test matrix is identical for every fold, so build it once.
dtest = xgb.DMatrix(X_test)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\nTraining fold {fold+1}/{N_SPLITS}...")

    # X is a DataFrame (positional .iloc); y is a NumPy array (plain indexing).
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Convert data into XGBoost's optimized DMatrix object
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Use the core `train` function. The validation fold drives early
    # stopping and is also the fold that is scored below, so the reported
    # accuracy may be slightly optimistic.
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dval, 'eval')],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False
    )

    # `best_iteration` is a 0-based index, whereas `iteration_range` is the
    # half-open interval [begin, end). The end must therefore be
    # best_iteration + 1; without the +1 the best tree itself is dropped, and
    # a best_iteration of 0 yields (0, 0), which XGBoost reads as "all trees".
    best_range = (0, model.best_iteration + 1)

    val_probs = model.predict(dval, iteration_range=best_range)
    val_preds = (val_probs > 0.5).astype(int)
    acc = accuracy_score(y_val, val_preds)
    print(f"Fold {fold+1} Validation Accuracy: {acc:.4f}")
    cv_scores.append(acc)

    test_probs = model.predict(dtest, iteration_range=best_range)
    test_predictions_per_fold[:, fold] = (test_probs > 0.5).astype(int)

print("\nCV Mean Accuracy: {:.4f} | Std: {:.4f}".format(np.mean(cv_scores), np.std(cv_scores)))

# --- 5. Create Final Submission with Majority Voting ---
print("\n--- Aggregating predictions using majority vote ---")

# For each test row, take the most frequent value across the per-fold
# prediction columns, then cast it to int so the label encoder can map the
# code back to its original label string.
vote_result = mode(test_predictions_per_fold, axis=1, keepdims=False)
final_test_predictions_encoded = vote_result.mode.astype(int)
final_test_predictions = target_encoder.inverse_transform(final_test_predictions_encoded)

# Submission layout: the test ids alongside the predicted label, no index column.
submission_df = test[['id']].assign(Personality=final_test_predictions)
submission_df.to_csv('submission_core_api_final.csv', index=False)
print("\nSubmission file 'submission_core_api_final.csv' created successfully!")
display(submission_df.head())

Libraries imported successfully.
Applying simple and robust preprocessing...
Preprocessing complete.

--- Starting 5-Fold XGBoost Training (Core API) ---

Training fold 1/5...
Fold 1 Validation Accuracy: 0.9695

Training fold 2/5...
Fold 2 Validation Accuracy: 0.9673

Training fold 3/5...
Fold 3 Validation Accuracy: 0.9663

Training fold 4/5...
Fold 4 Validation Accuracy: 0.9703

Training fold 5/5...
Fold 5 Validation Accuracy: 0.9714

CV Mean Accuracy: 0.9690 | Std: 0.0019

--- Aggregating predictions using majority vote ---

Submission file 'submission_core_api_final.csv' created successfully!


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
