In [2]:
import pandas as pd
import numpy as np
import warnings
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Suppress every warning for the rest of the session so cell output stays short.
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

# --- 1. Load Data ---
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# --- Combine for consistent feature engineering ---
# Train rows come first, test rows second; the index is rebuilt as 0..n-1.
combined = pd.concat([train, test], ignore_index=True)

# --- 2. Simple Imputation ---
# Numeric feature columns are filled with their median over the combined
# (train + test) rows; all remaining feature columns are filled with the
# literal string 'missing'. 'id' and 'Personality' are never touched.
print("Applying simple imputation...")
feature_cols = [col for col in train.columns if col not in ['id', 'Personality']]
# Classify by "is numeric" rather than "is not object" so that text columns
# stored with a string dtype (not plain object) still count as categorical.
num_features = [col for col in feature_cols if pd.api.types.is_numeric_dtype(train[col])]
cat_features = [col for col in feature_cols if col not in num_features]

# Assign the filled column back instead of calling fillna(inplace=True) on
# `combined[col]`: the in-place call acts on an intermediate object and does
# not update `combined` when pandas Copy-on-Write is active.
for col in num_features:
    median_value = combined[col].median()
    combined[col] = combined[col].fillna(median_value)

for col in cat_features:
    combined[col] = combined[col].fillna('missing')

Libraries imported successfully.
Applying simple imputation...


In [3]:
print("\n--- Starting AGGRESSIVE Feature Engineering ---")


def _add_group_stats(frame, group_keys, value_cols, stats, label):
    """Join per-group aggregates of each value column onto ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding both the grouping column(s) and the value columns.
    group_keys : str or list of str
        Column name(s) to group by; the same name(s) are used as join key(s).
    value_cols : list of str
        Columns aggregated within each group.
    stats : list of str
        Aggregation names handed to ``GroupBy.agg``.
    label : str
        Text placed between ``_by_`` and the statistic in each new column name.

    Returns
    -------
    pd.DataFrame
        A new frame with one ``{value}_by_{label}_{stat}`` column per
        (value column, statistic) pair, appended in that nesting order.
    """
    for value_col in value_cols:
        group_stats = frame.groupby(group_keys)[value_col].agg(stats)
        group_stats.columns = [f'{value_col}_by_{label}_{stat}' for stat in stats]
        # Columns with these names exist already when the cell is re-run;
        # joining onto them would raise a column-overlap ValueError, so they
        # are dropped first and re-created from scratch.
        stale = [name for name in group_stats.columns if name in frame.columns]
        frame = frame.drop(columns=stale).join(group_stats, on=group_keys)
    return frame


# --- 3a. Polynomial Features ---
# Square and cube of four numeric columns.
for col in ['Time_spent_Alone', 'Social_event_attendance', 'Friends_circle_size', 'Post_frequency']:
    combined[f'{col}_sq'] = combined[col]**2
    combined[f'{col}_cube'] = combined[col]**3
print("Polynomial features created.")

# --- 3b. Advanced Interaction Features ---
combined['Social_Activity_Score'] = combined['Social_event_attendance'] + combined['Going_outside']
# +1 on both sides keeps the ratio defined when the activity score is 0.
combined['Introversion_Index'] = (combined['Time_spent_Alone'] + 1) / (combined['Social_Activity_Score'] + 1)
combined['Social_Footprint'] = combined['Friends_circle_size'] * combined['Post_frequency'] * combined['Social_event_attendance']
print("Advanced interaction features created.")

# --- 3c. Deep Group-Based Statistical Features (Single-Level) ---
# For every categorical column, attach six group statistics of every numeric column.
for cat_col in cat_features:
    combined = _add_group_stats(
        combined,
        cat_col,
        num_features,
        ['mean', 'std', 'min', 'max', 'sum', 'skew'],
        label=cat_col,
    )
print("Single-level group features created.")

# --- 3d. Multi-Level Grouping Features (The Power Play) ---
# Group by the two most important categorical features together
important_cats = ['Stage_fear', 'Drained_after_socializing']
combined = _add_group_stats(
    combined,
    important_cats,
    num_features,
    ['mean', 'std', 'max'],
    label='multi',
)
print("Multi-level group features created.")

# --- 4. Final Data Preparation ---
# Rows with a 'Personality' value form the training set, the rest the test
# set. Splitting happens before the zero-fill so the label column is never filled.
has_label = combined['Personality'].notna()
train_processed = combined.loc[has_label].copy()
test_processed = combined.loc[~has_label].copy()

# Zero-fill any remaining NaNs, restricted to the feature columns.
non_feature_cols = ['id', 'Personality']
all_features = [col for col in train_processed.columns if col not in non_feature_cols]
for df in (train_processed, test_processed):
    filled = df[all_features].fillna(0)
    df[all_features] = filled

n_new_features = train_processed.shape[1] - train.shape[1]
print(f"Total number of features created: {n_new_features}")

# Final Encoding
# Each categorical column gets its own encoder, fitted on the string form of
# the train and test values together so both splits share one code mapping.
for col in cat_features:
    train_values = train_processed[col].astype(str)
    test_values = test_processed[col].astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]))
    train_processed[col] = le.transform(train_values)
    test_processed[col] = le.transform(test_values)

# Prepare for modeling
features = [col for col in train_processed.columns if col not in non_feature_cols]
X = train_processed[features]
X_test = test_processed[features]

# The fitted target encoder is kept so predictions can be mapped back to labels.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(train_processed['Personality'])
print("\nFeature engineering and final preparation complete.")


--- Starting AGGRESSIVE Feature Engineering ---
Polynomial features created.
Advanced interaction features created.
Single-level group features created.
Multi-level group features created.
Total number of features created: 86

Feature engineering and final preparation complete.


In [4]:
# --- 5-Fold Stratified Cross-Validation using Core API ---
# Trains one XGBoost booster per fold, scores it on the held-out fold, and
# stores that booster's hard 0/1 predictions for the test set.
N_SPLITS = 5
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# One column of 0/1 test predictions per fold; aggregated after the loop.
test_predictions_per_fold = np.zeros((test_processed.shape[0], N_SPLITS))
cv_scores = []

# Using slightly more robust parameters for a more complex feature set
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5, # Slightly shallower to prevent overfitting on many features
    'learning_rate': 0.05,
    'subsample': 0.7,
    'colsample_bytree': 0.7, # Sample features more aggressively
    'seed': 42
}

print(f"\n--- Starting {N_SPLITS}-Fold XGBoost Training with AGGRESSIVE Features ---")
# Sanitize feature names: strip the characters < > [ ] , before the names are
# handed to DMatrix.
sanitized_features = [f.replace('<', '').replace('>', '').replace('[', '').replace(']', '').replace(',', '') for f in features]
X.columns = sanitized_features
X_test.columns = sanitized_features
# The test matrix is identical for every fold, so it is built once.
dtest = xgb.DMatrix(X_test, feature_names=sanitized_features)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\nTraining fold {fold+1}/{N_SPLITS}...")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=sanitized_features)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=sanitized_features)

    # The validation fold doubles as the early-stopping set.
    # NOTE: because the same fold is scored below, the reported accuracy is
    # mildly optimistic.
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500, # More rounds for more features
        evals=[(dval, 'eval')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # iteration_range is half-open [begin, end): the end has to be
    # best_iteration + 1 for the best round itself to be included. Without
    # the +1 a best_iteration of 0 would yield (0, 0), which selects every tree.
    best_range = (0, model.best_iteration + 1)

    val_probs = model.predict(dval, iteration_range=best_range)
    val_preds = (val_probs > 0.5).astype(int)
    acc = accuracy_score(y_val, val_preds)
    print(f"Fold {fold+1} Validation Accuracy: {acc:.6f}")
    cv_scores.append(acc)

    test_probs = model.predict(dtest, iteration_range=best_range)
    test_predictions_per_fold[:, fold] = (test_probs > 0.5).astype(int)

print("\n--- Aggressive Feature Model Performance ---")
print(f"CV Mean Accuracy: {np.mean(cv_scores):.6f} | Std: {np.std(cv_scores):.6f}")

# --- 5. Create Final Submission with Majority Voting ---
print("\n--- Aggregating predictions using majority vote ---")
# Row-wise mode over the per-fold columns: the class most folds predicted.
fold_votes = mode(test_predictions_per_fold, axis=1, keepdims=False)
final_test_predictions_encoded = fold_votes.mode.astype(int)
# Translate the integer codes back into the original 'Personality' labels.
final_test_predictions = target_encoder.inverse_transform(final_test_predictions_encoded)

# Two-column submission table: test ids alongside the voted label.
test_ids = test_processed['id'].astype(int)
submission_df = pd.DataFrame({'id': test_ids, 'Personality': final_test_predictions})
submission_path = 'submission_aggressive_features.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission file '{submission_path}' created successfully!")
display(submission_df.head())


--- Starting 5-Fold XGBoost Training with AGGRESSIVE Features ---

Training fold 1/5...
Fold 1 Validation Accuracy: 0.969771

Training fold 2/5...
Fold 2 Validation Accuracy: 0.967611

Training fold 3/5...
Fold 3 Validation Accuracy: 0.966532

Training fold 4/5...
Fold 4 Validation Accuracy: 0.970580

Training fold 5/5...
Fold 5 Validation Accuracy: 0.971382

--- Aggressive Feature Model Performance ---
CV Mean Accuracy: 0.969175 | Std: 0.001824

--- Aggregating predictions using majority vote ---

Submission file 'submission_aggressive_features.csv' created successfully!


Unnamed: 0,id,Personality
18524,18524,Extrovert
18525,18525,Introvert
18526,18526,Extrovert
18527,18527,Extrovert
18528,18528,Introvert
