In [1]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import warnings

# --- Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# --- Preprocessing & Feature Engineering ---
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer

# --- Modeling & Evaluation ---
import lightgbm as lgb
import optuna  # Import Optuna for hyperparameter tuning
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Notebook Settings ---
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-deep')

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Load the datasets ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError as e:
    print(f"File not found: {e}. Please ensure all CSV files are in the correct directory.")
    train_df, test_df = pd.DataFrame(), pd.DataFrame()

# --- Initial Data Inspection ---
if not train_df.empty:
    print("Training Data Shape:", train_df.shape)
    print("Testing Data Shape:", test_df.shape)
    y = train_df['Personality'] # Define target variable early

Training Data Shape: (18524, 9)
Testing Data Shape: (6175, 8)


In [3]:
if not train_df.empty:
    # --- Combine for consistent processing ---
    combined_df = pd.concat([train_df.drop('Personality', axis=1), test_df], ignore_index=True)

    # --- 1. Basic Feature Engineering (from your notebook) ---
    binary_cols = ['Stage_fear', 'Drained_after_socializing']
    for col in binary_cols:
        combined_df[col] = combined_df[col].map({'Yes': 1, 'No': 0})

    epsilon = 1e-6
    combined_df['Social_Energy_Score'] = (combined_df['Social_event_attendance'] + combined_df['Going_outside']) / (combined_df['Time_spent_Alone'] + epsilon)
    combined_df['Online_Offline_Ratio'] = combined_df['Post_frequency'] / (combined_df['Social_event_attendance'] + epsilon)
    combined_df['Friends_to_Alone_Ratio'] = combined_df['Friends_circle_size'] / (combined_df['Time_spent_Alone'] + epsilon)
    for col in ['Time_spent_Alone', 'Social_event_attendance']:
        combined_df[f'{col}_sq'] = combined_df[col]**2
        
    print("Basic feature engineering complete.")

    # --- 2. Advanced Feature Engineering (NEW) ---
    print("Creating advanced features...")
    
    # Interaction between key behavioral traits
    combined_df['Social_Engagement_Index'] = combined_df['Social_event_attendance'] * combined_df['Friends_circle_size']
    combined_df['Alone_vs_Social_Time_Diff'] = combined_df['Social_event_attendance'] - combined_df['Time_spent_Alone']

    # Binning a feature to capture non-linear effects
    # We can group 'Friends_circle_size' into categories
    combined_df['Friends_group_size'] = pd.cut(combined_df['Friends_circle_size'], 
                                               bins=[-1, 5, 10, 15, 20], 
                                               labels=[0, 1, 2, 3])

    # --- 3. Missing Value Flags (from your notebook) ---
    for col in combined_df.columns:
        if combined_df[col].isnull().any():
            combined_df[f'{col}_missing_flag'] = combined_df[col].isnull().astype(int)

    print("Advanced feature engineering complete.")
    
    # --- 4. Preprocessing (Imputation, Scaling, Target Encoding) ---
    print("\nApplying KNN Imputer and Scaling...")
    
    # Fill NaN in new categorical feature before imputation
    combined_df['Friends_group_size'] = combined_df['Friends_group_size'].cat.add_categories([-1]).fillna(-1)

    imputer = KNNImputer(n_neighbors=7)
    imputed_data = imputer.fit_transform(combined_df)
    combined_df_imputed = pd.DataFrame(imputed_data, columns=combined_df.columns)

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(combined_df_imputed)
    combined_df_processed = pd.DataFrame(scaled_data, columns=combined_df_imputed.columns)

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    X_processed = combined_df_processed.iloc[:len(train_df)]
    X_test_processed = combined_df_processed.iloc[len(train_df):]

    print("\nPreprocessing complete.")
    print("Final Processed Training Data Shape:", X_processed.shape)

Basic feature engineering complete.
Creating advanced features...
Advanced feature engineering complete.

Applying KNN Imputer and Scaling...

Preprocessing complete.
Final Processed Training Data Shape: (18524, 31)


In [4]:
# --- All imports and data loading/preprocessing from your v2 notebook go here ---
# (Assume X_processed, y_encoded, X_test_processed, and le are available)

print("--- Generating predictions from the SIMPLE model (v2) ---")

# Model Configuration from your v2
N_SPLITS = 10
RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# We need to store the raw probability predictions
simple_model_test_preds = np.zeros((len(X_test_processed),))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_processed, y_encoded)):
    print(f"Simple Model - Fold {fold+1}")
    X_train, y_train = X_processed.iloc[train_idx], y_encoded[train_idx]
    X_val, y_val = X_processed.iloc[val_idx], y_encoded[val_idx]
    
    # Use the simple, default model
    model = lgb.LGBMClassifier(random_state=RANDOM_STATE, n_estimators=1000, objective='binary')
    
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='accuracy',
              callbacks=[lgb.early_stopping(100, verbose=False)])
    
    # Get the probability of the positive class (e.g., 'Extrovert')
    fold_test_preds = model.predict_proba(X_test_processed)[:, 1]
    simple_model_test_preds += fold_test_preds / N_SPLITS

print("Predictions from simple model generated.")

--- Generating predictions from the SIMPLE model (v2) ---
Simple Model - Fold 1
[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2224
[LightGBM] [Info] Number of data points in the train set: 16671, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260452 -> initscore=-1.043619
[LightGBM] [Info] Start training from score -1.043619
Simple Model - Fold 2
[LightGBM] [Info] Number of positive: 4342, number of negative: 12329
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2219
[LightGBM] 

In [7]:
# --- Use the same processed data from before ---

def objective_safer(trial):
    # SAFER search space to prevent overfitting
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'n_estimators': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08), # Slightly lower max rate
        'num_leaves': trial.suggest_int('num_leaves', 20, 60), # Lower max leaves
        'max_depth': trial.suggest_int('max_depth', 5, 10),      # Lower max depth
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'random_state': 42,
        'verbose': -1,
    }
    # ... (Rest of the objective function is the same, using skf and model.fit)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train_idx, val_idx in skf.split(X_processed, y_encoded):
        X_train, y_train = X_processed.iloc[train_idx], y_encoded[train_idx]
        X_val, y_val = X_processed.iloc[val_idx], y_encoded[val_idx]
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='accuracy', callbacks=[lgb.early_stopping(100, verbose=False)])
        preds = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, preds))
    return np.mean(accuracies)

print("\n--- Starting SAFER Optuna Optimization ---")
study = optuna.create_study(direction='maximize')
study.optimize(objective_safer, n_trials=70, show_progress_bar=True) # 30 trials is enough for a safer search
best_params = study.best_params
print("Best parameters from safer search:", best_params)

# --- Train with the safer tuned parameters ---
print("\n--- Generating predictions from the TUNED model (v3.1) ---")
final_params = { 'objective': 'binary', 'random_state': RANDOM_STATE, 'n_estimators': 2000, 'verbose': -1, **best_params }
tuned_model_test_preds = np.zeros((len(X_test_processed),))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_processed, y_encoded)):
    print(f"Tuned Model - Fold {fold+1}")
    X_train, y_train = X_processed.iloc[train_idx], y_encoded[train_idx]
    X_val, y_val = X_processed.iloc[val_idx], y_encoded[val_idx]
    model = lgb.LGBMClassifier(**final_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='accuracy', callbacks=[lgb.early_stopping(150, verbose=False)])
    
    # Get the probability of the positive class
    fold_test_preds = model.predict_proba(X_test_processed)[:, 1]
    tuned_model_test_preds += fold_test_preds / N_SPLITS

print("Predictions from tuned model generated.")

[I 2025-07-04 18:35:57,864] A new study created in memory with name: no-name-644e804e-f212-4d96-a055-4ec517f22172



--- Starting SAFER Optuna Optimization ---


  0%|          | 0/70 [00:00<?, ?it/s]

[I 2025-07-04 18:35:58,448] Trial 0 finished with value: 0.9689592897345541 and parameters: {'learning_rate': 0.07257293206649229, 'num_leaves': 34, 'max_depth': 6, 'subsample': 0.8265339369170482, 'colsample_bytree': 0.7465163789156342, 'reg_alpha': 0.2959617089121309, 'reg_lambda': 0.9286362546400216, 'min_child_samples': 27}. Best is trial 0 with value: 0.9689592897345541.
[I 2025-07-04 18:35:59,164] Trial 1 finished with value: 0.968635403094878 and parameters: {'learning_rate': 0.07584304546783488, 'num_leaves': 43, 'max_depth': 8, 'subsample': 0.7108152496680548, 'colsample_bytree': 0.9462653822435309, 'reg_alpha': 0.9328779406952297, 'reg_lambda': 0.4815295638788598, 'min_child_samples': 21}. Best is trial 0 with value: 0.9689592897345541.
[I 2025-07-04 18:35:59,740] Trial 2 finished with value: 0.9688513275213287 and parameters: {'learning_rate': 0.05352092293308994, 'num_leaves': 21, 'max_depth': 9, 'subsample': 0.8341649387620014, 'colsample_bytree': 0.814355228035436, 'reg_a

In [8]:
print("\n--- Creating the ENSEMBLE submission ---")

# Simple average of the two models' probability predictions
ensemble_preds_proba = (simple_model_test_preds + tuned_model_test_preds) / 2

# Convert the averaged probabilities to final class labels (0 or 1)
# The threshold of 0.5 is standard for binary classification.
ensemble_preds_encoded = (ensemble_preds_proba > 0.5).astype(int)

# Inverse transform to get the original string labels
final_ensemble_preds = le.inverse_transform(ensemble_preds_encoded)

# Create the final submission DataFrame
submission_df = pd.DataFrame({'id': test_df['id'], 'Personality': final_ensemble_preds})
submission_df.to_csv('submission_ensemble_v4.csv', index=False)

print("\nEnsemble submission file created successfully: submission_ensemble_v4.csv")
display(submission_df.head())


--- Creating the ENSEMBLE submission ---

Ensemble submission file created successfully: submission_ensemble_v4.csv


Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
