In [None]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder

# Load the saved model package (model, scaler, encoders and training metadata).
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load a
# .pkl file that comes from a trusted source.
print("📦 Loading saved model package...")
model_package = joblib.load('home_credit_improved_model.pkl')

model = model_package['model']
scaler = model_package['scaler']
label_encoders = model_package['label_encoders']
feature_names = model_package['feature_names']
train_median = model_package['train_median']
threshold = model_package['threshold']

print(f"✅ Loaded {model_package['method_used']} model with AUC: {model_package['performance_metrics']['auc']:.4f}")

# Load new data and keep the applicant ids, because the id column may be
# dropped when the frame is restricted to the training features below.
print("📂 Loading application_test.csv...")
new_data = pd.read_csv('application_test.csv')
original_ids = new_data['SK_ID_CURR'].copy()

# Make sure only the features used during training are present.
print("🔧 Preprocessing new data...")
missing_features = set(feature_names) - set(new_data.columns)
extra_features = set(new_data.columns) - set(feature_names)

if missing_features:
    print(f"⚠️  Adding missing features: {len(missing_features)}")

if extra_features:
    print(f"⚠️  Removing extra features: {len(extra_features)}")

# One reindex adds every missing feature (filled with 0), drops every extra
# column and puts the columns in training order. Doing it in a single call
# avoids inserting the missing columns one at a time, which fragments the frame.
# NOTE(review): a constant 0 is not NaN, so these columns are skipped by the
# later median imputation, and categorical ones reach the encoder as the
# string '0'. Presumably these are engineered features that should be
# recomputed for the new data instead — TODO confirm.
new_data = new_data.reindex(columns=feature_names, fill_value=0)

# Robust categorical encoding
print("🔤 Encoding categorical features...")

def robust_label_encode(series, label_encoder, column_name):
    """Encode a categorical column with a fitted encoder, tolerating unseen values.

    Values are normalised before lookup: missing entries become ``'Unknown'``,
    everything is converted to ``str`` and surrounding whitespace is stripped.
    The encoder's classes are normalised the same way, so the comparison and
    the encoding always use the same keys.

    Values that match no training class are encoded as the ``'Unknown'`` class
    when the encoder has one, otherwise as ``label_encoder.classes_[0]``.
    That fallback is simply the first stored class, not necessarily the most
    frequent one.

    Args:
        series: pandas Series holding the raw column values.
        label_encoder: fitted encoder exposing ``classes_``; the returned codes
            are positions in that array.
        column_name: column label, used only in progress messages.

    Returns:
        numpy.ndarray of ``int64`` codes, one per row of ``series``.

    Raises:
        ValueError: if the encoder has no classes to map onto.
    """
    print(f"   Processing: {column_name}")

    # Normalise the incoming values: fill missing, cast to str, strip.
    cleaned = series.fillna('Unknown').astype(str).str.strip()

    # Normalised training label -> integer code. setdefault keeps the first
    # code if two raw classes collapse to the same normalised label.
    code_by_label = {}
    for code, raw_class in enumerate(label_encoder.classes_):
        code_by_label.setdefault(str(raw_class).strip(), code)

    if not code_by_label:
        raise ValueError(f"Label encoder for '{column_name}' has no classes")

    # Values present in the new data but never seen during training.
    unknown_categories = set(cleaned.unique()) - set(code_by_label)

    # Prefer an explicit 'Unknown' class; otherwise use the first class.
    if 'Unknown' in code_by_label:
        fallback_code = code_by_label['Unknown']
    else:
        fallback_code = 0

    if unknown_categories:
        print(f"      ⚠️  Found {len(unknown_categories)} unknown categories")
        print(f"      Examples: {sorted(unknown_categories)[:3]}")

        if 'Unknown' not in code_by_label:
            fallback_class = label_encoder.classes_[0]
            print(f"      ⚠️  'Unknown' not in training classes for {column_name}")
            print(f"      Training classes: {sorted(code_by_label)}")
            print(f"      → Mapping unknown values to: '{fallback_class}'")

    # Encode through the mapping so that one unexpected value can never force
    # the whole column onto a single class; unmatched rows get the fallback.
    codes = cleaned.map(code_by_label).fillna(fallback_code).astype(np.int64)
    print(f"      ✅ Successfully encoded")
    return codes.to_numpy()

# Encode every categorical column that has a fitted encoder and is present
# in the aligned frame.
for col, le in label_encoders.items():
    if col not in new_data.columns:
        continue
    new_data[col] = robust_label_encode(new_data[col], le, col)

print("✅ Categorical encoding completed")

# Fill remaining gaps with the medians computed on the TRAINING set, then
# re-select the columns so their order matches training.
print("🔧 Imputing missing values...")
new_data = new_data.fillna(train_median)[feature_names]

# Apply the scaler that was fitted during training.
print("⚖️ Scaling features...")
new_data_scaled = scaler.transform(new_data)

scaled_shape = new_data_scaled.shape
print(f"✅ Preprocessing completed. Data shape: {scaled_shape}")

# Sanity-check the preprocessed matrix.
print("\n🔍 Preprocessing Verification:")
print(f"Final data shape: {scaled_shape}")
print(f"Any NaN values: {np.isnan(new_data_scaled).sum()}")
print(f"Data type: {new_data_scaled.dtype}")

# Score the applicants: take column 1 of predict_proba, then apply the stored
# threshold to get a 0/1 decision.
print("\n🔮 Making predictions...")
predictions_proba = model.predict_proba(new_data_scaled)[:, 1]
predictions_binary = (predictions_proba >= threshold).astype(int)

# Assemble the result table, one row per applicant id.
print("📊 Creating submission file...")
result_columns = {
    'SK_ID_CURR': original_ids,
    'TARGET': predictions_binary,
    'PREDICTION_PROBABILITY': predictions_proba,
}
results_df = pd.DataFrame(result_columns)

# Persist the results.
results_df.to_csv('predictions_test.csv', index=False)
print(f"✅ Predictions saved! Shape: {results_df.shape}")

# Summarise the prediction distribution.
default_mask = predictions_binary == 1
non_default_mask = predictions_binary == 0

print(f"\n📈 Prediction Summary:")
print(f"Total predictions: {len(results_df)}")
print(f"Predicted defaults (1): {default_mask.sum()} ({default_mask.mean()*100:.2f}%)")
print(f"Predicted no defaults (0): {non_default_mask.sum()} ({non_default_mask.mean()*100:.2f}%)")
print(f"Average probability: {predictions_proba.mean():.4f}")
print(f"Probability range: {predictions_proba.min():.4f} - {predictions_proba.max():.4f}")

print("\n🎉 All done! Check 'predictions_test.csv' for results.")

📦 Loading saved model package...
✅ Loaded Hyperparameter Tuned model with AUC: 0.7374
📂 Loading application_test.csv...
🔧 Preprocessing new data...
⚠️  Adding missing features: 85
⚠️  Removing extra features: 76
🔤 Encoding categorical features...
   Processing: OCCUPATION_TYPE
      ✅ Successfully encoded
   Processing: ORGANIZATION_TYPE
      ⚠️  Found 7 unknown categories
      Examples: ['Industry: type 8', 'Industry: type 6', 'Trade: type 4']
      ✅ Successfully encoded
   Processing: AGE_GROUP
      ⚠️  Found 1 unknown categories
      Examples: ['0']
      ⚠️  'Unknown' not in training classes for AGE_GROUP
      Training classes: ['Adult', 'Middle', 'Senior', 'Elder', 'Young']
      → Mapping unknown values to: 'Adult'
      ✅ Successfully encoded
   Processing: INCOME_CATEGORY
      ⚠️  Found 1 unknown categories
      Examples: ['0']
      ⚠️  'Unknown' not in training classes for INCOME_CATEGORY
      Training classes: ['High', 'Medium', 'Very_High', 'Low']
      → Mapping u

In [None]:
# Score the preprocessed data and apply the stored decision threshold.
print("🎯 Making predictions...")
predictions_proba = model.predict_proba(new_data_scaled)[:, 1]
predictions_binary = (predictions_proba >= threshold).astype(int)

# Result table: applicant id, probability, and thresholded decision.
result_columns = {
    'SK_ID_CURR': original_ids,
    'TARGET_PROBA': predictions_proba,
    'TARGET_PREDICTION': predictions_binary,
}
results_df = pd.DataFrame(result_columns)

# Write the table to disk.
output_file = 'final_predictions.csv'
results_df.to_csv(output_file, index=False)

# Report how the decisions are distributed.
n_total = len(predictions_binary)
n_default = predictions_binary.sum()

print(f"✅ Predictions saved to {output_file}")
print(f"📊 Prediction distribution:")
print(f"   - Default predictions (1): {n_default}")
print(f"   - Non-default predictions (0): {n_total - n_default}")
print(f"   - Default rate: {n_default/n_total*100:.2f}%")

🎯 Making predictions...
✅ Predictions saved to final_predictions.csv
📊 Prediction distribution:
   - Default predictions (1): 241
   - Non-default predictions (0): 48503
   - Default rate: 0.49%


In [None]:
# Additional analysis of the predicted probabilities.
proba_min = predictions_proba.min()
proba_max = predictions_proba.max()
proba_mean = predictions_proba.mean()

print("\n📈 Prediction Analysis:")
print(f"   - Min probability: {proba_min:.4f}")
print(f"   - Max probability: {proba_max:.4f}")
print(f"   - Mean probability: {proba_mean:.4f}")

# Print the share of applicants flagged at several thresholds (nothing is
# written to disk here).
for thresh in (0.3, 0.5, 0.7):
    binary_preds = (predictions_proba >= thresh).astype(int)
    default_rate = binary_preds.sum() / len(binary_preds)
    print(f"   - Default rate at threshold {thresh}: {default_rate*100:.2f}%")

# Recommendations for the business team (printed verbatim).
print("\n💡 Business Recommendations:")
recommendations = (
    "1. Prioritasi applicant dengan probability > 0.7 untuk review manual",
    "2. Applicant dengan probability < 0.3 dapat diapprove secara otomatis",
    "3. Buat segmentasi risk-based pricing berdasarkan probability score",
)
for recommendation in recommendations:
    print(recommendation)


📈 Prediction Analysis:
   - Min probability: 0.0000
   - Max probability: 1.0000
   - Mean probability: 0.3283
   - Default rate at threshold 0.3: 56.02%
   - Default rate at threshold 0.5: 6.26%
   - Default rate at threshold 0.7: 0.48%

💡 Business Recommendations:
1. Prioritasi applicant dengan probability > 0.7 untuk review manual
2. Applicant dengan probability < 0.3 dapat diapprove secara otomatis
3. Buat segmentasi risk-based pricing berdasarkan probability score


In [None]:
# Quality check: compare the preprocessed frame with what the model expects.
expected_feature_count = len(feature_names)
actual_feature_count = new_data.shape[1]
remaining_missing = pd.DataFrame(new_data).isnull().sum().sum()
shape_matches_model = new_data_scaled.shape[1] == model.n_features_in_

print("\n🔍 Quality Check:")
print(f"   - Expected features: {expected_feature_count}")
print(f"   - Features in new data: {actual_feature_count}")
print(f"   - Missing values after imputation: {remaining_missing}")
print(f"   - Data shape consistency: {shape_matches_model}")

# Show the first ten rows with probabilities rounded for readability.
print("\n👀 Sample predictions:")
sample_results = results_df.head(10).copy()
rounded_proba = sample_results['TARGET_PROBA'].round(4)
sample_results['TARGET_PROBA'] = rounded_proba
print(sample_results)


🔍 Quality Check:
   - Expected features: 130
   - Features in new data: 130
   - Missing values after imputation: 0
   - Data shape consistency: True

👀 Sample predictions:
   SK_ID_CURR  TARGET_PROBA  TARGET_PREDICTION
0      100001        0.1825                  0
1      100005        0.3639                  0
2      100013        0.4263                  0
3      100028        0.4424                  0
4      100038        0.4151                  0
5      100042        0.2294                  0
6      100057        0.2895                  0
7      100065        0.2789                  0
8      100066        0.3013                  0
9      100067        0.3156                  0


In [None]:
# Executive summary.
# NOTE(review): the AUC and cost-savings figures are read from the saved
# model package, not computed from the predictions made in this notebook.
separator = "=" * 60
metrics = model_package['performance_metrics']
n_applicants = len(results_df)
n_rejected = predictions_binary.sum()
rejection_share = n_rejected / len(predictions_binary) * 100

print("\n" + separator)
print("📋 EXECUTIVE SUMMARY")
print(separator)
print(f"Model Used: {model_package['method_used']}")
print(f"Model AUC: {metrics['auc']:.4f}")
print(f"Optimal Threshold: {threshold:.3f}")
print(f"Total Applicants: {n_applicants:,}")
print(f"Recommended for Rejection: {n_rejected:,}")
print(f"Predicted Default Rate: {rejection_share:.1f}%")
print(f"Estimated Cost Savings: ${metrics['business_impact']['Cost Savings']:,}")
print(separator)


📋 EXECUTIVE SUMMARY
Model Used: Hyperparameter Tuned
Model AUC: 0.7374
Optimal Threshold: 0.699
Total Applicants: 48,744
Recommended for Rejection: 241
Predicted Default Rate: 0.5%
Estimated Cost Savings: $85,000
