# SafeLend - Feature Engineering Checks

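This notebook validates the engineered features for the SafeLend project: it checks data quality, looks for potential target leakage, compares train and test distributions, and summarizes the findings.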

## Overview
- Feature engineering validation
- Data quality checks
- Feature distribution comparisons
- Model feature importance analysis


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet + seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Read the processed train/test frames from parquet
print("Loading processed data...")
train_df = pd.read_parquet('../Data/processed/train_modeling.parquet')
test_df = pd.read_parquet('../Data/processed/test_modeling.parquet')

# Report the dimensions of both frames
for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{label} shape: {frame.shape}")

# Report the label balance when the label column is available
if 'TARGET' not in train_df.columns:
    print("No TARGET column found in train data")
else:
    target_counts = train_df['TARGET'].value_counts().to_dict()
    print(f"Target distribution: {target_counts}")


In [None]:
# Feature quality checks
# Flags constant, near-constant, mostly-missing and near-zero-variance
# columns of train_df. Every column except 'TARGET' is treated as a feature.
print("=== FEATURE QUALITY CHECKS ===")

# Check for constant features (at most one distinct non-null value)
constant_features = [
    col for col in train_df.columns
    if col != 'TARGET' and train_df[col].nunique() <= 1
]

print(f"Constant features found: {len(constant_features)}")
if constant_features:
    print(f"Constant features: {constant_features}")

# Check for near-constant features (99% same value)
near_constant_features = []
for col in train_df.columns:
    if col != 'TARGET':
        value_counts = train_df[col].value_counts()
        # value_counts() ignores missing values, so an all-missing column
        # yields an empty result and iloc[0] would raise IndexError.
        # Such a column has no dominant value, so it is skipped here.
        if value_counts.empty:
            continue
        # Share of ALL rows (missing ones included) taken by the most
        # frequent value; value_counts() is sorted by descending frequency.
        max_freq = value_counts.iloc[0] / len(train_df)
        if max_freq > 0.99:
            near_constant_features.append(col)

print(f"Near-constant features (>99% same value): {len(near_constant_features)}")
if near_constant_features:
    print(f"Near-constant features: {near_constant_features}")

# Check for features with high missing values
missing_threshold = 0.5  # 50%
high_missing_features = []
for col in train_df.columns:
    if col != 'TARGET':
        # Mean of the boolean null mask == fraction of missing rows
        missing_pct = train_df[col].isnull().mean()
        if missing_pct > missing_threshold:
            high_missing_features.append((col, missing_pct))

print(f"Features with >{missing_threshold*100}% missing values: {len(high_missing_features)}")
if high_missing_features:
    print("High missing features:")
    for col, pct in high_missing_features:
        print(f"  {col}: {pct:.2%}")

# Feature variance analysis
# numeric_features is reused by the later cells of this notebook.
numeric_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
if 'TARGET' in numeric_features:
    numeric_features.remove('TARGET')

feature_variance = train_df[numeric_features].var().sort_values(ascending=True)
low_variance_features = feature_variance[feature_variance < 1e-6]

print(f"Features with very low variance (< 1e-6): {len(low_variance_features)}")
if len(low_variance_features) > 0:
    print("Low variance features:")
    for col, var in low_variance_features.items():
        print(f"  {col}: {var:.2e}")


In [None]:
# Data leakage checks
# Looks for features that are suspiciously predictive of TARGET, features
# with extreme tails, and features whose set of distinct values is the same
# in train and test.
print("=== DATA LEAKAGE CHECKS ===")

if 'TARGET' in train_df.columns:
    # Correlation of every numeric feature with the target
    target_corr = train_df[numeric_features + ['TARGET']].corr()['TARGET'].drop('TARGET')

    # Check for features with perfect correlation with target
    perfect_corr = target_corr[abs(target_corr) > 0.99]

    print(f"Features with perfect correlation with target: {len(perfect_corr)}")
    if len(perfect_corr) > 0:
        print("Perfect correlation features:")
        for col, corr in perfect_corr.items():
            print(f"  {col}: {corr:.4f}")

    # Check for features that are too predictive (potential leakage).
    # This must stay inside the TARGET guard: target_corr does not exist
    # otherwise and the check would fail with a NameError.
    high_corr_features = target_corr[abs(target_corr) > 0.8]
    print(f"Features with very high correlation (>0.8): {len(high_corr_features)}")
    if len(high_corr_features) > 0:
        print("High correlation features:")
        for col, corr in high_corr_features.items():
            print(f"  {col}: {corr:.4f}")
else:
    print("No TARGET column found in train data - skipping target correlation checks")

# Check for features with unrealistic distributions
print("\n=== UNREALISTIC DISTRIBUTION CHECKS ===")

# Check for features with extreme values
extreme_features = []
for col in numeric_features:
    if col in train_df.columns:
        q99 = train_df[col].quantile(0.99)
        q01 = train_df[col].quantile(0.01)
        max_val = train_df[col].max()
        min_val = train_df[col].min()

        # Sign-aware "10x beyond the percentile" limits. A plain
        # `q * 10` only works when q99 is positive and q01 is negative;
        # e.g. `min_val < q01 * 10` is true for nearly every feature with
        # a positive q01. These limits equal 10 * q in the cases where the
        # plain product was meaningful.
        upper_limit = q99 + 9 * abs(q99)
        lower_limit = q01 - 9 * abs(q01)

        # Check for extreme outliers
        if max_val > upper_limit or min_val < lower_limit:
            extreme_features.append((col, min_val, max_val, q01, q99))

print(f"Features with extreme outliers: {len(extreme_features)}")
if extreme_features:
    print("Extreme outlier features:")
    for col, min_val, max_val, q01, q99 in extreme_features:
        print(f"  {col}: min={min_val:.2f}, max={max_val:.2f}, q01={q01:.2f}, q99={q99:.2f}")

# Check for features with suspicious patterns
print("\n=== SUSPICIOUS PATTERN CHECKS ===")

# Check for features that are identical between train and test.
# "Identical" here means the two frames contain exactly the same set of
# distinct non-null values; frequencies and row order are not compared.
identical_features = []
for col in train_df.columns:
    if col != 'TARGET' and col in test_df.columns:
        train_unique = set(train_df[col].dropna().unique())
        test_unique = set(test_df[col].dropna().unique())
        if train_unique == test_unique:
            identical_features.append(col)

print(f"Features identical between train and test: {len(identical_features)}")
if identical_features:
    print(f"Identical features: {identical_features}")


In [None]:
# Feature distribution comparison
print("=== FEATURE DISTRIBUTION COMPARISON ===")

# Features present in both frames (the label column is left out)
common_features = [
    name for name in train_df.columns
    if name != 'TARGET' and name in test_df.columns
]
print(f"Comparing {len(common_features)} common features between train and test")

# Two-sample tests come from scipy
from scipy import stats

# Only the first 20 common features are tested to keep the output short
significant_differences = []
for col in common_features[:20]:
    if col not in numeric_features:
        continue

    train_vals = train_df[col].dropna()
    test_vals = test_df[col].dropna()

    # Both samples need more than 100 observations
    if len(train_vals) <= 100 or len(test_vals) <= 100:
        continue

    # Kolmogorov-Smirnov and two-sided Mann-Whitney U tests
    ks_stat, ks_pvalue = stats.ks_2samp(train_vals, test_vals)
    mw_stat, mw_pvalue = stats.mannwhitneyu(train_vals, test_vals, alternative='two-sided')

    # Keep the feature when either test rejects at the 5% level
    if not (ks_pvalue < 0.05 or mw_pvalue < 0.05):
        continue

    significant_differences.append({
        'feature': col,
        'ks_pvalue': ks_pvalue,
        'mw_pvalue': mw_pvalue,
        'train_mean': train_vals.mean(),
        'test_mean': test_vals.mean()
    })

print(f"Features with significant distribution differences: {len(significant_differences)}")
if significant_differences:
    print("Significant differences (p < 0.05):")
    for diff in significant_differences:
        print(f"  {diff['feature']}: KS p={diff['ks_pvalue']:.4f}, MW p={diff['mw_pvalue']:.4f}")
        print(f"    Train mean: {diff['train_mean']:.4f}, Test mean: {diff['test_mean']:.4f}")

# Plot a histogram overlay and a Q-Q plot for each key feature that exists
# in both frames
key_features = ['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'AMT_ANNUITY']
key_features = [f for f in key_features if f in common_features]

if key_features:
    n_rows = len(key_features)
    plt.figure(figsize=(15, 5 * n_rows))

    for row, feature in enumerate(key_features):
        train_feature = train_df[feature].dropna()
        test_feature = test_df[feature].dropna()

        # Left panel: overlaid density histograms on a log y-axis
        plt.subplot(n_rows, 2, 2*row + 1)
        plt.hist(train_feature, bins=50, alpha=0.7, label='Train', density=True)
        plt.hist(test_feature, bins=50, alpha=0.7, label='Test', density=True)
        plt.title(f'{feature} Distribution Comparison')
        plt.legend()
        plt.yscale('log')

        # Right panel: percentiles 0..100 of train vs test, with the
        # y = x reference line drawn across the train range
        plt.subplot(n_rows, 2, 2*row + 2)
        train_q = np.percentile(train_feature, range(0, 101))
        test_q = np.percentile(test_feature, range(0, 101))
        plt.plot(train_q, test_q, 'o', markersize=2)
        reference = [train_q.min(), train_q.max()]
        plt.plot(reference, reference, 'r--')
        plt.title(f'{feature} Q-Q Plot')
        plt.xlabel('Train Quantiles')
        plt.ylabel('Test Quantiles')

    plt.tight_layout()
    plt.show()


In [None]:
# Extreme values check
print("=== EXTREME VALUES CHECK ===")

# IQR rule on the first 20 numeric features: a row is an outlier when it
# lies more than 1.5 * IQR outside the quartiles
extreme_outliers = {}
for col in numeric_features[:20]:
    if col not in train_df.columns:
        continue

    column = train_df[col]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    outside = (column < lower_bound) | (column > upper_bound)
    outlier_count = int(outside.sum())
    outlier_pct = outlier_count / len(train_df) * 100

    # Only features with more than 5% outliers are recorded
    if outlier_pct > 5:
        extreme_outliers[col] = {
            'outlier_count': outlier_count,
            'outlier_percentage': outlier_pct,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        }

print(f"Features with >5% extreme outliers: {len(extreme_outliers)}")
if extreme_outliers:
    print("Extreme outlier features:")
    for col, info in extreme_outliers.items():
        print(f"  {col}: {info['outlier_count']} outliers ({info['outlier_percentage']:.1f}%)")

# Count +/-inf entries in every numeric feature
infinite_features = []
for col in numeric_features:
    if col not in train_df.columns:
        continue
    infinite_count = np.isinf(train_df[col]).sum()
    if infinite_count > 0:
        infinite_features.append((col, infinite_count))

print(f"\nFeatures with infinite values: {len(infinite_features)}")
if infinite_features:
    print("Infinite value features:")
    for col, count in infinite_features:
        print(f"  {col}: {count} infinite values")

# Flag any of the first 20 numeric features whose values leave [-1e6, 1e6]
suspicious_ranges = []
for col in numeric_features[:20]:
    if col not in train_df.columns:
        continue
    min_val = train_df[col].min()
    max_val = train_df[col].max()
    if min_val < -1e6 or max_val > 1e6:
        suspicious_ranges.append((col, min_val, max_val))

print(f"\nFeatures with suspicious ranges: {len(suspicious_ranges)}")
if suspicious_ranges:
    print("Suspicious range features:")
    for col, min_val, max_val in suspicious_ranges:
        print(f"  {col}: [{min_val:.2e}, {max_val:.2e}]")


In [None]:
# Feature correlation analysis
print("=== FEATURE CORRELATION ANALYSIS ===")

# Pairwise correlation of all numeric features
correlation_matrix = train_df[numeric_features].corr()

# Walk the strict upper triangle (each unordered pair once, row by row)
# and keep the pairs whose absolute correlation exceeds 0.8
corr_names = correlation_matrix.columns
corr_values = correlation_matrix.to_numpy()
upper_rows, upper_cols = np.triu_indices(len(corr_names), k=1)

high_corr_pairs = [
    {
        'feature1': corr_names[r],
        'feature2': corr_names[c],
        'correlation': corr_values[r, c]
    }
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr_values[r, c]) > 0.8
]

print(f"Highly correlated feature pairs (|correlation| > 0.8): {len(high_corr_pairs)}")
if high_corr_pairs:
    print("High correlation pairs:")
    for pair in high_corr_pairs:
        print(f"  {pair['feature1']} <-> {pair['feature2']}: {pair['correlation']:.4f}")

# Heatmap of the 15 features most correlated (in absolute value) with TARGET
if 'TARGET' in train_df.columns:
    with_target = train_df[numeric_features + ['TARGET']].corr()
    target_corr = with_target['TARGET'].drop('TARGET')
    ranked = target_corr.abs().sort_values(ascending=False)
    top_features = ranked.head(15).index.tolist()

    if top_features:
        corr_subset = correlation_matrix.loc[top_features, top_features]

        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_subset, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
        plt.title('Correlation Matrix - Top Features by Target Correlation')
        plt.tight_layout()
        plt.show()

# Header for the VIF (Variance Inflation Factor) section that follows
print("\n=== MULTICOLLINEARITY ANALYSIS ===")

# Calculate VIF for top features (simplified version)
def calculate_vif(df, features, max_features=10, min_samples=100):
    """Return the variance inflation factor of the leading features.

    Each of the first ``max_features`` entries of ``features`` is regressed
    (ordinary least squares with an intercept) on all the other columns in
    ``features``; its VIF is ``1 / (1 - R^2)`` of that regression.

    Args:
        df: DataFrame holding the columns named in ``features``.
        features: Column names; ALL of them act as regressors, but only the
            first ``max_features`` get a VIF (kept small for performance).
        max_features: Number of leading features to evaluate.
        min_samples: The rows without missing or infinite values in any of
            ``features`` must number more than this, otherwise nothing is
            computed.

    Returns:
        DataFrame with columns ``feature`` and ``vif``, one row per evaluated
        feature. A feature that is an exact linear combination of the others
        gets ``vif == inf`` rather than being dropped, so the most severe
        multicollinearity is reported instead of hidden. Features that are
        constant on the usable rows have no defined VIF and are omitted. The
        frame is empty when there are too few usable rows.
    """
    columns = ['feature', 'vif']
    features = list(features)
    if not features:
        return pd.DataFrame(columns=columns)

    # A row is usable for one regression iff every column in `features` is
    # present, so the row filter is identical for all features: build it once.
    values = df[features].to_numpy(dtype=float, na_value=np.nan)
    usable = values[np.isfinite(values).all(axis=1)]
    if len(usable) <= min_samples:
        return pd.DataFrame(columns=columns)

    # Centering every column is equivalent to fitting an intercept.
    centered = usable - usable.mean(axis=0)

    vif_data = []
    for position, feature in enumerate(features[:max_features]):
        raw = usable[:, position]
        if raw.max() == raw.min():
            # Zero variance: R^2 (and therefore VIF) is undefined.
            continue

        y = centered[:, position]
        X = np.delete(centered, position, axis=1)
        if X.shape[1]:
            coef, *_ = np.linalg.lstsq(X, y, rcond=None)
            residual = y - X @ coef
        else:
            # No other regressors: the intercept-only fit explains nothing.
            residual = y

        r_squared = 1.0 - float(residual @ residual) / float(y @ y)
        # Only an exact fit would divide by zero; report it as infinite.
        vif = np.inf if r_squared >= 1.0 else 1.0 / (1.0 - r_squared)
        vif_data.append({'feature': feature, 'vif': vif})

    return pd.DataFrame(vif_data, columns=columns)

# Compute VIFs over the numeric features and report those above 5
vif_df = calculate_vif(train_df, numeric_features)
if len(vif_df) == 0:
    print("VIF calculation not possible (insufficient data or perfect correlation)")
else:
    # VIF > 5 is the multicollinearity cut-off used in this notebook
    high_vif = vif_df[vif_df['vif'] > 5]
    print(f"Features with high VIF (>5): {len(high_vif)}")
    if len(high_vif) == 0:
        print("No features with high VIF detected")
    else:
        print("High VIF features:")
        for feature_name, vif_value in zip(high_vif['feature'], high_vif['vif']):
            print(f"  {feature_name}: VIF = {vif_value:.2f}")


In [None]:
# Feature stability check
# Compares each of the first 20 common numeric features between train and
# test using mean/std shifts and a simplified Population Stability Index.
print("=== FEATURE STABILITY CHECK ===")

# Check feature stability over different time periods (if applicable)
# For this analysis, we'll check stability between train and test sets

stability_results = []
for col in common_features[:20]:  # Check first 20 features
    if col in numeric_features:
        train_vals = train_df[col].dropna()
        test_vals = test_df[col].dropna()

        if len(train_vals) > 100 and len(test_vals) > 100:
            # Calculate stability metrics
            train_mean = train_vals.mean()
            test_mean = test_vals.mean()
            train_std = train_vals.std()
            test_std = test_vals.std()

            # Population Stability Index (PSI)
            # Simplified PSI calculation: 10 equal-width bins are derived
            # from the train values and reused for the test values.
            train_hist, train_bins = np.histogram(train_vals, bins=10)
            test_hist, _ = np.histogram(test_vals, bins=train_bins)

            train_pct = train_hist / len(train_vals)
            test_pct = test_hist / len(test_vals)

            # Avoid division by zero / log(0) for empty bins
            train_pct = np.where(train_pct == 0, 1e-6, train_pct)
            test_pct = np.where(test_pct == 0, 1e-6, test_pct)

            psi = np.sum((train_pct - test_pct) * np.log(train_pct / test_pct))

            # Relative shifts are taken against the magnitude of the train
            # statistic. Dividing by a signed mean would give a negative
            # percentage for negative-mean features, which could never
            # exceed the 10% threshold applied below.
            stability_results.append({
                'feature': col,
                'train_mean': train_mean,
                'test_mean': test_mean,
                'mean_diff_pct': abs(train_mean - test_mean) / abs(train_mean) * 100 if train_mean != 0 else 0,
                'train_std': train_std,
                'test_std': test_std,
                'std_diff_pct': abs(train_std - test_std) / abs(train_std) * 100 if train_std != 0 else 0,
                'psi': psi
            })

# Analyze stability results
if stability_results:
    stability_df = pd.DataFrame(stability_results)

    # Features with high PSI (>0.2 indicates significant shift)
    high_psi = stability_df[stability_df['psi'] > 0.2]
    print(f"Features with high PSI (>0.2): {len(high_psi)}")
    if len(high_psi) > 0:
        print("High PSI features:")
        for _, row in high_psi.iterrows():
            print(f"  {row['feature']}: PSI = {row['psi']:.4f}")

    # Features with significant mean shift (>10%)
    high_mean_shift = stability_df[stability_df['mean_diff_pct'] > 10]
    print(f"\nFeatures with significant mean shift (>10%): {len(high_mean_shift)}")
    if len(high_mean_shift) > 0:
        print("High mean shift features:")
        for _, row in high_mean_shift.iterrows():
            print(f"  {row['feature']}: Mean diff = {row['mean_diff_pct']:.1f}%")

    # Visualize stability metrics
    plt.figure(figsize=(15, 5))

    # Scatter of PSI against mean shift, with both thresholds marked
    plt.subplot(1, 3, 1)
    plt.scatter(stability_df['psi'], stability_df['mean_diff_pct'])
    plt.xlabel('PSI')
    plt.ylabel('Mean Difference (%)')
    plt.title('PSI vs Mean Difference')
    plt.axhline(y=10, color='r', linestyle='--', alpha=0.5)
    plt.axvline(x=0.2, color='r', linestyle='--', alpha=0.5)

    plt.subplot(1, 3, 2)
    stability_df['psi'].hist(bins=20)
    plt.xlabel('PSI')
    plt.ylabel('Count')
    plt.title('PSI Distribution')

    plt.subplot(1, 3, 3)
    stability_df['mean_diff_pct'].hist(bins=20)
    plt.xlabel('Mean Difference (%)')
    plt.ylabel('Count')
    plt.title('Mean Difference Distribution')

    plt.tight_layout()
    plt.show()

else:
    print("No stability analysis possible (insufficient data)")


In [None]:
# Feature engineering summary
# Collects the counts produced by the earlier cells, prints them together
# with recommendations, and writes the counts to a JSON file.
print("=== FEATURE ENGINEERING SUMMARY ===")

# Compile all findings
summary = {
    # Count every column except TARGET; subtracting a fixed 1 would
    # under-count when the frame has no TARGET column.
    'total_features': sum(col != 'TARGET' for col in train_df.columns),
    'constant_features': len(constant_features),
    'near_constant_features': len(near_constant_features),
    'high_missing_features': len(high_missing_features),
    'low_variance_features': len(low_variance_features),
    # perfect_corr only exists when the leakage cell found a TARGET column
    'perfect_corr_features': len(perfect_corr) if 'TARGET' in train_df.columns else 0,
    'high_corr_pairs': len(high_corr_pairs),
    'extreme_outlier_features': len(extreme_outliers),
    'infinite_value_features': len(infinite_features),
    'suspicious_range_features': len(suspicious_ranges),
    'identical_features': len(identical_features),
    'significant_dist_diff': len(significant_differences),
    # high_psi / high_vif are only defined when their cells had data
    'high_psi_features': len(high_psi) if 'high_psi' in locals() else 0,
    'high_vif_features': len(high_vif) if 'high_vif' in locals() and len(vif_df) > 0 else 0
}

print("FEATURE QUALITY SUMMARY:")
print(f"  Total features analyzed: {summary['total_features']}")
print(f"  Constant features: {summary['constant_features']}")
print(f"  Near-constant features: {summary['near_constant_features']}")
print(f"  High missing value features: {summary['high_missing_features']}")
print(f"  Low variance features: {summary['low_variance_features']}")
print(f"  Perfect correlation with target: {summary['perfect_corr_features']}")
print(f"  High correlation pairs: {summary['high_corr_pairs']}")
print(f"  Extreme outlier features: {summary['extreme_outlier_features']}")
print(f"  Infinite value features: {summary['infinite_value_features']}")
print(f"  Suspicious range features: {summary['suspicious_range_features']}")
print(f"  Identical train/test features: {summary['identical_features']}")
print(f"  Significant distribution differences: {summary['significant_dist_diff']}")
print(f"  High PSI features: {summary['high_psi_features']}")
print(f"  High VIF features: {summary['high_vif_features']}")

# Recommendations (a line is printed only for non-zero counts)
print("\nRECOMMENDATIONS:")
print("1. FEATURE REMOVAL:")
if summary['constant_features'] > 0:
    print(f"   - Remove {summary['constant_features']} constant features")
if summary['near_constant_features'] > 0:
    print(f"   - Consider removing {summary['near_constant_features']} near-constant features")
if summary['perfect_corr_features'] > 0:
    print(f"   - Remove {summary['perfect_corr_features']} features with perfect correlation (potential leakage)")

print("\n2. FEATURE ENGINEERING:")
if summary['high_missing_features'] > 0:
    print(f"   - Implement robust missing value handling for {summary['high_missing_features']} features")
if summary['extreme_outlier_features'] > 0:
    print(f"   - Apply outlier treatment to {summary['extreme_outlier_features']} features")
if summary['high_corr_pairs'] > 0:
    print(f"   - Consider feature selection to reduce {summary['high_corr_pairs']} high correlation pairs")

print("\n3. DATA QUALITY:")
if summary['infinite_value_features'] > 0:
    print(f"   - Handle infinite values in {summary['infinite_value_features']} features")
if summary['suspicious_range_features'] > 0:
    print(f"   - Investigate suspicious ranges in {summary['suspicious_range_features']} features")

print("\n4. MODEL CONSIDERATIONS:")
if summary['significant_dist_diff'] > 0:
    print(f"   - Monitor {summary['significant_dist_diff']} features with distribution shifts")
if summary['high_psi_features'] > 0:
    print(f"   - Re-evaluate {summary['high_psi_features']} features with high PSI")
if summary['high_vif_features'] > 0:
    print(f"   - Address multicollinearity in {summary['high_vif_features']} features")

# Save summary (every value is a plain int, so it serializes directly)
import json
with open('../Data/processed/feature_checks_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\n✅ Feature checks summary saved to: ../Data/processed/feature_checks_summary.json")
print("\n🎯 Next steps: Apply recommendations before model training!")
