In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

# Set visualization parameters
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12
%matplotlib inline

# Display all columns
pd.set_option('display.max_columns', None)

# Set random seed for reproducibility
np.random.seed(42)

# Define path to the dataset
file_path = "../data/raw/kenyan_loan_default_dataset.csv"

# Load the raw dataset
df_raw = pd.read_csv(file_path)

# Display the first few rows
print(f"Dataset shape: {df_raw.shape}")
df_raw.head()

In [None]:
# Work on a copy so df_raw stays untouched and this cell can be re-run safely
df = df_raw.copy()

# Missing-value audit: count and percentage of nulls per column
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentage
})

# Keep only columns that actually have gaps, worst first
missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values('Percentage (%)', ascending=False)
print("Columns with Missing Values:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(missing_df)

# Numerical columns: fill gaps with the column median
# NOTE(review): this also imputes any numeric identifier/target column that has
# missing values — confirm that is intended.
numerical_cols = df.select_dtypes(include='number').columns
for col in numerical_cols:
    if df[col].isnull().any():
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)
        print(f"Filled missing values in {col} with median: {median_value}")

# Categorical columns: fill gaps with the most frequent value
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        if modes.empty:
            # An all-missing column has no mode; indexing [0] would raise IndexError
            print(f"Skipped {col}: no non-missing values to derive a mode from")
            continue
        mode_value = modes[0]
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in {col} with mode: {mode_value}")

# Verify all missing values are handled
print("\nRemaining missing values:")
print(df.isnull().sum().sum())

In [None]:
# Function to detect and handle outliers using IQR method
def handle_outliers(df, column, method='cap'):
    """
    Detect and handle outliers in a column using the 1.5 * IQR rule.

    The input DataFrame is never modified; a new DataFrame is returned for
    both methods.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the column
    column : str
        The column name to process
    method : str, default='cap'
        The method to handle outliers ('cap' or 'remove')

    Returns:
    --------
    df : pandas DataFrame
        A DataFrame with outliers handled. When the IQR is zero the data is
        returned unchanged, because the bounds would collapse to a single
        value and every differing observation would be overwritten/removed.

    Raises:
    -------
    ValueError
        If `method` is neither 'cap' nor 'remove'.
    """
    if method not in ('cap', 'remove'):
        raise ValueError(f"method must be 'cap' or 'remove', got {method!r}")

    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count outliers (missing values never compare as outliers)
    outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
    outliers_percent = (outliers / len(df)) * 100 if len(df) else 0.0
    print(f"{column}: {outliers} outliers detected ({outliers_percent:.2f}%)")

    if IQR == 0:
        # Degenerate spread (e.g. a zero-inflated column): leave the data alone
        print(f"{column}: IQR is 0, outlier handling skipped")
        return df.copy()

    if method == 'cap':
        # Cap the outliers instead of removing; work on a copy so the caller's
        # frame is not mutated as a side effect
        result = df.copy()
        result[column] = result[column].clip(lower=lower_bound, upper=upper_bound)
        return result

    # method == 'remove': keep only rows inside the bounds (be careful with this approach)
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Columns that receive IQR-based capping
numerical_columns_for_outlier_treatment = [
    'age', 'monthly_income_kes', 'mobile_money_usage', 'loan_amount_kes', 
    'interest_rate', 'days_late', 'airtime_topup_frequency'
]

# Cap each column in turn, carrying the treated frame forward
for col in numerical_columns_for_outlier_treatment:
    df = handle_outliers(df, col, method='cap')

# One histogram (with KDE overlay) per treated column on a 3x3 grid
plt.figure(figsize=(15, 10))
for panel, col in enumerate(numerical_columns_for_outlier_treatment, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col} (after outlier treatment)')
plt.tight_layout()
plt.show()

In [None]:
# First, let's identify all categorical columns
categorical_columns = df.select_dtypes(include=['object', 'bool']).columns.tolist()
print(f"Categorical columns to encode: {categorical_columns}")

# Make a copy of the dataframe before encoding (keeps this cell re-runnable)
df_encoded = df.copy()

# Encode binary categorical variables
binary_cols = ['group_loan']
binary_mapping = {'Yes': 1, 'No': 0}
for col in binary_cols:
    if col in df_encoded.columns and df_encoded[col].dtype == 'object':
        # Normalise whitespace/case so variants such as ' yes' or 'NO' still map
        normalised = df_encoded[col].str.strip().str.capitalize()
        mapped = normalised.map(binary_mapping)

        # Series.map turns unknown labels into NaN without any error; report them
        unmapped = df_encoded.loc[mapped.isna() & df_encoded[col].notna(), col].unique()
        if len(unmapped) > 0:
            print(f"WARNING: {col} has values outside Yes/No that became NaN: {list(unmapped)}")

        df_encoded[col] = mapped
        print(f"Binary encoded {col}")

# One-hot encoding for categorical variables with multiple categories
# We'll exclude 'loan_id' and 'customer_id' as they are identifiers
# Note: drop_first=True removes the first level of every variable, so that
# level has no dummy column of its own in df_encoded.
categorical_to_encode = [col for col in categorical_columns if col not in ['loan_id', 'customer_id', 'group_loan']]
df_encoded = pd.get_dummies(df_encoded, columns=categorical_to_encode, drop_first=True)

# Display the encoded dataframe
print(f"\nShape after encoding: {df_encoded.shape}")
df_encoded.head()

In [None]:
# Create financial ratio features based on EDA insights

def _safe_ratio(numerator, denominator):
    """Divide two Series; zero denominators are treated as missing and the
    resulting gaps are filled with the median ratio (mirrors the median
    imputation used for numeric columns earlier in the notebook)."""
    ratio = numerator / denominator.where(denominator != 0)
    return ratio.fillna(ratio.median())

# Loan-to-Income Ratio (higher ratio might indicate higher risk)
df_encoded['loan_to_income_ratio'] = _safe_ratio(
    df_encoded['loan_amount_kes'], df_encoded['monthly_income_kes']
)

# Monthly payment estimate (simplified calculation): principal plus flat interest, spread over the term
total_repayable = df_encoded['loan_amount_kes'] * (1 + df_encoded['interest_rate'] / 100)
df_encoded['estimated_monthly_payment'] = _safe_ratio(total_repayable, df_encoded['loan_term_months'])

# Debt-to-Income Ratio (estimated monthly payment to monthly income)
df_encoded['debt_to_income_ratio'] = _safe_ratio(
    df_encoded['estimated_monthly_payment'], df_encoded['monthly_income_kes']
)

# Mobile Money to Income Ratio
df_encoded['mobile_money_to_income_ratio'] = _safe_ratio(
    df_encoded['mobile_money_usage'], df_encoded['monthly_income_kes']
)

# Repayment Rate (repayment progress per day late; +1 avoids division by zero)
df_encoded['repayment_rate'] = df_encoded['repayment_progress'] / (df_encoded['days_late'] + 1)

# Interest Burden (total interest as a fraction of the loan).
# The loan amount cancels out of (loan * rate/100 * term/12) / loan, so it is
# left out; this avoids a 0/0 NaN for zero-amount loans.
df_encoded['interest_burden'] = df_encoded['interest_rate'] / 100 * df_encoded['loan_term_months'] / 12

# Risk Factor (weighted combination of key risk indicators)
df_encoded['risk_factor'] = (0.3 * df_encoded['loan_to_income_ratio'] + 
                           0.3 * df_encoded['num_defaults'] + 
                           0.2 * (df_encoded['days_late'] / 30) +  # Normalize to months
                           0.2 * (1 - df_encoded['repayment_progress']/100))  # Convert to 0-1 scale

# Display summary of new ratio features
ratio_features = ['loan_to_income_ratio', 'estimated_monthly_payment', 'debt_to_income_ratio', 
                 'mobile_money_to_income_ratio', 'repayment_rate', 'interest_burden', 'risk_factor']

print("Summary statistics for new ratio features:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(df_encoded[ratio_features].describe().T)

# Visualize the relationship between ratios and default status
plt.figure(figsize=(15, 10))
for i, feat in enumerate(ratio_features):
    plt.subplot(3, 3, i+1)
    sns.boxplot(x='defaulted', y=feat, data=df_encoded)
    plt.title(f'{feat} by Default Status')
plt.tight_layout()
plt.show()

In [None]:
# Create features related to mobile money usage and financial behavior

activity_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Drop dummy columns left by a previous run of this cell; otherwise
# get_dummies below would append a second, identically named set
df_encoded = df_encoded.drop(
    columns=[f'mm_activity_{label}' for label in activity_labels], errors='ignore'
)

# Mobile Money Activity Level (categorical based on usage quintiles)
df_encoded['mobile_money_activity'] = pd.qcut(
    df_encoded['mobile_money_usage'], 
    q=5, 
    labels=activity_labels
)

# Mobile Money to Airtime Ratio (+1 avoids division by zero)
df_encoded['mobile_money_to_airtime_ratio'] = df_encoded['mobile_money_usage'] / (df_encoded['airtime_topup_frequency'] + 1)

# Financial Activity Index (combined measure of mobile and airtime usage,
# each scaled by its column maximum and weighted 0.7 / 0.3)
df_encoded['financial_activity_index'] = (
    (df_encoded['mobile_money_usage'] / df_encoded['mobile_money_usage'].max()) * 0.7 + 
    (df_encoded['airtime_topup_frequency'] / df_encoded['airtime_topup_frequency'].max()) * 0.3
)

# Convert 'mobile_money_activity' back to numerical for modeling
df_encoded = pd.get_dummies(df_encoded, columns=['mobile_money_activity'], prefix='mm_activity')

# Display the new behavioral features
behavioral_features = ['mobile_money_to_airtime_ratio', 'financial_activity_index'] + [col for col in df_encoded.columns if 'mm_activity_' in col]
print("Summary of new behavioral features:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(df_encoded[behavioral_features[:2]].describe().T)

# Visualize the relationship with default status
plt.figure(figsize=(12, 6))
for i, feat in enumerate(behavioral_features[:2]):  # Plot just the continuous features
    plt.subplot(1, 2, i+1)
    sns.boxplot(x='defaulted', y=feat, data=df_encoded)
    plt.title(f'{feat} by Default Status')
plt.tight_layout()
plt.show()

In [None]:
# Create features based on loan history and borrower characteristics

experience_labels = ['First-time', 'Beginner', 'Intermediate', 'Experienced', 'Expert']
severity_labels = ['Clean', 'Low Risk', 'Medium Risk', 'High Risk']

# Drop dummy columns left by a previous run of this cell; otherwise
# get_dummies below would append a second, identically named set
stale_dummies = ([f'loan_exp_{label}' for label in experience_labels] +
                 [f'default_hist_{label}' for label in severity_labels])
df_encoded = df_encoded.drop(columns=stale_dummies, errors='ignore')

# Default Rate (ratio of defaults to previous loans)
df_encoded['default_rate'] = df_encoded['num_defaults'] / (df_encoded['num_previous_loans'] + 1)  # Add 1 to avoid division by zero

# Loan Experience Level
df_encoded['loan_experience'] = pd.cut(
    df_encoded['num_previous_loans'],
    bins=[-1, 0, 2, 5, 10, float('inf')],
    labels=experience_labels
)

# Default History Severity
# The top bin is open-ended: default_rate exceeds 1.0 whenever num_defaults is
# larger than num_previous_loans + 1, and a closed top edge of 1.0 would leave
# those rows without any severity category.
df_encoded['default_history_severity'] = pd.cut(
    df_encoded['default_rate'],
    bins=[-0.001, 0.001, 0.2, 0.5, float('inf')],  # -0.001 to include 0
    labels=severity_labels
)

# Current Loan Progress Index
# NOTE(review): this is the same formula as `repayment_rate` created in the
# ratio-feature cell, so the two columns are identical — confirm whether both
# are needed.
df_encoded['loan_progress_index'] = df_encoded['repayment_progress'] / (df_encoded['days_late'] + 1)

# Convert categorical features to dummy variables
df_encoded = pd.get_dummies(df_encoded, columns=['loan_experience', 'default_history_severity'], prefix=['loan_exp', 'default_hist'])

# Display the new history-based features
history_features = ['default_rate', 'loan_progress_index'] + [col for col in df_encoded.columns if 'loan_exp_' in col or 'default_hist_' in col]
print("Summary of new history-based features:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(df_encoded[history_features[:2]].describe().T)

# Visualize the relationship with default status
plt.figure(figsize=(12, 6))
for i, feat in enumerate(history_features[:2]):  # Plot just the continuous features
    plt.subplot(1, 2, i+1)
    sns.boxplot(x='defaulted', y=feat, data=df_encoded)
    plt.title(f'{feat} by Default Status')
plt.tight_layout()
plt.show()

In [None]:
# Create features based on demographic and socioeconomic characteristics

age_labels = ['18-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-60', '60+']
income_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Drop dummy columns left by a previous run of this cell. Exact names are used
# (not a prefix match) because other features also start with 'age_'/'income_'.
stale_dummies = ([f'age_{label}' for label in age_labels] +
                 [f'income_{label}' for label in income_labels])
df_encoded = df_encoded.drop(columns=stale_dummies, errors='ignore')

# Age Groups (more granular than typical binning)
# include_lowest=True closes the first interval on the left; without it an age
# of exactly 18 falls outside (18, 25] and gets no age group at all.
# Ages below 18 or above 100 still receive no group.
df_encoded['age_group'] = pd.cut(
    df_encoded['age'],
    bins=[18, 25, 30, 35, 40, 45, 50, 60, 100],
    labels=age_labels,
    include_lowest=True
)

# Income Level (quintiles)
df_encoded['income_level'] = pd.qcut(
    df_encoded['monthly_income_kes'],
    q=5,
    labels=income_labels
)

# Socioeconomic Score (combined measure of income, education, and employment)
# Identify education and employment columns produced by one-hot encoding
education_cols = [col for col in df_encoded.columns if 'education_level_' in col]
employment_cols = [col for col in df_encoded.columns if 'employment_status_' in col]

def _level_score(raw_column, dummy_weights):
    """Return a per-row ordinal score for a categorical variable.

    `dummy_weights` maps one-hot column names ('<raw_column>_<level>') to weights.
    When the pre-encoding column is still available in `df`, levels are scored
    from it directly, so the level removed by drop_first=True is scored too.
    Otherwise the score is summed from whichever dummy columns exist.
    Levels without a weight score 0; the result always exists, even when no
    matching column is found.
    """
    if raw_column in df.columns:
        level_weights = {name[len(raw_column) + 1:]: weight for name, weight in dummy_weights.items()}
        return df[raw_column].map(level_weights).fillna(0)

    score = pd.Series(0, index=df_encoded.index)
    for col, weight in dummy_weights.items():
        if col in df_encoded.columns:
            score = score + df_encoded[col] * weight
    return score

# Education score (simple method: weighting different levels; example weights)
# NOTE(review): level names are taken from these keys — verify against the data.
education_weights = {
    'education_level_Tertiary': 3,
    'education_level_Secondary': 2,
    'education_level_Primary': 1,
    'education_level_None': 0
}
df_encoded['education_score'] = _level_score('education_level', education_weights)

# Employment stability score (example weights)
employment_weights = {
    'employment_status_Formal': 3,
    'employment_status_Self-employed': 2,
    'employment_status_Informal': 1,
    'employment_status_Unemployed': 0
}
df_encoded['employment_score'] = _level_score('employment_status', employment_weights)

# Normalize income to 0-1 for the socioeconomic score
income_min = df_encoded['monthly_income_kes'].min()
income_max = df_encoded['monthly_income_kes'].max()
df_encoded['income_score'] = (df_encoded['monthly_income_kes'] - income_min) / (income_max - income_min)

# Calculate socioeconomic score (weighted combination)
df_encoded['socioeconomic_score'] = (
    df_encoded['income_score'] * 0.5 + 
    df_encoded['education_score'] / 3 * 0.3 +  # Normalize to 0-1
    df_encoded['employment_score'] / 3 * 0.2    # Normalize to 0-1
)

# Convert categorical features to dummy variables
df_encoded = pd.get_dummies(df_encoded, columns=['age_group', 'income_level'], prefix=['age', 'income'])

# Display the new demographic features
demographic_features = ['education_score', 'employment_score', 'income_score', 'socioeconomic_score']
print("Summary of new demographic features:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(df_encoded[demographic_features].describe().T)

# Visualize the relationship with default status
plt.figure(figsize=(15, 8))
for i, feat in enumerate(demographic_features):
    plt.subplot(2, 2, i+1)
    sns.boxplot(x='defaulted', y=feat, data=df_encoded)
    plt.title(f'{feat} by Default Status')
plt.tight_layout()
plt.show()

In [None]:
# Create interaction features between key variables

# Interaction between loan amount and interest rate
df_encoded['loan_amount_interest_interaction'] = df_encoded['loan_amount_kes'] * df_encoded['interest_rate']

# Interaction between income and previous defaults: income discounted by (defaults + 1)
df_encoded['income_defaults_interaction'] = df_encoded['monthly_income_kes'] * (1 / (df_encoded['num_defaults'] + 1))

# Interaction between mobile money usage and loan amount (+1 avoids division by zero)
df_encoded['mobile_money_loan_interaction'] = df_encoded['mobile_money_usage'] / (df_encoded['loan_amount_kes'] + 1)

# Interaction between loan term and interest rate
df_encoded['term_interest_interaction'] = df_encoded['loan_term_months'] * df_encoded['interest_rate']

# Interaction between age and loan amount
df_encoded['age_loan_interaction'] = df_encoded['age'] * df_encoded['loan_amount_kes']

# Interaction between socioeconomic score and loan-to-income ratio
df_encoded['socio_lti_interaction'] = df_encoded['socioeconomic_score'] * df_encoded['loan_to_income_ratio']

# Display the new interaction features
interaction_features = [
    'loan_amount_interest_interaction', 'income_defaults_interaction', 
    'mobile_money_loan_interaction', 'term_interest_interaction',
    'age_loan_interaction', 'socio_lti_interaction'
]
print("Summary of new interaction features:")
# Printed explicitly: a bare expression in the middle of a cell is never rendered
print(df_encoded[interaction_features].describe().T)

# Visualize the relationship with default status
plt.figure(figsize=(15, 10))
for i, feat in enumerate(interaction_features):
    plt.subplot(3, 2, i+1)
    sns.boxplot(x='defaulted', y=feat, data=df_encoded)
    plt.title(f'{feat} by Default Status')
plt.tight_layout()
plt.show()

In [None]:
# Analyze correlation between features and target
# First, let's create a subset without non-numeric columns and identifiers
numeric_df = df_encoded.select_dtypes(include=['int64', 'float64'])
# `columns=` is required: a bare list is interpreted as row labels, and with
# errors='ignore' the identifiers would silently stay in the frame
numeric_df = numeric_df.drop(columns=['loan_id', 'customer_id'], errors='ignore')

# Calculate correlation with target, strongest positive first.
# Constant columns produce NaN correlations; they are dropped so they cannot
# end up in the "top negative" tail of the sorted series.
target_correlation = numeric_df.corr()['defaulted'].dropna().sort_values(ascending=False)

# Remove the target by label rather than by position before picking the extremes
feature_correlation = target_correlation.drop('defaulted')
top_n = 15
top_pos_corr = feature_correlation.head(top_n)
top_neg_corr = feature_correlation.tail(top_n)

# Plot top positive and negative correlations
plt.figure(figsize=(12, 10))

# Top positive correlations
plt.subplot(2, 1, 1)
sns.barplot(x=top_pos_corr.values, y=top_pos_corr.index)
plt.title(f'Top {top_n} Positive Correlations with Default Status')
plt.xlabel('Correlation Coefficient')

# Top negative correlations
plt.subplot(2, 1, 2)
sns.barplot(x=top_neg_corr.values, y=top_neg_corr.index)
plt.title(f'Top {top_n} Negative Correlations with Default Status')
plt.xlabel('Correlation Coefficient')

plt.tight_layout()
plt.show()

# Display top correlated features
print("Top positively correlated features:")
print(top_pos_corr)
print("\nTop negatively correlated features:")
print(top_neg_corr)

In [None]:
# Check for multicollinearity among features using Variance Inflation Factor (VIF)
# We'll select a subset of important features to avoid computational issues

# Engineered features we always want to inspect; listed first so the size cap
# below cannot push them out
engineered_features = [
    'loan_to_income_ratio', 'debt_to_income_ratio', 'mobile_money_to_income_ratio',
    'risk_factor', 'default_rate', 'socioeconomic_score'
]

# Most correlated features (both directions). top_neg_corr is sorted in
# descending order, so the strongest negative correlations are at its tail.
most_positive = list(top_pos_corr.head(10).index)
most_negative = list(top_neg_corr.tail(10).index)

# Order-preserving de-duplication: a set would make the truncation below pick
# a different subset of features from one kernel session to the next
candidates = engineered_features + most_positive + most_negative
important_features = [
    feat for feat in dict.fromkeys(candidates)
    if feat != 'defaulted' and feat in numeric_df.columns
]

# Limit to a reasonable number of features to avoid computational issues
important_features = important_features[:20]

# Create a DataFrame with selected features; rows with non-finite values are
# dropped because the underlying regressions cannot be fitted on them
X_vif = numeric_df[important_features].replace([np.inf, -np.inf], np.nan).dropna()

# Add a constant so each VIF is computed with an intercept
X_vif_const = sm.add_constant(X_vif)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif_const.values, i) for i in range(X_vif_const.shape[1])]

# Sort by VIF value
vif_data = vif_data.sort_values("VIF", ascending=False)

# Display VIF values
print("Variance Inflation Factor (VIF) for selected features:")
print(vif_data)

# Plot VIF values
plt.figure(figsize=(12, 8))
sns.barplot(x='VIF', y='Feature', data=vif_data)
plt.title('Variance Inflation Factor (VIF) for Selected Features')
plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10 (Common threshold)')
plt.axvline(x=5, color='orange', linestyle='--', label='VIF = 5 (Strict threshold)')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Use Random Forest to identify important features

# Prepare data for the model (identifiers and the target are not predictors)
X = df_encoded.drop(['loan_id', 'customer_id', 'defaulted'], axis=1, errors='ignore')
y = df_encoded['defaulted']

# Split the data; stratify so both splits keep the same default rate
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Sanity-check the model on the held-out split before trusting its importances
# NOTE(review): features built from days_late / repayment_progress may encode
# the outcome itself; a near-perfect score here would suggest target leakage — confirm.
holdout_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
print(f"Hold-out ROC AUC: {holdout_auc:.3f}")

# Get feature importances
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot the top features
plt.figure(figsize=(12, 10))
top_features = feature_importance.head(20)
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.title('Top 20 Feature Importance from Random Forest')
plt.tight_layout()
plt.show()

# Display top important features
print("Top 20 Most Important Features from Random Forest:")
print(feature_importance.head(20))

In [None]:
# Use statistical tests to select features

# SelectKBest raises when k exceeds the number of available columns
k_best = min(20, X.shape[1])

# ANOVA F-value for classification
print("ANOVA F-value feature selection:")
selector_f = SelectKBest(f_classif, k=k_best)
X_selected_f = selector_f.fit_transform(X, y)
f_scores = pd.DataFrame({
    'Feature': X.columns,
    'F-Score': selector_f.scores_,
    'P-value': selector_f.pvalues_
}).sort_values('F-Score', ascending=False)

print(f_scores.head(20))

# Mutual Information for feature selection
# mutual_info_classif is stochastic; passing random_state keeps the scores
# identical when this cell is re-run on its own.
print("\nMutual Information feature selection:")
selector_mi = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=k_best
)
X_selected_mi = selector_mi.fit_transform(X, y)
mi_scores = pd.DataFrame({
    'Feature': X.columns,
    'MI-Score': selector_mi.scores_,
}).sort_values('MI-Score', ascending=False)

print(mi_scores.head(20))

# Visualize the selected features
plt.figure(figsize=(15, 10))

plt.subplot(2, 1, 1)
sns.barplot(x='F-Score', y='Feature', data=f_scores.head(20))
plt.title('Top 20 Features by ANOVA F-Score')

plt.subplot(2, 1, 2)
sns.barplot(x='MI-Score', y='Feature', data=mi_scores.head(20))
plt.title('Top 20 Features by Mutual Information')

plt.tight_layout()
plt.show()

In [None]:
# Scale the numerical features

# Define all numeric features (identifiers and the target are left unscaled)
numeric_features = df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features = [feat for feat in numeric_features if feat not in ['loan_id', 'customer_id', 'defaulted']]

# Create scaled versions of the data with different scalers
df_scaled_standard = df_encoded.copy()
df_scaled_minmax = df_encoded.copy()
df_scaled_robust = df_encoded.copy()

# Standard Scaling (Z-score normalization)
scaler_standard = StandardScaler()
df_scaled_standard[numeric_features] = scaler_standard.fit_transform(df_encoded[numeric_features])

# Min-Max Scaling (0-1 normalization)
scaler_minmax = MinMaxScaler()
df_scaled_minmax[numeric_features] = scaler_minmax.fit_transform(df_encoded[numeric_features])

# Robust Scaling (less sensitive to outliers)
scaler_robust = RobustScaler()
df_scaled_robust[numeric_features] = scaler_robust.fit_transform(df_encoded[numeric_features])

# Compare distributions of original and scaled data for a few key features
key_features_to_compare = [
    'loan_to_income_ratio', 'monthly_income_kes', 'mobile_money_usage', 
    'risk_factor', 'socioeconomic_score'
]

# Create a long-format dataframe for easier plotting.
# One small frame per (feature, scaling) pair is concatenated once, instead of
# appending one Python dict per individual value.
frames_by_scaling = {
    'Original': df_encoded,
    'Standard': df_scaled_standard,
    'Min-Max': df_scaled_minmax,
    'Robust': df_scaled_robust,
}
comparison_df = pd.concat(
    [
        pd.DataFrame({'Feature': feature, 'Scaling': scaling, 'Value': frame[feature].to_numpy()})
        for feature in key_features_to_compare
        for scaling, frame in frames_by_scaling.items()
    ],
    ignore_index=True,
)

# Visualize the distributions: one row of boxplots per feature
plt.figure(figsize=(15, 15))
for i, feature in enumerate(key_features_to_compare):
    plt.subplot(len(key_features_to_compare), 1, i+1)
    feature_data = comparison_df[comparison_df['Feature'] == feature]
    sns.boxplot(x='Scaling', y='Value', data=feature_data)
    plt.title(f'Distribution of {feature} with Different Scaling Methods')
plt.tight_layout()
plt.show()

In [None]:
# Identify skewed features
skewness = df_encoded[numeric_features].skew().sort_values(ascending=False)
print("Skewness of numeric features:")
print(skewness)

# Select highly skewed features for log transformation
highly_skewed = skewness[abs(skewness) > 2].index.tolist()
print(f"\nHighly skewed features (|skew| > 2):")
print(highly_skewed)

# Apply log transformation to highly skewed features
# log1p(x) = log(1 + x) handles zeros, but is undefined for x <= -1; such
# features are skipped instead of being filled with NaN / -inf
df_log_transformed = df_encoded.copy()
log_transformed_features = []
for feature in highly_skewed:
    if df_encoded[feature].min() <= -1:
        print(f"Skipping log transform for {feature}: contains values <= -1")
        continue
    df_log_transformed[f'{feature}_log'] = np.log1p(df_log_transformed[feature])
    log_transformed_features.append(feature)

# Compare original and log-transformed distributions
if log_transformed_features:
    n_rows = len(log_transformed_features)
    plt.figure(figsize=(15, n_rows * 5))
    for i, feature in enumerate(log_transformed_features):
        # Original distribution
        plt.subplot(n_rows, 2, 2*i+1)
        sns.histplot(df_encoded[feature], kde=True)
        plt.title(f'Original {feature} (Skewness: {skewness[feature]:.2f})')

        # Log-transformed distribution
        plt.subplot(n_rows, 2, 2*i+2)
        sns.histplot(df_log_transformed[f'{feature}_log'], kde=True)
        log_skewness = df_log_transformed[f'{feature}_log'].skew()
        plt.title(f'Log-transformed {feature} (Skewness: {log_skewness:.2f})')

    plt.tight_layout()
    plt.show()
else:
    # A figure with zero height cannot be created, so there is nothing to draw
    print("No features were log-transformed; nothing to plot.")

In [None]:
# Dimensionality reduction with PCA on the standardized numeric features
X_pca = df_scaled_standard[numeric_features].copy()

# Fit PCA, keeping enough components to capture 95% of the variance
pca = PCA(n_components=0.95)
X_pca_transformed = pca.fit_transform(X_pca)

# Per-component and running share of explained variance
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
n_components = len(explained_variance)
component_positions = range(1, n_components + 1)

# Explained-variance chart: bars for each component, step line for the running total
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(component_positions, explained_variance, alpha=0.7, label='Individual')
ax.step(component_positions, cumulative_variance, where='mid', label='Cumulative')
ax.axhline(y=0.95, color='r', linestyle='-', alpha=0.5, label='95% Variance Threshold')
ax.set_ylabel('Explained Variance Ratio')
ax.set_xlabel('Principal Components')
ax.set_title('PCA Explained Variance')
ax.legend()
fig.tight_layout()
plt.show()

# Component scores as a dataframe, with the target attached positionally
pc_names = [f'PC{k}' for k in range(1, X_pca_transformed.shape[1] + 1)]
pca_df = pd.DataFrame(data=X_pca_transformed, columns=pc_names)
pca_df['defaulted'] = df_encoded['defaulted'].values

# Scatter of the first two components, coloured by default status
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='defaulted', data=pca_df,
                palette=['#2ecc71', '#e74c3c'], alpha=0.7, ax=ax)
ax.set_title('First Two Principal Components')
fig.tight_layout()
plt.show()

# Component loadings are only listed when the component count is manageable
if n_components <= 10:
    component_loadings = pd.DataFrame(
        pca.components_.T,
        columns=[f'PC{k}' for k in range(1, n_components + 1)],
        index=numeric_features
    )

    print("Top feature loadings for each principal component:")
    # At most the first five components, ten largest absolute loadings each
    for i in range(min(5, n_components)):
        print(f"\nPrincipal Component {i+1} (Explains {explained_variance[i]*100:.2f}% of variance):")
        loadings = component_loadings[f'PC{i+1}'].abs().sort_values(ascending=False)
        print(loadings.head(10))

In [None]:
# Combine insights from different feature selection methods

# Function to rank features based on multiple selection methods
def get_feature_ranking(dataframes, rank_column_names, reverse_order=None):
    """
    Combine rankings from multiple feature selection methods.

    Within every method the best feature receives rank 1, so a lower
    Mean_Rank is always better.

    Parameters:
    -----------
    dataframes : list of pandas DataFrames
        Each DataFrame should contain feature selection results, with a
        'Feature' column and the score column named in `rank_column_names`
    rank_column_names : list of str
        Name of the ranking column in each DataFrame
    reverse_order : list of bool, optional
        Whether to reverse the order for each method (True for methods where
        lower is better). Defaults to False (higher is better) for all methods.

    Returns:
    --------
    pandas DataFrame
        One row per feature with a 'Rank_<column>' column per method and the
        'Mean_Rank', sorted by Mean_Rank ascending (best first). A feature
        missing from a method, or with a NaN score, gets that method's worst
        rank + 1.
    """
    if reverse_order is None:
        reverse_order = [False] * len(dataframes)

    # Ordered union of all features; a set would make the order of tied
    # features differ from one interpreter session to the next
    all_features = list(dict.fromkeys(
        feature for frame in dataframes for feature in frame['Feature'].values
    ))

    # Create a dataframe to store ranks
    ranks_df = pd.DataFrame({'Feature': all_features})

    # Add ranks from each method
    for frame, col_name, lower_is_better in zip(dataframes, rank_column_names, reverse_order):
        rank_col = f'Rank_{col_name}'
        scored = frame[['Feature', col_name]].copy()
        # rank(ascending=True) gives rank 1 to the smallest score, which is
        # only correct when lower is better; otherwise rank descending so the
        # largest score gets rank 1
        scored[rank_col] = scored[col_name].rank(ascending=lower_is_better)

        # Merge with the ranks dataframe
        ranks_df = ranks_df.merge(scored[['Feature', rank_col]], on='Feature', how='left')

    # Fill NaN with worst rank + 1
    rank_columns = [col for col in ranks_df.columns if col.startswith('Rank_')]
    for col in rank_columns:
        worst_rank = ranks_df[col].max()
        ranks_df[col] = ranks_df[col].fillna(worst_rank + 1)

    # Calculate mean rank
    ranks_df['Mean_Rank'] = ranks_df[rank_columns].mean(axis=1)

    # Sort by mean rank; a stable sort keeps ties in first-seen order
    ranks_df = ranks_df.sort_values('Mean_Rank', kind='stable')

    return ranks_df

# Prepare DataFrames for ranking
# 1. Correlation with target (absolute value)
# The target and identifiers are removed first: the target always correlates
# perfectly with itself and would otherwise head this list. corrwith computes
# only the feature-vs-target column instead of the full correlation matrix.
feature_columns = numeric_df.drop(columns=['defaulted', 'loan_id', 'customer_id'], errors='ignore')
abs_target_correlation = feature_columns.corrwith(numeric_df['defaulted']).abs()
correlation_df = pd.DataFrame({
    'Feature': abs_target_correlation.index,
    'Correlation': abs_target_correlation.values
}).sort_values('Correlation', ascending=False)

In [None]:
# Merge the four selection methods into a single ranking
ranking_sources = [correlation_df, feature_importance, f_scores, mi_scores]
score_columns = ['Correlation', 'Importance', 'F-Score', 'MI-Score']
# Higher is better for all methods, so no method is flagged as reversed
reverse_flags = [False] * len(ranking_sources)
feature_ranks = get_feature_ranking(ranking_sources, score_columns, reverse_flags)

# Show the 30 best-placed features
print("Top 30 Features Based on Combined Ranking:")
print(feature_ranks.head(30))

# Horizontal bar chart of the first 20 rows of the ranking
top_features_to_plot = feature_ranks.head(20)
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(y='Feature', x='Mean_Rank', data=top_features_to_plot, ax=ax)
ax.set_title('Top 20 Features by Combined Ranking')
ax.set_xlabel('Mean Rank (Lower is Better)')
fig.tight_layout()
plt.show()

In [None]:
# Get top 30 features from ranking
# The target is filtered out before taking the top 30 so it cannot occupy one
# of the slots and shrink the selection
ranked_feature_names = feature_ranks.loc[feature_ranks['Feature'] != 'defaulted', 'Feature']
top_ranked_features = ranked_feature_names.head(30).tolist()

# Add important domain-specific features that might not be in the top 30
domain_specific_features = [
    'loan_to_income_ratio', 'debt_to_income_ratio', 
    'mobile_money_to_income_ratio', 'default_rate',
    'socioeconomic_score', 'risk_factor'
]

# Combine and remove duplicates while preserving order; a set would change
# the column order of the saved datasets from one run to the next
final_features = list(dict.fromkeys(top_ranked_features + domain_specific_features))

# Ensure 'defaulted' is not in the feature list
if 'defaulted' in final_features:
    final_features.remove('defaulted')

# Calculate number of features
print(f"Number of features in final set: {len(final_features)}")
print("\nFinal Feature Set:")
print(final_features)

# Create the final dataset for modeling (selected features plus the target)
final_df = df_encoded[final_features + ['defaulted']].copy()

# Display sample of the final dataset
print("\nSample of Final Dataset:")
final_df.head()

In [None]:
# 1. Full dataset with all engineered features
processed_path_full = "../data/processed/kenyan_loan_default_engineered_full.csv"

# to_csv does not create missing folders; make sure the output directory
# exists so a fresh checkout does not fail here
os.makedirs(os.path.dirname(processed_path_full), exist_ok=True)

df_encoded.to_csv(processed_path_full, index=False)
print(f"Full dataset with all engineered features saved to: {processed_path_full}")

# 2. Selected feature dataset
processed_path_selected = "../data/processed/kenyan_loan_default_engineered_selected.csv"
final_df.to_csv(processed_path_selected, index=False)
print(f"Selected feature dataset saved to: {processed_path_selected}")

# 3. Scaled dataset (Standard scaling), restricted to the selected features plus the target
processed_path_scaled = "../data/processed/kenyan_loan_default_engineered_scaled.csv"
df_scaled_standard[final_features + ['defaulted']].to_csv(processed_path_scaled, index=False)
print(f"Scaled dataset saved to: {processed_path_scaled}")

# 4. PCA transformed dataset
processed_path_pca = "../data/processed/kenyan_loan_default_pca.csv"
pca_df.to_csv(processed_path_pca, index=False)
print(f"PCA transformed dataset saved to: {processed_path_pca}")