# Loan Eligibility Prediction Model Training

This notebook demonstrates the training and evaluation of machine learning models for predicting loan eligibility.

In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, roc_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# NOTE(review): with no category or module filter this suppresses every
# warning raised for the rest of the session, so library warnings emitted by
# later cells will not be shown.
warnings.filterwarnings('ignore')

# Set style for better plots
# The palette is set after the style sheet is applied; keep this order unless
# the interaction between the two calls has been checked.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Generate Synthetic Training Data

In [None]:
def generate_synthetic_data(n_samples=5000, seed=42):
    """Generate synthetic loan application data.

    Each record draws demographics, incomes and loan terms at random, then
    assigns ``Loan_Status`` by sampling against an approval probability built
    from the income-to-loan ratio, credit history and education.

    Args:
        n_samples: Number of applications to generate. ``0`` yields an empty
            frame that still carries every column.
        seed: Seed for the function's private random generator. The default
            reproduces the stream of ``np.random.seed(42)``.

    Returns:
        pandas.DataFrame with one row per application: eleven feature columns
        plus the target column ``Loan_Status`` ('Y' or 'N').
    """
    # A private legacy generator yields the same stream as seeding the global
    # NumPy RNG, but leaves the process-wide random state untouched so calling
    # this function has no hidden effect on other cells.
    rng = np.random.RandomState(seed)

    columns = [
        'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
    ]

    data = []
    for _ in range(n_samples):
        # NOTE: the order of the random draws below defines the generated
        # data; reordering them changes every sample.

        # Generate basic demographics
        gender = rng.choice(['Male', 'Female'])
        married = rng.choice(['Yes', 'No'])
        dependents = rng.randint(0, 4)
        education = rng.choice(['Graduate', 'Not Graduate'])
        self_employed = rng.choice(['Yes', 'No'])

        # Generate income and loan details
        applicant_income = rng.lognormal(10.5, 0.5)
        # Roughly 30% of applications have no co-applicant income.
        coapplicant_income = rng.lognormal(9.5, 0.8) if rng.random_sample() > 0.3 else 0
        loan_amount = rng.lognormal(12, 0.3)
        loan_amount_term = rng.choice([12, 24, 36, 60, 84, 120, 180, 240, 300, 360, 480])

        # Credit history (biased towards good credit)
        credit_history = rng.choice(['Yes', 'No'], p=[0.85, 0.15])

        # Property area
        property_area = rng.choice(['Urban', 'Semiurban', 'Rural'], p=[0.5, 0.3, 0.2])

        # Start from a base approval probability and adjust it per factor.
        approval_prob = 0.6

        # Income factor: a higher income relative to the loan helps approval.
        total_income = applicant_income + coapplicant_income
        income_to_loan_ratio = total_income / loan_amount
        if income_to_loan_ratio > 0.3:
            approval_prob += 0.2
        elif income_to_loan_ratio > 0.2:
            approval_prob += 0.1
        else:
            approval_prob -= 0.3

        # Credit history factor
        if credit_history == 'Yes':
            approval_prob += 0.3
        else:
            approval_prob -= 0.4

        # Education factor
        if education == 'Graduate':
            approval_prob += 0.1
        else:
            approval_prob -= 0.1

        # Clamp to a valid probability, then sample the final decision.
        approval_prob = max(0, min(1, approval_prob))
        loan_status = 'Y' if rng.random_sample() < approval_prob else 'N'

        data.append({
            'Gender': gender,
            'Married': married,
            'Dependents': dependents,
            'Education': education,
            'Self_Employed': self_employed,
            'ApplicantIncome': applicant_income,
            'CoapplicantIncome': coapplicant_income,
            'LoanAmount': loan_amount,
            'Loan_Amount_Term': loan_amount_term,
            'Credit_History': credit_history,
            'Property_Area': property_area,
            'Loan_Status': loan_status
        })

    # Passing the column list keeps the schema intact even when no rows exist.
    return pd.DataFrame(data, columns=columns)

# Build the synthetic dataset that every later section works from
print("Generating synthetic training data...")
df = generate_synthetic_data(n_samples=5000)
print(f"Generated {df.shape[0]} samples")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df.info()

print("\nTarget Distribution:")
status_counts = df['Loan_Status'].value_counts()
print(status_counts)

# The mean of the boolean mask is the approved share. Unlike indexing
# value_counts() with 'Y', it does not raise when no application is approved.
approval_rate = (df['Loan_Status'] == 'Y').mean() * 100
print(f"Approval Rate: {approval_rate:.2f}%")

In [None]:
# Bar chart of how many applications fall into each loan status
fig, ax = plt.subplots(figsize=(8, 6))
status_totals = df['Loan_Status'].value_counts()
status_totals.plot(kind='bar', color=['skyblue', 'lightcoral'], rot=0, ax=ax)
ax.set_title('Loan Status Distribution')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Count')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Share of approved vs. rejected applications within each category level
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for i, col in enumerate(categorical_cols):
    panel = axes[i]
    # normalize='index' turns each row into proportions that sum to 1
    status_share = pd.crosstab(df[col], df['Loan_Status'], normalize='index')
    status_share.plot(kind='bar', ax=panel)
    panel.set(title=f'Loan Approval Rate by {col}', xlabel=col, ylabel='Proportion')
    # crosstab columns are sorted, so 'N' (rejected) precedes 'Y' (approved)
    panel.legend(['Rejected', 'Approved'])
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Analyze numerical features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The derived columns are needed only for these plots, so they are built on a
# local frame. The shared `df` is left exactly as generated, which keeps this
# cell idempotent and free of side effects on later sections.
eda_df = df.assign(Total_Income=df['ApplicantIncome'] + df['CoapplicantIncome'])
eda_df = eda_df.assign(Income_Loan_Ratio=eda_df['Total_Income'] / eda_df['LoanAmount'])

# Box plots of one numeric column split by loan status: (axis, column, label)
boxplot_panels = [
    (axes[0, 0], 'ApplicantIncome', 'Applicant Income'),
    (axes[0, 1], 'LoanAmount', 'Loan Amount'),
    (axes[1, 1], 'Income_Loan_Ratio', 'Income-to-Loan Ratio'),
]
for ax, column, label in boxplot_panels:
    eda_df.boxplot(column=column, by='Loan_Status', ax=ax)
    ax.set_title(f'{label} by Loan Status')
    ax.set_xlabel('Loan Status')
    ax.set_ylabel(label)

# Total income vs loan amount, coloured by outcome
approved = eda_df[eda_df['Loan_Status'] == 'Y']
rejected = eda_df[eda_df['Loan_Status'] == 'N']

axes[1, 0].scatter(approved['Total_Income'], approved['LoanAmount'], alpha=0.6, label='Approved', color='green')
axes[1, 0].scatter(rejected['Total_Income'], rejected['LoanAmount'], alpha=0.6, label='Rejected', color='red')
axes[1, 0].set_xlabel('Total Income')
axes[1, 0].set_ylabel('Loan Amount')
axes[1, 0].set_title('Total Income vs Loan Amount')
axes[1, 0].legend()

# boxplot(by=...) adds a figure-level "Boxplot grouped by ..." title that
# collides with the per-panel titles; clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` extended with the engineered model features.

    The input frame is not modified. Columns that already exist under one of
    the engineered names are overwritten in the returned copy.

    Added columns:
        Total_Income: applicant plus co-applicant income.
        Income_Loan_Ratio: total income divided by the loan amount.
        Per_Capita_Income: total income divided by ``Dependents + 1``.
        Monthly_Loan_Amount: loan amount divided by the loan term.
        Income_Stability: 1 when ``Self_Employed`` is 'No', else 0.
        Family_Size: dependents, plus 1 if married, plus the applicant.
        Credit_History_Binary: 1 when ``Credit_History`` is 'Yes', else 0.
        Property_Area_Numeric: Rural=0, Semiurban=1, Urban=2 (NaN otherwise).
        Education_Numeric: 1 when ``Education`` is 'Graduate', else 0.
    """
    household_income = df['ApplicantIncome'] + df['CoapplicantIncome']
    is_married = (df['Married'] == 'Yes').astype(int)
    area_codes = {'Rural': 0, 'Semiurban': 1, 'Urban': 2}

    # assign() works on a copy and appends the new columns in keyword order.
    return df.assign(
        Total_Income=household_income,
        Income_Loan_Ratio=household_income / df['LoanAmount'],
        Per_Capita_Income=household_income / (df['Dependents'] + 1),
        Monthly_Loan_Amount=df['LoanAmount'] / df['Loan_Amount_Term'],
        Income_Stability=(df['Self_Employed'] == 'No').astype(int),
        Family_Size=df['Dependents'] + is_married + 1,
        Credit_History_Binary=(df['Credit_History'] == 'Yes').astype(int),
        Property_Area_Numeric=df['Property_Area'].map(area_codes),
        Education_Numeric=(df['Education'] == 'Graduate').astype(int),
    )

# Derive the engineered columns from the raw frame and list everything available
df_features = create_features(df)
print("New features created:")
print(list(df_features.columns))
df_features.head()

## 4. Data Preprocessing

In [None]:
def preprocess_data(df):
    """Turn the engineered frame into arrays ready for model training.

    Args:
        df: Frame containing the target column ``Loan_Status`` ('Y'/'N') and
            the six categorical columns listed below.

    Returns:
        A tuple ``(X, y, label_encoders)``: the feature matrix as a NumPy
        array (every column except ``Loan_Status``), the target mapped to
        1 for 'Y' and 0 for 'N', and a dict of the fitted ``LabelEncoder``
        for each categorical column.
    """
    categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

    # Target first, then everything else becomes the feature frame.
    target = df['Loan_Status'].map({'Y': 1, 'N': 0})
    features = df.drop(columns=['Loan_Status'])

    # One encoder per categorical column, kept so new data can be encoded
    # the same way later.
    label_encoders = {
        col: LabelEncoder().fit(features[col]) for col in categorical_columns
    }
    for col, encoder in label_encoders.items():
        features[col] = encoder.transform(features[col])

    return features.values, target.values, label_encoders

# Encode the engineered frame into model-ready arrays and report their layout
X, y, encoders = preprocess_data(df_features)
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
input_columns = df_features.drop(columns=['Loan_Status']).columns.tolist()
print(f"Features: {input_columns}")

## 5. Model Training and Evaluation

In [None]:
# Hold out 20% of the rows for testing, keeping the class balance in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply them
# to both partitions so no test information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Candidate classifiers, keyed by the display name used in later sections
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
}


def _score_predictions(y_true, y_hat, y_score):
    """Return the five headline metrics for one set of test predictions."""
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision_score(y_true, y_hat),
        recall=recall_score(y_true, y_hat),
        f1_score=f1_score(y_true, y_hat),
        auc_roc=roc_auc_score(y_true, y_score),
    )


# Fit every candidate on the scaled training data and score it on the test set
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    # Hard labels plus the probability of the positive (approved) class
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    metrics = _score_predictions(y_test, y_pred, y_pred_proba)

    # Keep everything later sections need: model, outputs and scores
    results[name] = dict(
        model=model,
        predictions=y_pred,
        probabilities=y_pred_proba,
        metrics=metrics,
    )

    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC-ROC: {metrics['auc_roc']:.4f}")

## 6. Model Comparison

In [None]:
# One row per model, one column per metric
comparison_df = pd.DataFrame(
    [entry['metrics'] for entry in results.values()],
    index=list(results),
)

print("Model Performance Comparison:")
print(comparison_df.round(4))

# Grouped bars: each model on the x-axis, one bar per metric
fig, ax = plt.subplots(figsize=(12, 8))
comparison_df.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Model Performance Comparison')
ax.set_xlabel('Models')
ax.set_ylabel('Score')
# Place the legend outside the axes so it does not cover the bars
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Overlay the ROC curve of every trained model on a single set of axes
fig, ax = plt.subplots(figsize=(10, 8))

for name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
    auc = result['metrics']['auc_roc']
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')

# Diagonal reference line: the expected curve of a random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Column names in the same order as the columns of the feature matrix
feature_names = df_features.drop(columns=['Loan_Status']).columns.tolist()

# Importances reported by the fitted Gradient Boosting model
gb_model = results['Gradient Boosting']['model']
feature_importance = gb_model.feature_importances_

# Pair each feature with its importance, most important first
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
)

print("Top 10 Most Important Features:")
print(importance_df.head(10))

# Horizontal bar chart of the 15 highest-ranked features
top_features = importance_df.head(15)
bar_positions = list(range(len(top_features)))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances (Gradient Boosting)')
# Flip the axis so the most important feature is drawn at the top
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

## 8. Confusion Matrix Analysis

In [None]:
# Create one confusion matrix per trained model
class_labels = ['Rejected', 'Approved']
class_ids = [0, 1]  # encoded targets: 0 = rejected ('N'), 1 = approved ('Y')

# Size the grid from the number of models instead of assuming exactly two;
# squeeze=False keeps `axes` two-dimensional even for a single model.
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(7.5 * n_models, 6), squeeze=False)

for ax, (name, result) in zip(axes.ravel(), results.items()):
    # Pinning `labels` keeps the matrix 2x2, matching the tick labels, even
    # if one class is missing from both the test targets and the predictions.
    cm = confusion_matrix(y_test, result['predictions'], labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

plt.tight_layout()
plt.show()

# Print detailed classification reports
for name, result in results.items():
    print(f"\nClassification Report - {name}:")
    # `labels` ties each entry of `target_names` to a class id explicitly.
    print(classification_report(
        y_test, result['predictions'],
        labels=class_ids, target_names=class_labels,
    ))

## 9. Save the Best Model

In [None]:
# `os` and `joblib` are imported in the notebook's import cell at the top.

# Select best model based on F1-score on the held-out test set
best_model_name = max(results, key=lambda name: results[name]['metrics']['f1_score'])
best_model = results[best_model_name]['model']
best_metrics = results[best_model_name]['metrics']

print(f"Best model: {best_model_name}")
print(f"F1-Score: {best_metrics['f1_score']:.4f}")

# Single definition of the output directory, shared by every artifact below
model_dir = os.path.join('..', 'models')
os.makedirs(model_dir, exist_ok=True)

# Save the complete model pipeline: everything needed to preprocess a new
# application and score it.
model_data = {
    'model': best_model,
    'scaler': scaler,
    'label_encoders': encoders,
    'feature_names': feature_names,
    'metrics': best_metrics
}

joblib.dump(model_data, os.path.join(model_dir, 'loan_eligibility_model.joblib'))
print("Model saved successfully!")

# Also save individual components for the API
joblib.dump(best_model, os.path.join(model_dir, 'model.joblib'))
joblib.dump(scaler, os.path.join(model_dir, 'scaler.joblib'))
joblib.dump(encoders, os.path.join(model_dir, 'label_encoders.joblib'))
print("Individual components saved!")

## 10. Test Model with Sample Data

In [None]:
# Test with sample applicants
test_applicants = [
    {
        'Gender': 'Male',
        'Married': 'Yes',
        'Dependents': 1,
        'Education': 'Graduate',
        'Self_Employed': 'No',
        'ApplicantIncome': 5000,
        'CoapplicantIncome': 3000,
        'LoanAmount': 150000,
        'Loan_Amount_Term': 360,
        'Credit_History': 'Yes',
        'Property_Area': 'Urban'
    },
    {
        'Gender': 'Female',
        'Married': 'No',
        'Dependents': 3,
        'Education': 'Not Graduate',
        'Self_Employed': 'Yes',
        'ApplicantIncome': 2000,
        'CoapplicantIncome': 0,
        'LoanAmount': 200000,
        'Loan_Amount_Term': 240,
        'Credit_History': 'No',
        'Property_Area': 'Rural'
    }
]

# Preprocess all applicants in one batch with the same steps used for training
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

df_test = pd.DataFrame(test_applicants)
df_test_features = create_features(df_test)

# Encode categorical variables with the encoders fitted on the training data
for col in categorical_columns:
    df_test_features[col] = encoders[col].transform(df_test_features[col])

# Select features in the training column order. The scaler and model were
# fitted on plain NumPy arrays, so a plain array is passed here as well; a
# DataFrame would trigger scikit-learn's feature-name mismatch warning.
X_test_sample = df_test_features[feature_names].to_numpy()
X_test_scaled_sample = scaler.transform(X_test_sample)

# Predict once for the whole batch
sample_predictions = best_model.predict(X_test_scaled_sample)
sample_probabilities = best_model.predict_proba(X_test_scaled_sample)[:, 1]

for i, applicant in enumerate(test_applicants):
    print(f"\nTest Applicant {i+1}:")
    print(f"Income: ${applicant['ApplicantIncome'] + applicant['CoapplicantIncome']:,}")
    print(f"Loan Amount: ${applicant['LoanAmount']:,}")
    print(f"Credit History: {applicant['Credit_History']}")
    print(f"Education: {applicant['Education']}")

    prediction = sample_predictions[i]
    probability = sample_probabilities[i]

    print(f"Prediction: {'Approved' if prediction == 1 else 'Rejected'}")
    print(f"Probability: {probability:.4f}")

## Summary

This notebook demonstrated:

1. **Data Generation**: Created synthetic loan application data with realistic patterns
2. **Exploratory Analysis**: Analyzed feature distributions and their relationship with loan approval
3. **Feature Engineering**: Created additional features like income ratios and family size
4. **Model Training**: Trained and compared Logistic Regression and Gradient Boosting models
5. **Model Evaluation**: Used multiple metrics including accuracy, precision, recall, F1-score, and AUC-ROC
6. **Feature Importance**: Identified the most important features for loan eligibility prediction
7. **Model Saving**: Saved the best performing model for deployment

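The Gradient Boosting model showed superior performance and will be used in the API deployment. Note that Section 9 selects the saved model automatically by its test-set F1-score, so confirm the printed best model whenever the notebook is re-run.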