# Creditworthiness Prediction

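This notebook predicts an individual's creditworthiness from past financial data. It trains and compares three classification algorithms (logistic regression, a decision tree, and a random forest) on a synthetically generated dataset.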


## 1. Import Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
import warnings
# Silence all warning categories for the rest of the session.
# NOTE: this is a blanket filter, so library deprecation notices are hidden too.
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme plus a larger default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


## 2. Load and Explore Dataset


In [None]:
# Build a reproducible synthetic applicant table for demonstration purposes.
# Replace this cell with a real data load when an actual dataset is available.
np.random.seed(42)
n_samples = 1000

# NOTE: keep the draws in this exact order; the seeded global random stream
# is consumed sequentially, so reordering would change every column.
data = dict(
    age=np.random.randint(18, 70, size=n_samples),
    income=np.clip(np.random.normal(50000, 20000, size=n_samples), 20000, 150000),
    debt_amount=np.clip(np.random.normal(15000, 8000, size=n_samples), 0, 50000),
    credit_history_length=np.random.randint(1, 20, size=n_samples),
    num_credit_cards=np.random.randint(1, 8, size=n_samples),
    num_loans=np.random.randint(0, 5, size=n_samples),
    payment_history_score=np.random.uniform(0, 100, size=n_samples),  # 0-100 score
    late_payments_count=np.random.poisson(2, size=n_samples),
    utilization_ratio=np.random.uniform(0, 1, size=n_samples),  # credit utilization
    employment_years=np.random.uniform(0, 30, size=n_samples),
)

df = pd.DataFrame(data)

# Label rule: an applicant is creditworthy (1) only when ALL four checks pass:
# good payment history, low debt-to-income, moderate utilization, few late payments.
debt_to_income = df['debt_amount'] / (df['income'] + 1)

is_creditworthy = df['payment_history_score'] > 70
is_creditworthy &= debt_to_income < 0.4
is_creditworthy &= df['utilization_ratio'] < 0.7
is_creditworthy &= df['late_payments_count'] < 3
df['creditworthy'] = is_creditworthy.astype(int)

print(f"Dataset shape: {df.shape}")
print("\nCreditworthy distribution:")
print(df['creditworthy'].value_counts())
print("\nFirst few rows:")
df.head()


In [None]:
# Basic statistics: per-column summary produced by pandas' describe()
# (left as the last expression so the notebook renders the resulting table)
df.describe()


In [None]:
# Count the absent entries in each column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

# Report the dtype pandas holds for each column
column_types = df.dtypes
print("\nData types:")
print(column_types)


## 3. Feature Engineering


In [None]:
# --- Ratio and count features -------------------------------------------
# Shared denominator for the two age-normalised ratios: (age - 17 + 1).
adult_years = df['age'] - 17 + 1

df['debt_to_income_ratio'] = df['debt_amount'].div(df['income'] + 1)  # +1 guards against a zero income
df['income_per_year'] = df['income'].div(adult_years)  # Approximate income per year of age
df['avg_debt_per_loan'] = df['debt_amount'].div(df['num_loans'] + 1)  # +1 so applicants with no loans do not divide by zero
df['credit_age_ratio'] = df['credit_history_length'].div(adult_years)
# Each late payment costs 10 points, with the penalty capped at 100
df['payment_reliability'] = 100 - df['late_payments_count'].mul(10).clip(lower=0, upper=100)
df['total_credit_lines'] = df['num_credit_cards'].add(df['num_loans'])

# --- Interaction features -----------------------------------------------
unused_credit_share = 1 - df['utilization_ratio']
df['income_utilization'] = df['income'] * unused_credit_share
df['history_payment_score'] = df['credit_history_length'] * df['payment_history_score'] / 100

# --- Categorical bins ---------------------------------------------------
age_edges = [0, 30, 45, 60, 100]
age_names = ['Young', 'Middle', 'Senior', 'Elder']
income_edges = [0, 40000, 70000, 100000, 200000]
income_names = ['Low', 'Medium', 'High', 'Very High']

df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_names)
df['income_group'] = pd.cut(df['income'], bins=income_edges, labels=income_names)

print("Feature engineering completed!")
print(f"Total features after engineering: {df.shape[1]}")
df.head()


In [None]:
# Histogram grid: one panel per selected feature (2 rows x 3 columns)
features_to_plot = ['income', 'debt_amount', 'payment_history_score',
                    'debt_to_income_ratio', 'utilization_ratio', 'credit_history_length']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2-D grid so panels can be paired with features

for ax, feature in zip(axes, features_to_plot):
    ax.hist(df[feature], bins=30, alpha=0.7, edgecolor='black')
    ax.set(title=f'Distribution of {feature}', xlabel=feature, ylabel='Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colour map on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()


## 4. Data Preprocessing


In [None]:
# Choose the model inputs.
# The target and the binned categorical columns (which would need encoding)
# are left out.
raw_features = [
    'age', 'income', 'debt_amount', 'credit_history_length',
    'num_credit_cards', 'num_loans', 'payment_history_score',
    'late_payments_count', 'utilization_ratio', 'employment_years',
]
engineered_features = [
    'debt_to_income_ratio', 'income_per_year', 'avg_debt_per_loan',
    'credit_age_ratio', 'payment_reliability', 'total_credit_lines',
    'income_utilization', 'history_payment_score',
]
feature_cols = raw_features + engineered_features

# Independent copies so later preprocessing never writes back into df
X = df.loc[:, feature_cols].copy()
y = df.loc[:, 'creditworthy'].copy()

class_counts = y.value_counts()
class_percentages = y.value_counts(normalize=True) * 100

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{class_counts}")
print(f"\nTarget distribution (%):\n{class_percentages}")


In [None]:
# Treat infinities (e.g. from ratio features) as missing values
X = X.replace([np.inf, -np.inf], np.nan)

# Split the data BEFORE imputing so that the fill values are learned from the
# training rows only; computing the means on the full table would leak
# information from the test set into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps the class ratio in both splits
)

# Impute missing values with the training-set column means
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
# Keep the full feature frame NaN-free as well for any later consumer of X
X = X.fillna(train_means)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"\nTraining target distribution:\n{y_train.value_counts()}")
print(f"\nTest target distribution:\n{y_test.value_counts()}")


In [None]:
# Standardise the features (important for Logistic Regression).
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into DataFrames, keeping the original row
# index and column names for readability
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

print("Feature scaling completed!")


## 5. Model Training and Evaluation


In [None]:
# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
}

# Per-model outputs collected for the comparison cells further down
results, predictions, probabilities = {}, {}, {}

# Logistic Regression is trained on the standardised features; the tree-based
# models use the original (unscaled) features.
scaled_inputs = (X_train_scaled, X_test_scaled)
raw_inputs = (X_train, X_test)
banner = '=' * 50

for name, model in models.items():
    print(f"\n{banner}")
    print(f"Training {name}...")
    print(banner)

    train_X, test_X = scaled_inputs if name == 'Logistic Regression' else raw_inputs

    model.fit(train_X, y_train)
    y_pred = model.predict(test_X)
    # Column 1 of predict_proba is the probability of the positive class (creditworthy = 1)
    y_proba = model.predict_proba(test_X)[:, 1]

    # Test-set metrics; ROC-AUC is computed from probabilities, the rest from hard labels
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba),
    }

    results[name] = scores
    predictions[name] = y_pred
    probabilities[name] = y_proba

    print(f"\n{name} Results:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


## 6. Model Comparison


In [None]:
# Tabulate the metrics: one row per model, one column per metric, 4 decimals
results_df = pd.DataFrame(results).transpose().round(4)

divider = "=" * 60
print("\nModel Performance Comparison:")
print(divider)
print(results_df)


In [None]:
# Side-by-side bar charts: every metric (left) and ROC-AUC alone (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
metrics_ax, auc_ax = axes

# Left panel: grouped bars, one group per model
results_df.plot(kind='bar', ax=metrics_ax, width=0.8)
metrics_ax.set_title('Model Performance Metrics Comparison', fontsize=14, fontweight='bold')
metrics_ax.set_ylabel('Score', fontsize=12)
metrics_ax.legend(loc='best')

# Right panel: ROC-AUC only, with the 0.5 chance level drawn as a reference line
results_df['ROC-AUC'].plot(kind='bar', ax=auc_ax, color='steelblue', width=0.6)
auc_ax.set_title('ROC-AUC Score Comparison', fontsize=14, fontweight='bold')
auc_ax.set_ylabel('ROC-AUC Score', fontsize=12)
auc_ax.axhline(y=0.5, color='r', linestyle='--', label='Random Classifier')
auc_ax.legend()

# Formatting shared by both panels
for ax in (metrics_ax, auc_ax):
    ax.set_xlabel('Model', fontsize=12)
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## 7. ROC Curves


In [None]:
# Overlay the test-set ROC curve of every model on a single set of axes
plt.figure(figsize=(10, 8))
ax = plt.gca()

for name in models:
    model_scores = probabilities[name]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, model_scores)
    area = roc_auc_score(y_test, model_scores)
    ax.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC = {area:.4f})', linewidth=2)

# Diagonal reference: the curve a random classifier would trace
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5000)', linewidth=1)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # slight headroom above TPR = 1
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


## 8. Confusion Matrices


In [None]:
# Plot one confusion-matrix heatmap per trained model.
# The grid is sized from `models` instead of a hard-coded 3, so adding or
# removing a model does not break the figure (3 models still give 15 x 4).
class_labels = ['Not Creditworthy', 'Creditworthy']
n_models = len(models)

# squeeze=False always returns a 2-D array, even for a single model;
# ravel() then gives a flat sequence of axes to pair with the models.
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)
axes = axes.ravel()

for ax, name in zip(axes, models):
    # Fix the class order to [0, 1] so the matrix is always 2 x 2 and the
    # tick labels below line up, even if a class is missing from the
    # predictions or the test labels.
    cm = confusion_matrix(y_test, predictions[name], labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                cbar_kws={'shrink': 0.8})
    ax.set_title(f'{name}\nAccuracy: {results[name]["Accuracy"]:.4f}',
                 fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=10)
    ax.set_ylabel('Actual', fontsize=10)
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

plt.tight_layout()
plt.show()


## 9. Feature Importance (Tree-based Models)


In [None]:
# Horizontal bar charts of the ten most important features for each
# tree-based model (these estimators expose `feature_importances_`)
tree_model_names = ['Decision Tree', 'Random Forest']
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, name in zip(axes, tree_model_names):
    model = models[name]

    # Pair each feature name with its importance, highest first
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    # Plot top 10 features
    top_features = feature_importance.head(10)
    bar_positions = range(len(top_features))

    ax.barh(bar_positions, top_features['importance'], color='steelblue')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance', fontsize=12)
    ax.set_title(f'Top 10 Feature Importance - {name}', fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    ax.invert_yaxis()  # put the most important feature at the top

plt.tight_layout()
plt.show()


## 10. Cross-Validation


In [None]:
# Perform cross-validation on the training split for a more robust evaluation
from sklearn.pipeline import make_pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}

for name, model in models.items():
    # Logistic Regression needs standardised inputs. The scaler is placed
    # inside a pipeline so it is re-fitted on the training part of every fold;
    # feeding the pre-scaled training frame would let each validation fold
    # influence the scaling statistics (a subtle form of data leakage).
    # cross_val_score clones the estimator, so the already-fitted models in
    # `models` are left untouched.
    if name == 'Logistic Regression':
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    cv_scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring='roc_auc')
    cv_results[name] = {
        'Mean ROC-AUC': cv_scores.mean(),
        'Std ROC-AUC': cv_scores.std(),
        'Scores': cv_scores
    }

    # The reported spread is two standard deviations across the folds
    print(f"{name} Cross-Validation ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Visualize cross-validation results: one box per model, one point per fold
cv_df = pd.DataFrame({
    name: cv_results[name]['Scores'] for name in models
})

plt.figure(figsize=(10, 6))
cv_df.boxplot()
plt.title('Cross-Validation ROC-AUC Scores', fontsize=14, fontweight='bold')
plt.ylabel('ROC-AUC Score', fontsize=12)
plt.xlabel('Model', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


## 11. Summary and Conclusions


In [None]:
# Final report: test-set metrics, cross-validation summary, and the best
# model according to ROC-AUC and to F1-Score.
print("="*60)
print("FINAL MODEL COMPARISON SUMMARY")
print("="*60)
print("\nTest Set Performance:")
print(results_df)

print("\n\nCross-Validation Performance:")
# One row per model with the mean and standard deviation of its fold scores
cv_summary = pd.DataFrame({
    name: [cv_results[name]['Mean ROC-AUC'], cv_results[name]['Std ROC-AUC']]
    for name in models
}, index=['Mean ROC-AUC', 'Std ROC-AUC']).T
print(cv_summary)

# Rank on the unrounded metrics. results_df is rounded to 4 decimals for
# display, which can create artificial ties that idxmax would resolve in
# favour of whichever model happens to be listed first.
raw_scores = pd.DataFrame(results).T

print("\n\nBest Model:")
best_model = raw_scores['ROC-AUC'].idxmax()  # name of the model, not the estimator object
print(f"Based on ROC-AUC: {best_model} ({raw_scores.loc[best_model, 'ROC-AUC']:.4f})")

best_f1 = raw_scores['F1-Score'].idxmax()
print(f"Based on F1-Score: {best_f1} ({raw_scores.loc[best_f1, 'F1-Score']:.4f})")
