# Credifi Credit Risk Analysis

**Submission Format Requirements:**
- All code and results are presented in this Jupyter Notebook.
- Library imports and installed versions are listed up front to support reproducibility.
- Well-commented code for clarity.


In [None]:
# Library Imports and Version Listing
import sys
import platform
import pandas as pd
import numpy as np
import sklearn
import xgboost
import pickle

# Record the interpreter and core library versions in the notebook output.
# A single print call with a newline separator emits one line per entry.
print(
    f'Python version: {platform.python_version()}',
    f'Pandas version: {pd.__version__}',
    f'Numpy version: {np.__version__}',
    f'scikit-learn version: {sklearn.__version__}',
    f'XGBoost version: {xgboost.__version__}',
    sep='\n',
)


## Data Loading and Preprocessing
Load the credit data and apply necessary preprocessing steps as per the project pipeline.


In [None]:
# Data Loading, Cleaning, and Preprocessing
from ml.data_processor import CreditDataProcessor


def _show(caption, table):
    """Print a caption line, then render the given table beneath it."""
    print(caption)
    display(table)


# A single processor instance drives every stage below.
processor = CreditDataProcessor()

# Stage 1: load the raw data and preview its first rows.
data = processor.load_data('data/credit_data.csv')
_show(f'Dataset shape: {data.shape}', data.head())

# Stage 2: clean the raw frame and show summary statistics of the result.
clean_data = processor.clean_data(data)
_show('After cleaning:', clean_data.describe())

# Stage 3: encode categorical features of the cleaned frame.
encoded_data = processor.encode_categorical_features(clean_data)
_show('After encoding categorical features:', encoded_data.head())

# Stage 4: scale numerical features of the encoded frame.
scaled_data = processor.scale_features(encoded_data)
_show('After scaling numerical features:', scaled_data.head())

## Model Training and Evaluation
Train the XGBoost model, evaluate its performance, and visualize the results.

In [None]:
# Model Training and Evaluation
from ml.model import CreditRiskModel

# Build the model wrapper and train it on the cleaned data.
# Pass optimize_hyperparameters=True instead if you want grid search.
risk_model = CreditRiskModel()
results = risk_model.train_model(clean_data, optimize_hyperparameters=False)

# Headline metrics, printed in a fixed order as "<label> <value>".
for label, key in (
    ("Train Score:", 'train_score'),
    ("Test Score:", 'test_score'),
    ("Accuracy:", 'accuracy'),
    ("AUC Score:", 'auc_score'),
    ("Cross-Validation Mean:", 'cv_mean'),
    ("Cross-Validation Std:", 'cv_std'),
):
    print(label, results[key])

# Classification report, transposed before display.
import pandas as pd
cr = pd.DataFrame(results['classification_report']).T
display(cr)

# Confusion matrix drawn as an annotated heatmap on its own axes.
import seaborn as sns
import matplotlib.pyplot as plt

cm = results['confusion_matrix']
_, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# Feature importance as a horizontal bar chart (features on the y axis).
fi = results['feature_importance']
_, fi_ax = plt.subplots(figsize=(8, 5))
fi_scores = list(fi.values())
fi_labels = list(fi.keys())
sns.barplot(x=fi_scores, y=fi_labels, ax=fi_ax)
fi_ax.set_title('Feature Importance')
fi_ax.set_xlabel('Importance')
fi_ax.set_ylabel('Feature')
plt.show()

## Risk Analysis and Profit Optimization
Analyze risk, assign interest rates, and evaluate profit optimization using the trained model's predictions.

In [None]:
# Risk Analysis and Profit Optimization
from ml.risk_calculator import RiskCalculator

# Prepare features for prediction.
# fit=False presumably reuses the transformers fitted during training — confirm.
X = risk_model.data_processor.prepare_features(clean_data, fit=False)
y = clean_data[risk_model.target_column]

# Get model predictions and probabilities.
# NOTE(review): X covers every cleaned row, presumably including rows the model
# was trained on, so the figures below are at least partly in-sample — confirm.
y_pred = risk_model.model.predict(X)
# Keep only the second predict_proba column (class index 1).
y_pred_proba = risk_model.model.predict_proba(X)[:, 1]

# Initialize risk calculator
risk_calc = RiskCalculator()

# Generate risk report from the cleaned data and the model outputs.
risk_report = risk_calc.generate_risk_report(clean_data, y_pred, y_pred_proba)

# Display key risk metrics
print('Portfolio Metrics:')
for k, v in risk_report['portfolio_metrics'].items():
    print(f'{k}: {v}')

print('\nProfit Analysis:')
for k, v in risk_report['profit_analysis'].items():
    if k.endswith('_data'):
        continue  # Skip large DataFrames
    print(f'{k}: {v}')

print('\nRisk Tier Distribution:')
print(risk_report['tier_distribution'])

# Show break-even analysis (optional).
# The probabilities are positional (one per row of clean_data). A bare
# pd.Series(...) would get a fresh 0..n-1 index, which may not match
# clean_data's index if cleaning dropped rows. Reusing clean_data's index keeps
# the two row-aligned for any label-based operation inside the calculator.
default_probabilities = pd.Series(y_pred_proba, index=clean_data.index)
break_even = risk_calc.calculate_break_even_analysis(clean_data, default_probabilities)
display(break_even['break_even_analysis'].head())

## SHAP Explainability
Visualize and interpret model predictions using SHAP values.

In [None]:
# SHAP Explainability
# Relies on names created by earlier cells: clean_data, risk_model and X.
import shap

# Pick a sample application (first row), converted to a plain dict.
# NOTE(review): the dict carries every column of clean_data, presumably
# including the target column — confirm explain_prediction tolerates that.
sample_app = clean_data.iloc[0].to_dict()

# Generate SHAP explanation for that single application.
# The result is stored but not displayed or used further in this cell.
explanation = risk_model.explain_prediction(sample_app)

# Build a tree explainer on the trained model and compute SHAP values for
# every row of X (the feature matrix prepared in the risk-analysis cell).
explainer = shap.TreeExplainer(risk_model.model)
shap_values = explainer.shap_values(X)

# Summary plot (feature importance across all samples)
shap.summary_plot(shap_values, X, feature_names=X.columns)

# Force plot for a single prediction (requires Jupyter, not just VSCode)
# initjs() loads the JavaScript the interactive plot needs. The force_plot call
# is the last expression of the cell so that the notebook renders its return value.
# Assumes shap_values is indexable by row and expected_value is a single
# scalar — TODO confirm for the installed shap version.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X.iloc[0, :], feature_names=X.columns)

## Risk and ROI Visualizations
Visualize break-even analysis, risk tiers, and portfolio risk/return.

In [None]:
# Visualize break-even analysis: net profit as a function of acceptance rate.
# Uses break_even, risk_report, plt and sns created by earlier cells.
break_even_df = break_even['break_even_analysis']
_, be_ax = plt.subplots(figsize=(10, 6))
be_ax.plot(break_even_df['acceptance_rate'], break_even_df['net_profit'], marker='o')
be_ax.set_title('Net Profit vs. Acceptance Rate')
be_ax.set_xlabel('Acceptance Rate')
be_ax.set_ylabel('Net Profit')
be_ax.grid(True)
plt.show()

# Visualize risk tier distribution: one bar per tier.
tiers = risk_report['tier_distribution']
_, tier_ax = plt.subplots(figsize=(8, 5))
tier_names = list(tiers.keys())
tier_counts = list(tiers.values())
sns.barplot(x=tier_names, y=tier_counts, ax=tier_ax)
tier_ax.set_title('Risk Tier Distribution')
tier_ax.set_xlabel('Risk Tier')
tier_ax.set_ylabel('Number of Applicants')
plt.show()

# Visualize weighted average interest rate and default probability side by side.
metrics = risk_report['portfolio_metrics']
_, pf_ax = plt.subplots(figsize=(6, 4))
bar_labels = ['Weighted Avg. Interest Rate', 'Weighted Avg. Default Prob.']
bar_heights = [
    metrics['weighted_avg_interest_rate'],
    metrics['weighted_avg_default_probability'],
]
pf_ax.bar(bar_labels, bar_heights)
pf_ax.set_title('Portfolio Risk & Return')
pf_ax.set_ylabel('Value')
plt.show()