In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [2]:
# File paths
INPUT_FILE = "../../../3. Data/3. Anomaly_Data/features_with_anomaly.csv"
OUTPUT_FILE = "../../../3. Data/4. Risk_Data/features_with_risklabel.csv"

# Columns read later in this notebook (statistics cell and assign_risk_label)
REQUIRED_COLUMNS = [
    'PD',
    'anomaly_score',
    'anomalyFlag',
    'bounceCount',
    'emiRatio',
    'expenseRatio',
    'incomeCV',
    'avgMonthlyIncome',
    'avgMonthlyBalance',
    'accountAgeMonths',
]

# Load dataset
df = pd.read_csv(INPUT_FILE)

# Fail fast with a readable message instead of a KeyError raised from a
# later cell when the input file does not carry an expected column.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{INPUT_FILE} is missing required columns: {missing_columns}")

In [3]:
# Statistics
# Count the flagged rows once and reuse the value for both the count and the share.
n_flagged = (df['anomalyFlag'] == 1).sum()
flagged_pct = 100 * n_flagged / len(df)

print("\n📈 Key Statistics:")
print(f"   PD:           Mean={df['PD'].mean():.4f}, Median={df['PD'].median():.4f}")
print(f"   Anomaly Score: Mean={df['anomaly_score'].mean():.4f}, Max={df['anomaly_score'].max():.4f}")
print(f"   Anomaly Flag:  {n_flagged:,} ({flagged_pct:.2f}%)")
print(f"   Avg Bounces:   {df['bounceCount'].mean():.2f}")
print(f"   Avg EMI Ratio: {df['emiRatio'].mean():.4f}")


📈 Key Statistics:
   PD:           Mean=0.3679, Median=0.1639
   Anomaly Score: Mean=0.1318, Max=0.2092
   Anomaly Flag:  907 (3.02%)
   Avg Bounces:   0.64
   Avg EMI Ratio: 0.3004


# 📊 Borrower Risk Classification Framework
 
Borrowers are classified into **HIGH**, **MEDIUM**, or **LOW** risk categories based on probability of default (PD), anomalies, payment behavior, and financial stress indicators.

---

## 🔴 SECTION 1 — ABSOLUTE RED FLAGS (ALWAYS HIGH RISK)

If **any one** of the following conditions is met, the borrower is **immediately classified as HIGH RISK**.

### Non-Negotiable Conditions
- `anomalyFlag == 1`  
  *Fraud or abnormal behavior detected*

- `PD ≥ 0.70`  
  *Extremely high probability of default*

- `bounceCount ≥ 5`  
  *Chronic payment failure pattern*

- `anomaly_score ≥ 0.80`  
  *Highly unusual behavior compared to the population*

➡️ **Result:** `HIGH RISK`

These represent **critical banking risk events** and override all other logic.

---

## 🔴 SECTION 2 — HIGH RISK ZONE (Stacked Severe Stress)

### Direct PD Rule
- If `PD ≥ 0.40`  
  ➝ **HIGH RISK**

---

### Composite High-Risk Scoring

If PD is below 0.40, a **high-risk score** is calculated using multiple stress indicators.

#### Scoring Rules
| Condition | Points |
|---------|--------|
| `PD ≥ 0.30` | +2 |
| `bounceCount ≥ 3` | +2 |
| `anomaly_score ≥ 0.50` | +1 |
| `emiRatio ≥ 0.55` | +1 |
| `expenseRatio ≥ 0.85` | +1 |
| `incomeCV ≥ 0.50` | +1 |
| `avgMonthlyBalance < 20% of avgMonthlyIncome` | +1 |

#### Decision Rule
- If `high_risk_score ≥ 4`  
  ➝ **HIGH RISK**

📌 **Principle:**  
Multiple moderate weaknesses together can equal **severe financial stress**.

---

## 🟡 SECTION 3 — MEDIUM RISK ZONE

### PD-Based Medium Risk
- `PD ≥ 0.20` ➝ **MEDIUM RISK**
- `0.15 ≤ PD < 0.20` ➝ **MEDIUM RISK**  
  *(Even without additional signals)*

---

### Warning-Signal Accumulation

A **medium-risk score** is calculated to detect early warning patterns.

#### Scoring Rules
| Condition | Points |
|---------|--------|
| `PD ≥ 0.10` | +2 |
| `bounceCount ≥ 1` | +1 |
| `anomaly_score ≥ 0.30` | +1 |
| `emiRatio ≥ 0.40` | +1 |
| `expenseRatio ≥ 0.60` | +1 |
| `incomeCV ≥ 0.30` | +1 |
| `avgMonthlyBalance < 50% of avgMonthlyIncome` | +1 |
| `accountAgeMonths < 12` | +1 |

#### Decision Rule
- If `medium_risk_score ≥ 4`  
  ➝ **MEDIUM RISK**

📌 **Interpretation:**  
These profiles indicate **borderline or emerging risk** requiring monitoring.

---

## 🟢 SECTION 4 — LOW RISK (DEFAULT)

If **none** of the HIGH or MEDIUM conditions are triggered:

➡️ **LOW RISK**

### Typical Characteristics
- Low probability of default
- No anomalies detected
- Controlled EMI burden
- Controlled expenses
- Strong liquidity position
- Stable income
- Clean repayment history

---

## ✅ Final Risk Labels
- **HIGH RISK** → Immediate action / rejection / enhanced controls
- **MEDIUM RISK** → Monitoring / conditional approval
- **LOW RISK** → Standard approval

---


In [4]:
def assign_risk_label(row):
    """Classify one borrower record as "HIGH", "MEDIUM" or "LOW" risk.

    Rules are evaluated in order and the first match wins:

    1. Absolute red flags -> "HIGH": ``anomalyFlag == 1``, ``PD >= 0.70``,
       ``bounceCount >= 5`` or ``anomaly_score >= 0.8``.
    2. High-risk zone -> "HIGH": ``PD >= 0.40``, or a weighted stress score
       of at least 4.
    3. Medium-risk zone -> "MEDIUM": ``PD >= 0.15``, or a weighted warning
       score of at least 4.
    4. Anything else -> "LOW".

    Parameters
    ----------
    row : mapping indexable by column name
        For example the row passed by ``df.apply(assign_risk_label, axis=1)``
        or a plain dict. It must provide ``PD``, ``anomaly_score``,
        ``anomalyFlag``, ``bounceCount``, ``emiRatio``, ``expenseRatio``,
        ``incomeCV``, ``avgMonthlyIncome``, ``avgMonthlyBalance`` and
        ``accountAgeMonths``.

    Returns
    -------
    str
        One of "HIGH", "MEDIUM" or "LOW".

    Notes
    -----
    * A missing key propagates the lookup error from ``row[...]``
      (``KeyError`` for a dict or a Series).
    * Comparisons against a float NaN are False, so a NaN value never
      triggers the rule it feeds; a row whose fields are all NaN is
      labelled "LOW".
    * NOTE(review): the statistics recorded in this notebook show a maximum
      ``anomaly_score`` of 0.2092, so the 0.3 / 0.5 / 0.8 cut-offs below may
      never fire on that data -- confirm the scale of the score.
    """
    # Read every field up front so a missing column fails immediately,
    # whichever rule would have matched first.
    default_prob = row['PD']  # deliberately not named `pd` (the pandas alias)
    anomaly_score = row['anomaly_score']
    anomaly_flag = row['anomalyFlag']
    bounces = row['bounceCount']
    emi_ratio = row['emiRatio']
    expense_ratio = row['expenseRatio']
    income_cv = row['incomeCV']
    income = row['avgMonthlyIncome']
    balance = row['avgMonthlyBalance']
    account_age = row['accountAgeMonths']

    # SECTION 1: absolute red flags -> always HIGH
    if anomaly_flag == 1:
        return "HIGH"  # Anomaly detector flagged the record
    if default_prob >= 0.70:
        return "HIGH"  # Extremely high default probability
    if bounces >= 5:
        return "HIGH"  # Chronic payment failure
    if anomaly_score >= 0.8:
        return "HIGH"  # Very high anomaly score

    # SECTION 2: high-risk zone
    if default_prob >= 0.40:
        return "HIGH"

    # Several severe stress signals together also mean HIGH.
    # Each entry is (condition, points).
    high_risk_signals = (
        (default_prob >= 0.30, 2),        # Strong PD signal
        (bounces >= 3, 2),                # Multiple bounces
        (anomaly_score >= 0.5, 1),        # Elevated anomaly
        (emi_ratio >= 0.55, 1),           # Very high EMI burden
        (expense_ratio >= 0.85, 1),       # Extreme expenses
        (income_cv >= 0.50, 1),           # Very volatile income
        (balance < (0.2 * income), 1),    # Very low liquidity buffer
    )
    high_risk_score = sum(points for hit, points in high_risk_signals if hit)
    if high_risk_score >= 4:
        return "HIGH"

    # SECTION 3: medium-risk zone
    # PD >= 0.15 is sufficient on its own; this single check covers both the
    # "PD >= 0.20" band and the "0.15 <= PD < 0.20" band.
    if default_prob >= 0.15:
        return "MEDIUM"

    # Lower PD, but enough early-warning signals -> MEDIUM.
    medium_risk_signals = (
        (default_prob >= 0.10, 2),
        (bounces >= 1, 1),
        (anomaly_score >= 0.3, 1),
        (emi_ratio >= 0.40, 1),
        (expense_ratio >= 0.60, 1),
        (income_cv >= 0.30, 1),
        (balance < (0.5 * income), 1),
        (account_age < 12, 1),            # Account younger than 12 months
    )
    medium_risk_score = sum(points for hit, points in medium_risk_signals if hit)
    if medium_risk_score >= 4:
        return "MEDIUM"

    # SECTION 4: nothing triggered -> LOW
    return "LOW"

In [5]:

# Announce the labelling pass, then evaluate the rules on every row.
print("\nAssigning risk labels using ALL features...")
row_labels = df.apply(assign_risk_label, axis=1)  # axis=1 -> one call per row
df['riskLabel'] = row_labels


Assigning risk labels using ALL features...


In [6]:
# Save results
# to_csv does not create missing folders, so make sure the destination
# directory exists before writing (no-op when it is already there).
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Saved to: {OUTPUT_FILE}")
print(f"📦 Total records: {len(df):,}")
print(f"📊 Columns: {len(df.columns)} (added riskLabel)")

✅ Saved to: ../../../3. Data/4. Risk_Data/features_with_risklabel.csv
📦 Total records: 30,000
📊 Columns: 12 (added riskLabel)
