# Early Risk Signal Model Development

## Objective
Identify behavioral patterns in customer data that indicate early signs of credit card delinquency. We will analyze the provided dataset to define threshold-based risk flags.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data
# The path is relative, so it resolves against the current working directory
# (presumably the notebook's own folder -- confirm before running elsewhere).
data_path = '../data/sample_data.csv'
df = pd.read_csv(data_path)
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

## 1. Exploratory Data Analysis (EDA)
Let's look at the distributions of the key behavioral metrics.

In [None]:
# One row of three histograms (with KDE overlay), one panel per metric.
# Each entry pairs the DataFrame column to plot with its panel title.
hist_panels = [
    ('utilisation_pct', 'Utilization Distribution'),
    ('avg_payment_ratio', 'Payment Ratio Distribution'),
    ('recent_spend_change_pct', 'Spend Change Distribution'),
]

plt.figure(figsize=(15, 5))

# Subplot positions are 1-based, hence start=1.
for panel_no, (metric_col, panel_title) in enumerate(hist_panels, start=1):
    plt.subplot(1, 3, panel_no)
    sns.histplot(df[metric_col], bins=20, kde=True)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

## 2. Defining Risk Flags
Based on the distributions and business logic, we define the following flags:

1. **High Utilization**: Customers using > 90% of their limit are at risk of over-extension.
2. **Low Payment**: Customers paying < 10% of their due amount are likely in financial distress.
3. **Cash Withdrawal**: Any cash withdrawal indicates liquidity issues.
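4. **Bust-out Risk**: A sudden increase in spending (> 20%) combined with other factors can indicate 'bust-out' fraud or reckless spending. Note that the scoring function below does not read `recent_spend_change_pct`, so this flag is described here but does not yet contribute to the risk score.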

In [None]:
# Apply Logic
def calculate_risk(row):
    """Score one customer record against the threshold-based risk rules.

    Points are awarded per rule and summed:

    * ``utilisation_pct``: above 90 adds 30, otherwise above 70 adds 15.
    * ``avg_payment_ratio``: below 10 adds 40, otherwise below 50 adds 25.
    * ``cash_withdrawal_pct``: above 0 adds 20.

    Only the top tier of each rule contributes a label to the driver list;
    the intermediate tiers (15 and 25 points) raise the score silently.
    A NaN input fails every comparison and therefore adds nothing.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) exposing the keys
            ``utilisation_pct``, ``avg_payment_ratio`` and
            ``cash_withdrawal_pct``.

    Returns:
        A ``pd.Series`` indexed by ``risk_score`` (the summed points, capped
        at 100) and ``risk_drivers`` (the triggered labels joined by ", ",
        or an empty string when none triggered).
    """
    utilisation = row['utilisation_pct']
    payment_ratio = row['avg_payment_ratio']
    cash_share = row['cash_withdrawal_pct']

    # Tiered points: the stricter threshold is tested first in each chain.
    utilisation_points = 30 if utilisation > 90 else (15 if utilisation > 70 else 0)
    payment_points = 40 if payment_ratio < 10 else (25 if payment_ratio < 50 else 0)
    cash_points = 20 if cash_share > 0 else 0

    # Labels are reported only for the most severe tier of each rule,
    # in the fixed order utilisation -> payment -> cash.
    flag_table = (
        (utilisation > 90, 'High Utilization'),
        (payment_ratio < 10, 'Critical Payment Miss'),
        (cash_share > 0, 'Cash Withdrawal'),
    )
    drivers = [label for triggered, label in flag_table if triggered]

    total = utilisation_points + payment_points + cash_points
    capped_total = min(total, 100)

    return pd.Series(
        [capped_total, ", ".join(drivers)],
        index=['risk_score', 'risk_drivers'],
    )

# Score each customer row individually; the per-row results come back as a
# frame with 'risk_score' and 'risk_drivers' columns.
risk_df = df.apply(calculate_risk, axis='columns')

# Attach the scores next to the original customer columns.
final_df = pd.concat([df, risk_df], axis='columns')

# Preview the customers with the highest scores first.
ranked_by_risk = final_df.sort_values(by='risk_score', ascending=False)
ranked_by_risk.head()

## 3. Analysis of High Risk Customers

In [None]:
# Customers whose score reaches the cut-off of 60 are treated as high risk.
is_high_risk = final_df['risk_score'] >= 60
high_risk = final_df.loc[is_high_risk]
print(f"Identified {len(high_risk)} High Risk Customers")

# Show only the identifying columns and the inputs/outputs of the scoring.
report_columns = [
    'customer_id',
    'utilisation_pct',
    'avg_payment_ratio',
    'risk_score',
    'risk_drivers',
]
high_risk[report_columns]