In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed the legacy global NumPy RNG so every draw below is reproducible.
# The order of the np.random.* calls matters: reordering them changes the data.
np.random.seed(42)

n = 10000  # number of loans

# Loan sectors (based on SBI's annual report) with their rough portfolio share.
# The shares are passed to np.random.choice as `p`, so they must sum to 1.
sector_share = {
    "Agriculture": 0.15,
    "Industry_Priority": 0.07,
    "Services_Priority": 0.10,
    "Personal_Priority": 0.08,
    "Industry_NonPriority": 0.20,
    "Services_NonPriority": 0.20,
    "Personal_NonPriority": 0.20,
}
loan_types = np.random.choice(
    list(sector_share),
    size=n,
    p=list(sector_share.values()),
)

# Loan amounts: log-normal draw, divided by 1e5 to express the value in lakhs.
loan_amount = np.random.lognormal(mean=12, sigma=0.6, size=n) / 1e5

# Borrower income in lakhs. A normal draw with loc=8, scale=3 is unbounded below
# and yields a handful of negative incomes, so floor it at a small positive value.
# NOTE(review): the 0.5 lakh floor is an assumption; it sits below the `income < 5`
# risk threshold used later, so it does not change which borrowers are flagged.
MIN_INCOME_LAKH = 0.5
income = np.clip(np.random.normal(loc=8, scale=3, size=n), MIN_INCOME_LAKH, None)

# Credit score: normal around 700, limited to the 300-850 range.
credit_score = np.clip(np.random.normal(loc=700, scale=60, size=n), 300, 850)

# Tenure in months, drawn from a fixed menu with the given probabilities.
tenure = np.random.choice(
    [12, 24, 36, 60, 120, 240],
    size=n,
    p=[0.1, 0.15, 0.25, 0.25, 0.15, 0.1],
)

# Baseline default probability per sector, mapped from SBI's NPA ratios.
sector_npa = {
    "Agriculture": 0.0841,
    "Industry_Priority": 0.0439,
    "Services_Priority": 0.0333,
    "Personal_Priority": 0.0104,
    "Industry_NonPriority": 0.0183,
    "Services_NonPriority": 0.0054,
    "Personal_NonPriority": 0.0069,
}

# Look up each loan's sector baseline (a KeyError here means an unknown sector).
base_rate = np.array([sector_npa[sector] for sector in loan_types])

# Borrower-level adjustments: weak credit scores and low incomes raise the risk.
low_score_bump = np.where(credit_score < 600, 0.05, 0.0)
low_income_bump = np.where(income < 5, 0.02, 0.0)

# Keep the combined probability inside [0, 0.5] before sampling.
default_prob = np.clip(base_rate + low_score_bump + low_income_bump, 0, 0.5)

# One Bernoulli draw per loan: 1 = default, 0 = no default.
default = np.random.binomial(1, default_prob)

# Assemble the per-loan table: amounts and incomes are rounded to 2 decimals,
# credit scores are truncated to integers.
df = pd.DataFrame({
    "loan_type": loan_types,
    "loan_amount_lakh": loan_amount.round(2),
    "borrower_income_lakh": income.round(2),
    "credit_score": credit_score.astype(int),
    "tenure_months": tenure,
    "default": default
})

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails on this cell.
output_path = Path("data/synthetic_sbi_loans.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Quick sanity check: first rows plus the overall simulated default rate.
print(df.head())
print("Simulated Default Rate:", df['default'].mean())

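# NOTE: draft baseline model (logistic regression) -- currently disabled, nothing below runs.
# To use it, uncomment the whole section, including these imports.
# from sklearn.model_selection import train_test_split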
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, roc_auc_score

# # Features and Target
# X = pd.get_dummies(df.drop("default", axis=1), drop_first=True)
# y = df["default"]

# # Split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # Scale
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Model
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)

# # Predict
# y_pred = model.predict(X_test)
# y_prob = model.predict_proba(X_test)[:, 1]

# print("ROC AUC:", roc_auc_score(y_test, y_prob))
# print(classification_report(y_test, y_pred))

              loan_type  loan_amount_lakh  borrower_income_lakh  credit_score  \
0     Personal_Priority              0.67                  6.15           747   
1  Personal_NonPriority              0.83                  6.82           700   
2  Services_NonPriority              2.06                  4.90           739   
3  Industry_NonPriority              0.80                  5.33           639   
4     Industry_Priority              3.17                  9.30           708   

   tenure_months  default  
0             12        0  
1            240        0  
2             36        0  
3             24        0  
4             36        0  
Simulated Default Rate: 0.0304
