# **Synthetic Lending Dataset for Loan Risk & Default Prediction**

 **Goal: To design and generate a realistic synthetic dataset for loan applications that simulates borrower demographics, financials, loan details, and repayment history. This dataset will later be used to build risk scoring models and perform business insights analysis.**

In [2]:
!pip install faker
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# Number of records
n = 5000

# Generate synthetic data
data = {
    "Customer_ID": [f"CUST{1000+i}" for i in range(n)],
    "Age": np.random.randint(21, 65, n),
    "Gender": np.random.choice(["Male", "Female"], n),
    "Income": np.random.randint(20000, 200000, n),
    "Employment_Type": np.random.choice(["Salaried", "Self-Employed", "Student", "Retired"], n, p=[0.5,0.3,0.15,0.05]),
    "Loan_ID": [f"LOAN{5000+i}" for i in range(n)],
    "Loan_Type": np.random.choice(["Personal", "Auto", "Home", "Education", "Business"], n),
    "Loan_Amount": np.random.randint(50000, 2000000, n),
    "Interest_Rate": np.round(np.random.uniform(6.5, 18.5, n), 2),
    "Tenure_Months": np.random.choice([12, 24, 36, 60, 120, 180], n),
    "Application_Date": [fake.date_between(start_date='-3y', end_date='today') for _ in range(n)],
    "Approval_Status": np.random.choice(["Approved", "Rejected"], n, p=[0.8,0.2])
}

df = pd.DataFrame(data)

# Add EMI calculations for approved loans
df["EMI_Amount"] = np.where(
    df["Approval_Status"]=="Approved",
    np.round(df["Loan_Amount"] * (1 + (df["Interest_Rate"]/100)) / df["Tenure_Months"], 2),
    0
)

# Disbursal date (only for approved)
df["Disbursal_Date"] = np.where(
    df["Approval_Status"]=="Approved",
    [fake.date_between(start_date=date, end_date='today') for date in df["Application_Date"]],
    pd.NaT
)

# Random repayment behavior
df["EMIs_Paid"] = np.where(df["Approval_Status"]=="Approved",
                           np.random.randint(0, df["Tenure_Months"]+1, n),
                           0)

# Default flag
df["Default_Status"] = np.where((df["Approval_Status"]=="Approved") & (df["EMIs_Paid"] < df["Tenure_Months"]*0.75),
                                "Yes", "No")

# Outstanding balance
df["Outstanding_Balance"] = np.where(
    df["Approval_Status"]=="Approved",
    np.round((df["Tenure_Months"] - df["EMIs_Paid"]) * df["EMI_Amount"], 2),
    0
)

df.head()


Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


Unnamed: 0,Customer_ID,Age,Gender,Income,Employment_Type,Loan_ID,Loan_Type,Loan_Amount,Interest_Rate,Tenure_Months,Application_Date,Approval_Status,EMI_Amount,Disbursal_Date,EMIs_Paid,Default_Status,Outstanding_Balance
0,CUST1000,59,Female,62399,Salaried,LOAN5000,Auto,885209,15.11,12,2024-01-24,Approved,84913.67,2025-02-23,11,No,84913.67
1,CUST1001,49,Male,64013,Self-Employed,LOAN5001,Home,1314330,17.24,24,2022-11-29,Approved,64205.02,2025-08-02,16,Yes,513640.16
2,CUST1002,35,Female,165778,Salaried,LOAN5002,Education,202973,17.74,12,2024-10-24,Approved,19915.03,2024-10-24,9,No,59745.09
3,CUST1003,63,Female,189303,Self-Employed,LOAN5003,Education,1092201,13.92,36,2025-06-26,Approved,34562.09,2025-07-15,29,No,241934.63
4,CUST1004,28,Female,47909,Self-Employed,LOAN5004,Education,1934974,7.13,180,2025-02-23,Approved,11516.32,2025-07-06,171,No,103646.88


In [3]:
df.to_csv('Loan_data.csv', index=False)