In [5]:
import pandas as pd
import numpy as np
import sklearn

In [7]:
# Load the raw loan dataset; the path is relative to the notebook's
# working directory.
LOAN_DATA_CSV = "LoanDataRiskAnalysis.csv"
data = pd.read_csv(LOAN_DATA_CSV)

In [8]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2024-01-01,45,236513,588,Employed,Associate,23,18746,72,Single,...,19709.416667,0.601657,6,77692,0.239746,0.229321,481.441331,0.039344,1,37.6
1,2024-01-02,38,60500,543,Employed,High School,16,21814,60,Single,...,5041.666667,0.578376,7,14968,0.255314,0.288631,690.600027,0.202036,0,56.0
2,2024-01-03,47,30104,527,Self-Employed,High School,28,18811,60,Single,...,2508.666667,0.844337,6,56437,0.260311,0.281039,586.875925,0.319244,0,58.0
3,2024-01-04,58,56353,607,Employed,High School,36,31737,60,Married,...,4696.083333,0.715626,5,12751,0.233237,0.257151,944.874847,0.329823,0,48.0
4,2024-01-05,37,133586,610,Employed,Bachelor,13,13774,12,Married,...,11132.166667,0.774462,3,153627,0.173774,0.17326,1258.385969,0.135678,1,35.2


In [10]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
duplicate_mask = data.duplicated()
count = duplicate_mask.sum()
print(f"The number of duplicate rows is: {count}")

The number of duplicate rows is: 0


In [11]:
# Count missing (null/NaN) values in every column.
# The result is a Series indexed by column name, so it gets its own name
# instead of reusing the scalar `count` from the duplicate-row check, and the
# label describes missing values rather than duplicate rows.
missing_counts = data.isnull().sum()
print(f"Missing values per column:\n{missing_counts}")

The number of duplicate rows is: ApplicationDate               0
Age                           0
AnnualIncome                  0
CreditScore                   0
EmploymentStatus              0
EducationLevel                0
Experience                    0
LoanAmount                    0
LoanDuration                  0
MaritalStatus                 0
NumberOfDependents            0
HomeOwnershipStatus           0
MonthlyDebtPayments           0
CreditCardUtilizationRate     0
NumberOfOpenCreditLines       0
NumberOfCreditInquiries       0
DebtToIncomeRatio             0
BankruptcyHistory             0
LoanPurpose                   0
PreviousLoanDefaults          0
PaymentHistory                0
LengthOfCreditHistory         0
SavingsAccountBalance         0
CheckingAccountBalance        0
TotalAssets                   0
TotalLiabilities              0
MonthlyIncome                 0
UtilityBillsPaymentHistory    0
JobTenure                     0
NetWorth                      0
BaseInt

In [12]:
# Number of distinct values in each column; low counts point to
# categorical or flag-like columns.
unique_counts = data.nunique()
print("\nUnique Values per Column:")
print(unique_counts)


Unique Values per Column:
ApplicationDate               5000
Age                             62
AnnualIncome                  4708
CreditScore                    275
EmploymentStatus                 3
EducationLevel                   5
Experience                      59
LoanAmount                    4657
LoanDuration                    10
MaritalStatus                    4
NumberOfDependents               6
HomeOwnershipStatus              4
MonthlyDebtPayments            977
CreditCardUtilizationRate     5000
NumberOfOpenCreditLines         12
NumberOfCreditInquiries          7
DebtToIncomeRatio             5000
BankruptcyHistory                2
LoanPurpose                      5
PreviousLoanDefaults             2
PaymentHistory                  34
LengthOfCreditHistory           29
SavingsAccountBalance         3747
CheckingAccountBalance        2710
TotalAssets                   4936
TotalLiabilities              4782
MonthlyIncome                 4708
UtilityBillsPaymentHistory  

# I will use these features for risk modelling
These are the most critical features that directly impact financial risk and are almost always predictive:
- DebtToIncomeRatio: Captures the balance between income and debt obligations.
- CreditScore: A direct measure of creditworthiness.
- MonthlyDebtPayments: Indicates the financial strain due to existing debt.
- PaymentHistory: Past payment behavior reflects financial reliability.
- NetWorth: Reflects overall financial health and capacity to manage debt.
- SavingsAccountBalance: Indicates financial reserves for emergencies.
- BankruptcyHistory: A strong predictor of financial instability.
- CreditCardUtilizationRate: High utilization rates suggest financial overextension.
- LoanAmount: Larger loans pose greater risk if disproportionate to income.
- EmploymentStatus: Stability in employment impacts financial stability.

# Keep only the useful features

In [16]:
# Columns retained for risk modelling, in the order they appear in the
# exported frame.
risk_features = [
    'DebtToIncomeRatio',
    'CreditScore',
    'MonthlyDebtPayments',
    'PaymentHistory',
    'NetWorth',
    'SavingsAccountBalance',
    'BankruptcyHistory',
    'CreditCardUtilizationRate',
    'LoanAmount',
    'EmploymentStatus',
]
# RiskScore is kept as the final column next to the selected features
# (presumably the modelling target -- confirm against the modelling notebook).
score_column = 'RiskScore'
risk_data = data[risk_features + [score_column]]

In [17]:
# Write the selected columns to CSV; index=False omits the row index so the
# file contains only the chosen columns.
risk_data.to_csv('model_data.csv', index=False)
print("Data Ready for modeling")

# Preview the exported frame. A notebook only renders the value of a cell's
# last expression, so the preview must come last to be shown.
risk_data.head()

Data Ready for modeling
