In [41]:
import pandas as pd

In [42]:
# Load the raw loan dataset and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index written out by an earlier to_csv call - confirm whether it should
# be read with index_col=0 instead of being kept as a regular column.
raw_data = pd.read_csv(filepath_or_buffer="data.csv")
raw_data.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [43]:
# Map the terse source column names to descriptive snake_case names.
column_renames = {
    'BAD': 'bad_flag',
    'LOAN': 'loan_amt_requested',
    'MORTDUE': 'mort_due',
    'VALUE': 'current_property_value',
    'REASON': 'loan_reason',
    'JOB': 'job',
    'YOJ': 'years_on_job',
    'DEROG': 'no_of_derog',
    'DELINQ': 'no_of_delinq',
    'CLAGE': 'age_of_oldest_cl',
    'NINQ': 'no_of_recent_credit_inquiries',
    'CLNO': 'no_of_cl',
    'DEBTINC': 'debt_to_income_ratio',
}
raw_data = raw_data.rename(columns=column_renames)

# Cast the two categorical columns to plain strings.
# NOTE: astype(str) turns missing values into the literal string "nan", so
# from here on these two columns no longer register in isnull() checks.
for text_col in ('loan_reason', 'job'):
    raw_data[text_col] = raw_data[text_col].astype(str)



In [44]:
# Class balance of bad_flag before any rows are dropped
bad_flag_counts_before = raw_data['bad_flag'].value_counts()
print(bad_flag_counts_before)

bad_flag
0    4771
1    1189
Name: count, dtype: int64


In [45]:
# Keep only rows with at most 5 missing values.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
# NOTE(review): loan_reason and job were cast to str earlier, so their missing
# entries are the string "nan" and are NOT counted as missing here - confirm
# that this is the intended definition of "missing".
missing_per_row = raw_data.isnull().sum(axis=1)
raw_data = raw_data[missing_per_row <= 5].copy()

In [46]:
# Class balance of bad_flag after the sparse rows have been dropped
bad_flag_counts_after = raw_data['bad_flag'].value_counts()
print(bad_flag_counts_after)

bad_flag
0    4656
1    1140
Name: count, dtype: int64


In [47]:
# Bad rate before row removal. The counts are copied by hand from the
# pre-removal value_counts() output, because the unfiltered frame is no longer
# available at this point in the notebook.
bad_before, good_before = 1189, 4771
bad_before / (bad_before + good_before)

0.199496644295302

In [48]:
# Bad rate after row removal, computed from the current frame instead of
# hand-copied counts so it cannot go stale if the data or the drop threshold
# changes. float() keeps the displayed value a plain Python float.
float((raw_data['bad_flag'] == 1).mean())

0.19668737060041408

In [49]:
# For the three event-count columns a missing value is treated as "no events":
# fill NaN with 0 in no_of_derog, no_of_delinq and no_of_recent_credit_inquiries.
for count_col in ('no_of_derog', 'no_of_delinq', 'no_of_recent_credit_inquiries'):
    raw_data[count_col] = raw_data[count_col].fillna(0)


In [50]:
## Impute number of credit lines and age of the oldest credit line
# A missing credit-line count is treated as "no credit lines".
raw_data['no_of_cl'] = raw_data['no_of_cl'].fillna(0)

# Mean age over the rows where it is known (Series.mean skips NaN).
mean_clage = raw_data['age_of_oldest_cl'].mean()

# Vectorised conditional imputation (replaces a row-wise apply):
#   - missing age and no credit lines -> 0
#   - missing age otherwise           -> mean age
#   - known age                       -> left unchanged
age_missing = raw_data['age_of_oldest_cl'].isnull()
no_credit_lines = raw_data['no_of_cl'] == 0
raw_data.loc[age_missing & no_credit_lines, 'age_of_oldest_cl'] = 0
raw_data.loc[age_missing & ~no_credit_lines, 'age_of_oldest_cl'] = mean_clage

In [51]:
## Impute mort_due and current_property_value from one another
# Step 1: median mort_due / current_property_value ratio over the rows where
# both fields are known.
valid_ratio = raw_data[raw_data['mort_due'].notnull() & raw_data['current_property_value'].notnull()]
median_ratio = valid_ratio['mort_due'].div(valid_ratio['current_property_value']).median()

# Classify rows by which of the two fields is missing. The three masks are
# mutually exclusive, so filling one case never moves a row into another.
mort_missing = raw_data['mort_due'].isnull()
value_missing = raw_data['current_property_value'].isnull()
only_value_missing = value_missing & ~mort_missing
only_mort_missing = mort_missing & ~value_missing
both_missing = mort_missing & value_missing

# Case 1 - property value missing but mort_due known: value = mort_due / ratio
raw_data.loc[only_value_missing, 'current_property_value'] = raw_data['mort_due'] / median_ratio

# Case 2 - mort_due missing but property value known: mort_due = value * ratio
raw_data.loc[only_mort_missing, 'mort_due'] = raw_data['current_property_value'] * median_ratio

# Case 3 - both missing: set both fields to 0
raw_data.loc[both_missing, ['mort_due', 'current_property_value']] = 0



In [52]:
# Missing loan_reason / job values are the string "nan" at this point (a result
# of the earlier astype(str)); relabel them as 'NonProvided'.
for text_col in ('loan_reason', 'job'):
    raw_data[text_col] = raw_data[text_col].replace({"nan": "NonProvided"})

In [53]:
## Impute years_on_job

# Coerce years_on_job to numeric; anything unparseable becomes NaN.
raw_data['years_on_job'] = pd.to_numeric(raw_data['years_on_job'], errors='coerce')

# Median years_on_job per job type, computed only over rows with a known job
# (i.e. job is not 'NonProvided').
known_job_rows = raw_data.loc[raw_data['job'].ne('NonProvided')]
job_medians = known_job_rows.groupby('job')['years_on_job'].median()

# Row-level imputation rule for years_on_job
def impute_years_on_job(row):
    """Return the years_on_job value to use for one row.

    Rules, in order:
      1. A non-missing years_on_job is returned unchanged.
      2. If job is 'NonProvided', return 0.
      3. Otherwise return the median for the row's job from the module-level
         ``job_medians`` mapping, or 0 when the job has no entry or its
         median is itself NaN (every value for that job was missing).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'years_on_job' and 'job'.
    """
    years = row['years_on_job']
    if pd.notnull(years):
        return years
    if row['job'] == 'NonProvided':
        return 0
    median_years = job_medians.get(row['job'], 0)
    # A group with only missing values has a NaN median; returning it would
    # leave the gap unfilled, so fall back to the same default as an unknown job.
    return median_years if pd.notnull(median_years) else 0

# Run the imputation rule over every row and store the result back
raw_data['years_on_job'] = raw_data.apply(impute_years_on_job, axis='columns')


In [54]:
## Impute debt_to_income_ratio with medians
#### Rows without a known job fall back to the global median

# Step 1: global median, used as the fallback
median_dti = raw_data['debt_to_income_ratio'].median()

# Step 2: median DTI per job type, computed only over rows with a known job
# (i.e. job is not 'NonProvided')
known_job_rows = raw_data.loc[raw_data['job'].ne('NonProvided')]
dti_medians = known_job_rows.groupby('job')['debt_to_income_ratio'].median()

# Step 3: Row-level imputation rule for debt_to_income_ratio
def impute_dti(row):
    """Return the debt_to_income_ratio value to use for one row.

    Rules, in order:
      1. A non-missing debt_to_income_ratio is returned unchanged.
      2. If job is 'NonProvided', return the module-level global median
         ``median_dti``.
      3. Otherwise return the median for the row's job from the module-level
         ``dti_medians`` mapping, or ``median_dti`` when the job has no entry
         or its median is itself NaN (every value for that job was missing).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'debt_to_income_ratio' and 'job'.
    """
    current_dti = row['debt_to_income_ratio']
    if pd.notnull(current_dti):
        return current_dti
    if row['job'] == 'NonProvided':
        return median_dti
    job_dti = dti_medians.get(row['job'], median_dti)
    # A group with only missing values has a NaN median; returning it would
    # leave the gap unfilled, so fall back to the global median instead.
    return job_dti if pd.notnull(job_dti) else median_dti

# Step 4: Run the imputation rule over every row and store the result back
raw_data['debt_to_income_ratio'] = raw_data.apply(impute_dti, axis='columns')



In [55]:
## Add loan-to-value ratio
# Vectorised replacement for the row-wise apply:
# loan_amt_requested / current_property_value, forced to 0 where the property
# value is 0. pandas yields inf/NaN for those divisions without raising, and
# where() then overwrites them with 0.
property_value = raw_data['current_property_value']
raw_data['loan_to_value_ratio'] = (
    raw_data['loan_amt_requested'] / property_value
).where(property_value != 0, 0)

In [56]:
## Add cl_delinquency_ratio
# Delinquencies per credit line, vectorised: no_of_delinq / no_of_cl, forced to
# 0 where there are no credit lines. The inf/NaN produced by dividing by 0 is
# overwritten by where().
credit_lines = raw_data['no_of_cl']
raw_data['cl_delinquency_ratio'] = (
    raw_data['no_of_delinq'] / credit_lines
).where(credit_lines != 0, 0)

In [57]:
## Add approximate income
# approx_income = mort_due / (debt_to_income_ratio / 100), forced to 0 where
# the ratio is 0. Vectorised replacement for the row-wise apply; the arithmetic
# and its order are unchanged.
# NOTE(review): the /100 presumably treats debt_to_income_ratio as a
# percentage - confirm against the data dictionary.
dti = raw_data['debt_to_income_ratio']
raw_data['approx_income'] = (
    raw_data['mort_due'] / (dti / 100)
).where(dti != 0, 0)

In [58]:
# Persist the cleaned frame to disk, leaving out the DataFrame index
raw_data.to_csv(path_or_buf='cleaned_data.csv', index=False)

# EDA

In [60]:
# Sanity check after imputation: list only the columns that still contain
# missing values (an empty Series means none are left).
null_counts = raw_data.isna().sum().loc[lambda counts: counts > 0]
print(null_counts)


Series([], dtype: int64)
