## Setup

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## 1 Read bronze_loan_default

In [0]:
# Pull the bronze loan-default table into a DataFrame used by the later cells.
loan_default = spark.table("msme_risk_analytics.bronze_loan_default")

# Show the row count and column types of the bronze data before any cleaning.
raw_record_count = loan_default.count()
print(f"Raw records: {raw_record_count}")
loan_default.printSchema()

Raw records: 148670
root
 |-- ID: long (nullable = true)
 |-- year: long (nullable = true)
 |-- loan_limit: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- approv_in_adv: string (nullable = true)
 |-- loan_type: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- Credit_Worthiness: string (nullable = true)
 |-- open_credit: string (nullable = true)
 |-- business_or_commercial: string (nullable = true)
 |-- loan_amount: long (nullable = true)
 |-- rate_of_interest: double (nullable = true)
 |-- Interest_rate_spread: double (nullable = true)
 |-- Upfront_charges: double (nullable = true)
 |-- term: double (nullable = true)
 |-- Neg_ammortization: string (nullable = true)
 |-- interest_only: string (nullable = true)
 |-- lump_sum_payment: string (nullable = true)
 |-- property_value: double (nullable = true)
 |-- construction_type: string (nullable = true)
 |-- occupancy_type: string (nullable = true)
 |-- Secured_by: string (nullable = true)
 |--

## Check nulls & data quality

In [0]:
# One aggregate per column: when() without otherwise() yields a value only
# for null cells, and count() skips nulls, so each result is the number of
# null entries in that column.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in loan_default.columns
]

# Vertical layout prints one "column | count" line per field.
loan_default.select(null_count_exprs).show(vertical=True)

-RECORD 0--------------------------
 ID                        | 0     
 year                      | 0     
 loan_limit                | 3344  
 Gender                    | 0     
 approv_in_adv             | 908   
 loan_type                 | 0     
 loan_purpose              | 134   
 Credit_Worthiness         | 0     
 open_credit               | 0     
 business_or_commercial    | 0     
 loan_amount               | 0     
 rate_of_interest          | 36439 
 Interest_rate_spread      | 36639 
 Upfront_charges           | 39642 
 term                      | 41    
 Neg_ammortization         | 121   
 interest_only             | 0     
 lump_sum_payment          | 0     
 property_value            | 15098 
 construction_type         | 0     
 occupancy_type            | 0     
 Secured_by                | 0     
 total_units               | 0     
 income                    | 9150  
 credit_type               | 0     
 Credit_Score              | 0     
 co-applicant_credit_type  |

## Clean loan_default

In [0]:
# Build the silver loan-default frame in three steps:
#   1. drop rows with a null loan_amount, income or Credit_Score,
#   2. cast the listed numeric columns to double,
#   3. keep rows with strictly positive loan_amount and income, then stamp
#      each row with the load time.
required_fields = ['loan_amount', 'income', 'Credit_Score']
double_columns = ('loan_amount', 'income', 'rate_of_interest', 'LTV')

loan_default_typed = loan_default.dropna(subset=required_fields)
for column_name in double_columns:
    loan_default_typed = loan_default_typed.withColumn(
        column_name, col(column_name).cast('double')
    )

has_positive_amount = col('loan_amount') > 0
has_positive_income = col('income') > 0

silver_loan_default = (
    loan_default_typed
    .filter(has_positive_amount & has_positive_income)
    .withColumn('load_timestamp', current_timestamp())
)

## Filter for business loans (MSME focus)

In [0]:
# Keep only rows whose business_or_commercial value is 'b/c' and mark every
# surviving row with a constant is_msme_loan = True flag.
is_business_loan = col('business_or_commercial') == 'b/c'

msme_loans = (
    silver_loan_default
    .filter(is_business_loan)
    .withColumn('is_msme_loan', lit(True))
)

## Write to Silver

In [0]:
# Persist the MSME subset as a Delta table, replacing any existing data and
# allowing the stored schema to be replaced as well.
silver_loan_default_writer = (
    msme_loans.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
)
silver_loan_default_writer.saveAsTable('msme_risk_analytics.silver_loan_default')

# Report how many rows the msme_loans DataFrame holds.
silver_record_count = msme_loans.count()
print(f"✓ Silver records: {silver_record_count}")

✓ Silver records: 16552


## 2 Read Bronze Loan Prediction

In [0]:
# Load the bronze loan-prediction table.
loan_pred = spark.table("msme_risk_analytics.bronze_loan_prediction")

# Row-level validity rules; between() is inclusive on both bounds.
required_fields = ['age', 'income', 'credit_score', 'loan_approved']
age_in_range = col('age').between(18, 100)
income_is_positive = col('income') > 0
credit_score_in_range = col('credit_score').between(300, 900)

# Drop incomplete rows, apply the validity rules, then add derived columns:
# the debt-to-income ratio scaled by 100, boolean casts of the two flag
# columns, and a load timestamp.
silver_loan_pred = (
    loan_pred
    .dropna(subset=required_fields)
    .filter(age_in_range)
    .filter(income_is_positive)
    .filter(credit_score_in_range)
    .withColumn('debt_to_income_pct', col('debt_to_income_ratio') * 100)
    .withColumn('has_criminal_record', col('criminal_record').cast('boolean'))
    .withColumn('has_existing_loan', col('existing_loan').cast('boolean'))
    .withColumn('load_timestamp', current_timestamp())
)

# Replace the contents of the silver Delta table with the cleaned rows.
(
    silver_loan_pred.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('msme_risk_analytics.silver_loan_prediction')
)

loan_pred_record_count = silver_loan_pred.count()
print(f"✓ Loan Prediction: {loan_pred_record_count} records")

✓ Loan Prediction: 12367 records


## 3 Read Bronze Loan Prediction

In [0]:
# NOTE(review): this cell reads the same bronze table, applies the same
# cleaning, and overwrites the same silver table as the "## 2" section
# earlier in the notebook. Confirm whether a different source was intended
# here or whether this cell is a leftover copy.
loan_pred = spark.table("msme_risk_analytics.bronze_loan_prediction")

# A row is kept only if age is within [18, 100], income is positive and
# credit_score is within [300, 900].
is_valid_row = (
    (col('age') >= 18) & (col('age') <= 100)
    & (col('income') > 0)
    & (col('credit_score') >= 300) & (col('credit_score') <= 900)
)

# Rows with nulls in the key fields are dropped before validation; derived
# columns are added afterwards.
loan_pred_complete = loan_pred.dropna(
    subset=['age', 'income', 'credit_score', 'loan_approved']
)
silver_loan_pred = (
    loan_pred_complete
    .filter(is_valid_row)
    .withColumn('debt_to_income_pct', col('debt_to_income_ratio') * 100)
    .withColumn('has_criminal_record', col('criminal_record').cast('boolean'))
    .withColumn('has_existing_loan', col('existing_loan').cast('boolean'))
    .withColumn('load_timestamp', current_timestamp())
)

# Overwrite the silver Delta table with the cleaned rows.
loan_pred_writer = silver_loan_pred.write.format('delta').mode('overwrite')
loan_pred_writer.saveAsTable('msme_risk_analytics.silver_loan_prediction')

cleaned_row_count = silver_loan_pred.count()
print(f"✓ Loan Prediction: {cleaned_row_count} records")

✓ Loan Prediction: 12367 records


## 4 Bronze Credit Risk

In [0]:
# Load the bronze credit-risk table.
credit_risk = spark.table("msme_risk_analytics.bronze_credit_risk")

# Validity rules applied after rows with null key fields are removed.
required_fields = ['person_age', 'person_income', 'loan_amnt', 'loan_status']
age_in_range = (col('person_age') >= 18) & (col('person_age') <= 100)
amounts_are_positive = (col('person_income') > 0) & (col('loan_amnt') > 0)

# True only when cb_person_default_on_file equals 'Y'; every other value,
# including null, falls through to False.
default_history_flag = when(
    col('cb_person_default_on_file') == 'Y', True
).otherwise(False)

silver_credit = (
    credit_risk
    .dropna(subset=required_fields)
    .filter(age_in_range)
    .filter(amounts_are_positive)
    # loan_to_income is a copy of loan_percent_income under a new name.
    .withColumn('loan_to_income', col('loan_percent_income'))
    .withColumn('has_default_history', default_history_flag)
    .withColumn('load_timestamp', current_timestamp())
)

# Replace the contents of the silver Delta table with the cleaned rows.
(
    silver_credit.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('msme_risk_analytics.silver_credit_risk')
)

credit_record_count = silver_credit.count()
print(f"✓ Credit Risk: {credit_record_count} records")

✓ Credit Risk: 32576 records
