## Setup

In [0]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Load the training table and carve out a seeded (repeatable) 80/20 split.
df = spark.table('msme_risk_analytics.gold_ml_training_data')
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Input columns that get packed into the single `features` vector column.
feature_cols = [
    'loan_amount',
    'income',
    'Credit_Score',
    'LTV',
    'dtir1',
    'loan_to_income_ratio',
    'risk_score',
]

# One assembler is applied to both splits so they share the same vector layout.
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
train_vec, test_vec = (assembler.transform(split) for split in (train, test))

## Train best model

In [0]:
# Fit a gradient-boosted tree classifier on the assembled training vectors,
# using `Status` as the label; the seed is fixed so reruns are repeatable.
gbt = GBTClassifier(
    labelCol='Status',
    featuresCol='features',
    maxIter=20,
    seed=42,
)
gbt_model = gbt.fit(train_vec)

# Score the held-out split with the fitted model.
predictions = gbt_model.transform(test_vec)

## Confusion matrix

In [0]:
# Count test rows for every (actual Status, predicted label) pair.
# groupBy gives no ordering guarantee, so sort explicitly to keep the printed
# confusion matrix stable from run to run.
(
    predictions
    .groupBy('Status', 'prediction')
    .count()
    .orderBy('Status', 'prediction')
    .show()
)

+------+----------+-----+
|Status|prediction|count|
+------+----------+-----+
|     0|       1.0|   39|
|     0|       0.0| 2133|
|     1|       1.0|  173|
|     1|       0.0|  684|
+------+----------+-----+



## Feature importance

In [0]:
import pandas as pd

# Pair each input column with the importance the fitted GBT model assigned to it.
# This relies on the importance vector following the order of `feature_cols`,
# which is the order the columns were assembled in.
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': gbt_model.featureImportances.toArray()}
)

# Most influential features first.
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance)

                feature  importance
3                   LTV    0.255088
5  loan_to_income_ratio    0.192788
4                 dtir1    0.173469
0           loan_amount    0.145198
1                income    0.102258
2          Credit_Score    0.075848
6            risk_score    0.055350


## Save metrics

In [0]:
# Derive every metric from `predictions` instead of hand-copying numbers, so the
# saved row always matches the model that was actually trained above.

# Confusion-matrix counts keyed by (actual Status, predicted label).
confusion = {
    (int(row['Status']), int(row['prediction'])): row['count']
    for row in predictions.groupBy('Status', 'prediction').count().collect()
}
# Status == 1 is treated as the positive class.
tp = confusion.get((1, 1), 0)  # actual 1, predicted 1
fp = confusion.get((0, 1), 0)  # actual 0, predicted 1
fn = confusion.get((1, 0), 0)  # actual 1, predicted 0
tn = confusion.get((0, 0), 0)  # actual 0, predicted 0
total = tp + fp + fn + tn

# Area under the ROC curve, computed from the model's raw prediction column.
roc_auc = BinaryClassificationEvaluator(
    labelCol='Status',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
).evaluate(predictions)

# precision = TP / (TP + FP), recall = TP / (TP + FN). The previous version used
# the false-negative count in place of TP, which overstated both metrics.
# Zero denominators fall back to 0.0 so the columns stay floating point.
spark.createDataFrame([{
    'model': 'GBT',
    'roc_auc': float(roc_auc),
    'accuracy': (tp + tn) / total if total else 0.0,
    'precision': tp / (tp + fp) if (tp + fp) else 0.0,
    'recall': tp / (tp + fn) if (tp + fn) else 0.0,
    'train_records': train.count(),
    'test_records': test.count()
}]).write.format('delta').mode('overwrite') \
  .saveAsTable('msme_risk_analytics.gold_best_model_metrics')

print("✅ DAY 3 COMPLETE - Model trained & evaluated")

✅ DAY 3 COMPLETE - Model trained & evaluated


In [0]:
# Loan Prediction
# Score a single hand-written loan application with the fitted model.
# This cell reuses `assembler` and `gbt_model` from the cells above, so those
# must have been run in the current session first; otherwise it fails with a
# NameError.
new_loan = spark.createDataFrame([
    {
        'loan_amount': 250000,
        'income': 5000,
        'Credit_Score': 650,
        'LTV': 85,
        'dtir1': 45,
        'loan_to_income_ratio': 50,
        'risk_score': 55,
    }
])

# Same assemble-then-score path that was used for the train/test splits.
new_vec = assembler.transform(new_loan)
prediction = gbt_model.transform(new_vec)
prediction.select('prediction', 'probability').show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8917257583684502>, line 6[0m
[1;32m      1[0m [38;5;66;03m# Loan Prediction[39;00m
[1;32m      2[0m new_loan [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([{
[1;32m      3[0m     [38;5;124m'[39m[38;5;124mloan_amount[39m[38;5;124m'[39m: [38;5;241m250000[39m, [38;5;124m'[39m[38;5;124mincome[39m[38;5;124m'[39m: [38;5;241m5000[39m, [38;5;124m'[39m[38;5;124mCredit_Score[39m[38;5;124m'[39m: [38;5;241m650[39m,
[1;32m      4[0m     [38;5;124m'[39m[38;5;124mLTV[39m[38;5;124m'[39m: [38;5;241m85[39m, [38;5;124m'[39m[38;5;124mdtir1[39m[38;5;124m'[39m: [38;5;241m45[39m, [38;5;124m'[39m[38;5;124mloan_to_income_ratio[39m[38;5;124m'[39m: [38;5;241m50[39m, [38;5;124m'[39m[38;5;124mrisk_score[39m[38;5;124m'[39m: [38;5;241m55[39m
[1;32m