# 🧠 Loan Default Prediction using PySpark
This project demonstrates a complete PySpark-based machine learning pipeline for predicting loan default using structured data.

In [None]:
# Install the pyspark package with pip via the notebook's shell escape (`!`).
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Get the active Spark session, or create one named "LoanDefaultPrediction".
session_builder = SparkSession.builder.appName("LoanDefaultPrediction")
spark = session_builder.getOrCreate()

In [None]:
# Upload the CSV from the local machine and load it into a Spark DataFrame.
from google.colab import files

# files.upload() returns a dict keyed by the name each file was saved under.
# Colab may store a re-uploaded file under a de-duplicated name such as
# "loan_data (1).csv", so read the name that was actually uploaded instead
# of assuming a fixed one. If several files are uploaded, the first is used.
# When nothing was uploaded, fall back to "loan_data.csv" so a file already
# present in the working directory is still picked up.
uploaded = files.upload()
csv_path = next(iter(uploaded), "loan_data.csv")

# header=True takes column names from the first row; inferSchema=True lets
# Spark infer the column types from the data.
df = spark.read.csv(csv_path, header=True, inferSchema=True)
df.printSchema()
df.show(5)

In [None]:
# Remove the identifier column, then encode the target column into a numeric
# `label` column.
# NOTE: 'Customer_ID' and 'Loan_Status' must match the column names of the
# dataset being loaded; adjust them if yours differ.
df = df.drop('Customer_ID')
indexer = StringIndexer(inputCol='Loan_Status', outputCol='label')
label_indexer_model = indexer.fit(df)
df = label_indexer_model.transform(df)

In [None]:
# Build the feature-preparation stages: each categorical column is
# string-indexed into a "<name>_index" column, and those indexed columns are
# assembled together with the remaining input columns into one `features`
# vector column.
categorical_cols = ['Gender', 'Education']
indexed_cols = [f"{name}_index" for name in categorical_cols]

indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name)
    for name, indexed_name in zip(categorical_cols, indexed_cols)
]

assembler = VectorAssembler(
    inputCols=['LoanAmount', 'Income', 'Age'] + indexed_cols,
    outputCol='features',
)

In [None]:
# Split the data with 0.8 / 0.2 weights (seeded for repeatability), then fit
# the indexers, assembler and random forest as a single pipeline on the
# training portion and score the held-out portion.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[*indexers, assembler, rf])

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

In [None]:
# Evaluate the model's predictions on the held-out test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# BinaryClassificationEvaluator's default metric is areaUnderROC, not
# accuracy, so that value is reported under its own name.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc_roc = evaluator.evaluate(predictions)

# Accuracy (fraction of rows where `prediction` equals `label`) comes from
# the multiclass evaluator, which supports it as a metric.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = accuracy_evaluator.evaluate(predictions)

print(f"Test Set AUC-ROC: {auc_roc:.2f}")
print(f"Test Set Accuracy: {accuracy:.2f}")

In [None]:
# Save the fitted pipeline model for reuse.
# A plain save() raises if the target path already exists, which breaks
# re-running this cell; overwrite the previously saved copy instead.
model.write().overwrite().save("loan_default_model")