# In [0]:
# Logistic regression classifier that flags customers as high or low credit risk

# Import necessary libraries
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, when, lit, avg, count, sum
from pyspark.sql.window import Window

# --- 1. Configuration and Source Data ---
# NOTE: these identifiers must match the database and tables in your Databricks
# workspace. Both 'customers' and 'transactions' are assumed to exist already with
# the schemas described in the comments of this section.
database_name, customer_table_name, transactions_table_name = (
    "banking_database",
    "customers",
    "transactions",
)

# Expected Customer Table Schema:
#   Id                          LongType
#   First Name                  StringType
#   Last Name                   StringType
#   Age                         IntegerType
#   Location                    StringType
#   Annual Income               DoubleType
#   Debt-To-Income Ratio (DTI)  DoubleType
#   Loan-to-Value Ratio (LTV)   DoubleType
#   Average Monthly Spending    DoubleType
#   Credit Score                IntegerType
#
# Load the customer Delta table. Every later step reads from it, so a failure is
# reported and then re-raised to stop execution.
try:
    spark_customer_df = spark.read.format("delta").table(f"{database_name}.{customer_table_name}")
except Exception as load_error:
    print(f"Error loading customer table: {load_error}")
    print("Please ensure the database and table exist and are accessible with the correct schema.")
    raise
else:
    print(f"Customer table '{database_name}.{customer_table_name}' loaded successfully.")

# Read existing Transactions Data from the Delta Table.
# The credit risk steps in this script do not reference the transactions data, so a
# load failure is reported but deliberately does not stop execution.
#
# Expected Transactions Table Schema:
# - Customer ID (LongType)
# - Transaction Date (TimestampType)
# - Amount (DoubleType)
# - Recipient (StringType)
# - Device Type (StringType)
try:
    spark_transactions_df = spark.read.format("delta").table(f"{database_name}.{transactions_table_name}")
    print(f"Transactions table '{database_name}.{transactions_table_name}' loaded successfully.")
except Exception as e:
    print(f"Error loading transactions table: {e}")
    print("Please ensure the database and table exist and are accessible with the correct schema.")
    # Bind the name explicitly so later code can check `spark_transactions_df is None`
    # instead of failing with a NameError on an unbound variable.
    spark_transactions_df = None

# Quick visual sanity check of the customer data that was loaded
print("\n--- Raw Customer Data (first 5 rows) ---")
spark_customer_df.show(5)

# --- 2. Feature Engineering and Label Creation for Credit Risk Scoring ---
# Credit risk scoring here relies on customer-level attributes only.
# The 'label' built below is a placeholder: in a real scenario it would come from
# historical loan default data or similar risk indicators, so these illustrative
# rules should be replaced with actual risk criteria.
#
# A customer is labelled high risk (1.0) when ANY rule matches, otherwise low risk (0.0):
#   - Credit Score below 600
#   - DTI (Debt-To-Income Ratio) above 0.45 (45%)
#   - LTV (Loan-to-Value Ratio) above 0.90 (90%)
#   - Credit Score below 650 combined with DTI above 0.40
credit_score_col = col("Credit Score")
dti_col = col("Debt-To-Income Ratio (DTI)")
ltv_col = col("Loan-to-Value Ratio (LTV)")

high_risk_condition = (
    (credit_score_col < 600)
    | (dti_col > 0.45)
    | (ltv_col > 0.90)
    | ((credit_score_col < 650) & (dti_col > 0.40))
)

# 'label' is the standard column name for the target variable in Spark ML
customer_data_for_ml_df = spark_customer_df.withColumn(
    "label",
    when(high_risk_condition, lit(1.0)).otherwise(lit(0.0)),
)

print("\n--- Customer Data with Dummy 'label' for ML Training ---")
label_preview_columns = [
    "Id",
    "Credit Score",
    "Debt-To-Income Ratio (DTI)",
    "Loan-to-Value Ratio (LTV)",
    "label",
]
customer_data_for_ml_df.select(*label_preview_columns).show(truncate=False)

# Input columns for the credit risk model, grouped by how they are preprocessed
numerical_features = [
    "Age",
    "Annual Income",
    "Debt-To-Income Ratio (DTI)",
    "Loan-to-Value Ratio (LTV)",
    "Average Monthly Spending",
    "Credit Score",
]
categorical_features = ["Location"]

# Each categorical column is string-indexed into '<name>_indexed' and then one-hot
# encoded into '<name>_indexed_encoded'.
indexed_feature_names = [f"{name}_indexed" for name in categorical_features]

indexers = [
    StringIndexer(inputCol=source_name, outputCol=indexed_name, handleInvalid="keep")
    for source_name, indexed_name in zip(categorical_features, indexed_feature_names)
]
encoders = [
    OneHotEncoder(inputCol=indexed_name, outputCol=f"{indexed_name}_encoded")
    for indexed_name in indexed_feature_names
]

# The numeric columns plus the encoded categorical columns are combined into the
# single 'features' vector column that Spark ML estimators consume.
assembler_inputs = [*numerical_features, *(enc.getOutputCol() for enc in encoders)]
vector_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
    handleInvalid="keep",
)

# Stage order matters: index first, then encode, then assemble
feature_pipeline = Pipeline(stages=[*indexers, *encoders, vector_assembler])

# Fit the preprocessing stages on the labelled customer data, then apply them to
# the same DataFrame to add the 'features' column.
feature_pipeline_model = feature_pipeline.fit(customer_data_for_ml_df)
ml_data_credit_risk = feature_pipeline_model.transform(customer_data_for_ml_df)

print("\n--- ML Data for Credit Risk with 'features' Vector Column (first 5 rows) ---")
feature_preview_df = ml_data_credit_risk.select("Id", "Credit Score", "label", "features")
feature_preview_df.show(5, truncate=False)

# --- 3. Train and Evaluate Machine Learning Model ---

# 70% of the rows go to training and 30% to testing; the seed is fixed at 42
split_weights = [0.7, 0.3]
train_data_cr, test_data_cr = ml_data_credit_risk.randomSplit(split_weights, seed=42)
print(f"\nCredit Risk Training data count: {train_data_cr.count()}")
print(f"Credit Risk Test data count: {test_data_cr.count()}")

# Logistic regression estimator with elastic-net regularisation
lr_cr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the training split only
print("\n--- Training Logistic Regression Model for Credit Risk ---")
lr_credit_risk_model = lr_cr.fit(train_data_cr)
print("Credit Risk Model training complete.")

# Score the held-out test split
predictions_cr = lr_credit_risk_model.transform(test_data_cr)

print("\n--- Predictions on Credit Risk Test Data (first 10 rows) ---")
prediction_preview_df = predictions_cr.select(
    "Id", "Credit Score", "label", "prediction", "probability"
)
prediction_preview_df.show(10, truncate=False)

# Evaluate the model on the held-out test predictions.

# Area under the ROC curve, computed from the raw (pre-threshold) scores.
evaluator_cr = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc_cr = evaluator_cr.evaluate(predictions_cr)
print(f"\nCredit Risk Model Area Under ROC (AUC) on test data: {auc_cr}")

# BinaryClassificationEvaluator only accepts 'areaUnderROC' and 'areaUnderPR';
# asking it for 'accuracy' raises an error. Accuracy is therefore computed with
# MulticlassClassificationEvaluator, which compares the hard 'prediction' column
# against 'label'.
evaluator_accuracy_cr = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy_cr = evaluator_accuracy_cr.evaluate(predictions_cr)
print(f"Credit Risk Model Accuracy on test data: {accuracy_cr}")

# --- 4. Apply the Trained Model for Credit Risk Prediction ---
# Score the entire customer dataset (new incoming customer data could be scored the
# same way once it has been run through the feature pipeline).

final_predictions_cr_df = lr_credit_risk_model.transform(ml_data_credit_risk)

# Project the columns of interest under clearer names.
credit_risk_results_df = final_predictions_cr_df.select(
    col("Id").alias("customer_id"),
    col("First Name"),
    col("Last Name"),
    col("Credit Score"),
    col("Debt-To-Income Ratio (DTI)"),
    col("Loan-to-Value Ratio (LTV)"),
    col("label").alias("actual_risk_label"),  # The dummy label used for training
    col("prediction").cast("boolean").alias("is_high_credit_risk_predicted"),  # Convert 0.0/1.0 to boolean
    # 'probability' is an ML vector column, which Spark SQL cannot index directly;
    # convert it to an array first (vector_to_array requires Spark 3.0+), then take
    # element 1, the probability of class 1 (high risk).
    vector_to_array(col("probability"))[1].alias("high_risk_probability"),
)

print("\n--- Final Credit Risk Predictions on All Customer Data ---")
credit_risk_results_df.orderBy(col("high_risk_probability").desc()).show(truncate=False)

print("\n--- Customers Predicted as High Credit Risk ---")
# The boolean column is used directly as the filter predicate
credit_risk_results_df.filter(col("is_high_credit_risk_predicted")) \
    .orderBy(col("high_risk_probability").desc()) \
    .show(truncate=False)

# --- Optional: persist the customers predicted as high risk to their own table ---
credit_risk_output_table_name = "high_credit_risk_customers"

high_risk_customers_df = credit_risk_results_df.filter(
    col("is_high_credit_risk_predicted") == True
)

# Written as a Delta table; "overwrite" mode replaces the data from any earlier run
high_risk_writer = high_risk_customers_df.write.format("delta").mode("overwrite")
high_risk_writer.saveAsTable(f"{database_name}.{credit_risk_output_table_name}")

print(f"\nHigh credit risk customers saved to '{database_name}.{credit_risk_output_table_name}'.")

# Read the table back through SQL to confirm what was stored
print("\n--- Verify Credit Risk Output Table ---")
saved_high_risk_df = spark.sql(f"SELECT * FROM {database_name}.{credit_risk_output_table_name}")
saved_high_risk_df.show(truncate=False)

# --- Clean up temporary views (optional) ---
# This script does not create these views itself; per the original note they belong
# to the fraud model and are dropped in case that model ran in the same session.
# IF EXISTS makes each statement a no-op when the view is absent.
for leftover_view in ("all_customers", "all_transactions"):
    spark.sql(f"DROP VIEW IF EXISTS {leftover_view}")
print("\nTemporary views dropped.")