In [0]:
import pandas as pd
from pyspark.sql.functions import col, sum, avg, count, when, lit, hour, current_timestamp
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import DoubleType, IntegerType

In [0]:
# Catalog coordinates used throughout the notebook: the database plus the two
# source tables read from it. The ranked-output table is written to the same database.
database_name, customer_table_name, transactions_table_name = (
    "banking_database",
    "customers",
    "transactions",
)

In [0]:

# Load the customer table from Delta. This table is mandatory: on any failure
# the error is printed and then re-raised so the notebook stops here.
#
# Expected Customer Table Schema:
#   id            LongType
#   first_name    StringType
#   last_name     StringType
#   age           IntegerType
#   location      StringType
#   annual_income DoubleType
#   dti           DoubleType
#   ltv           DoubleType
#   credit_score  IntegerType
try:
    delta_reader = spark.read.format("delta")
    spark_customer_df = delta_reader.table(f"{database_name}.{customer_table_name}")
    print(f"Customer table '{database_name}.{customer_table_name}' loaded successfully.")
except Exception as load_error:
    print(f"Error loading customer table: {load_error}")
    raise

# Read existing Transactions Data from the Delta Table.
#
# Unlike the customer table, this load is best-effort: a failure is reported
# but not re-raised, and the cells that follow branch on
# `spark_transactions_df` to decide whether transaction features can be built.
try:
    # Expected Transactions Table Schema:
    # - customer_id (LongType)
    # - transaction_date (TimestampType)
    # - amount (DoubleType)
    # - recipient (StringType)
    # - device_type (StringType)
    spark_transactions_df = spark.read.format("delta").table(f"{database_name}.{transactions_table_name}")
    print(f"Transactions table '{database_name}.{transactions_table_name}' loaded successfully.")
except Exception as e:
    print(f"Error loading transactions table: {e}")
    # Bind the name explicitly. Without this the variable is never defined on
    # failure, and the later `if spark_transactions_df:` checks raise NameError
    # instead of taking their "missing data" fallback path.
    spark_transactions_df = None


In [0]:
# Feature Engineering and Data Preparation

# Register temporary views so the feature queries can be written in SQL.
# 'all_customers' is always registered; 'all_transactions' only when a
# transactions DataFrame is available.
spark_customer_df.createOrReplaceTempView("all_customers")
if not spark_transactions_df:
    views_message = "\nTemporary view 'all_customers' created. 'all_transactions' not created due to missing data."
else:
    spark_transactions_df.createOrReplaceTempView("all_transactions")
    views_message = "\nTemporary views 'all_customers' and 'all_transactions' created."
print(views_message)


# Aggregate transaction data to customer level (one row per customer_id).
# When no transactions DataFrame is available, an empty DataFrame with the same
# column names is used instead, so the LEFT JOIN that follows still resolves
# every column and simply yields NULL transaction features.
if spark_transactions_df is not None:
    # transaction_date is documented as a timestamp, so it is cast to DATE
    # before COUNT(DISTINCT ...): otherwise num_transaction_days would count
    # distinct timestamps (roughly one per transaction) rather than active days.
    # The "last 30 days" measures are relative to current_date() at run time.
    customer_transaction_aggregates = spark.sql("""
        SELECT
            customer_id,
            COUNT(DISTINCT CAST(transaction_date AS DATE)) AS num_transaction_days,
            COUNT(1) AS total_transactions,
            SUM(amount) AS total_spending,
            AVG(amount) AS avg_transaction_amount,
            MAX(amount) AS max_transaction_amount,
            COUNT(DISTINCT recipient) AS num_unique_recipients,
            COUNT(DISTINCT device_type) AS num_unique_devices,
            SUM(CASE WHEN transaction_date >= date_sub(current_date(), 30) THEN amount ELSE 0 END) AS spending_last_30_days,
            COUNT(CASE WHEN transaction_date >= date_sub(current_date(), 30) THEN 1 ELSE NULL END) AS transactions_last_30_days
        FROM
            all_transactions
        GROUP BY
            customer_id
    """)
    aggregates_message = "\nCustomer transaction aggregates created."
else:
    # Create an empty DataFrame with the same schema if transactions table is missing.
    # These types are only needed on this fallback path.
    from pyspark.sql.types import StructType, StructField, LongType, DoubleType
    customer_transaction_aggregates_schema = StructType([
        StructField("customer_id", LongType(), True),
        StructField("num_transaction_days", LongType(), True),
        StructField("total_transactions", LongType(), True),
        StructField("total_spending", DoubleType(), True),
        StructField("avg_transaction_amount", DoubleType(), True),
        StructField("max_transaction_amount", DoubleType(), True),
        StructField("num_unique_recipients", LongType(), True),
        StructField("num_unique_devices", LongType(), True),
        StructField("spending_last_30_days", DoubleType(), True),
        StructField("transactions_last_30_days", LongType(), True)
    ])
    customer_transaction_aggregates = spark.createDataFrame([], schema=customer_transaction_aggregates_schema)
    aggregates_message = "\nEmpty customer transaction aggregates created due to missing transactions table."

# Both branches expose the result under the same view name for the join query.
customer_transaction_aggregates.createOrReplaceTempView("customer_tx_aggregates")
print(aggregates_message)


# Build the modelling table: one row per customer with the customer attributes
# plus the per-customer transaction aggregates. The LEFT JOIN keeps customers
# that have no aggregate row; their transaction columns come back NULL.
# 'credit_score' is exposed as 'label', the regression target.
customer_features_query = """
    SELECT
        c.id AS customer_id,
        c.age,
        c.location,
        c.annual_income,
        c.dti,
        c.ltv,
        c.credit_score AS label,
        t.num_transaction_days,
        t.total_transactions,
        t.total_spending,
        t.avg_transaction_amount,
        t.max_transaction_amount,
        t.num_unique_recipients,
        t.num_unique_devices,
        t.spending_last_30_days,
        t.transactions_last_30_days
    FROM
        all_customers c
    LEFT JOIN 
        customer_tx_aggregates t ON c.id = t.customer_id
"""
customer_features_df = spark.sql(customer_features_query)

In [0]:
# Prepare Features for Model

# Feature columns, grouped by how they are preprocessed.
# Order matters: it is the order in which the numeric columns enter the
# assembled feature vector.
numerical_features = [
    # customer attributes
    "age",
    "annual_income",
    "dti",
    "ltv",
    # transaction aggregates
    "num_transaction_days",
    "total_transactions",
    "total_spending",
    "avg_transaction_amount",
    "max_transaction_amount",
    "num_unique_recipients",
    "num_unique_devices",
    "spending_last_30_days",
    "transactions_last_30_days",
]

# String-valued columns that are indexed and one-hot encoded.
categorical_features = ["location"]

# Handle potential nulls in the feature columns.
#
# Numerical features are cast to double in ONE projection instead of a
# withColumn call per column: each withColumn adds another projection to the
# query plan, and the PySpark docs recommend a single select for this case.
# Column order and all non-feature columns are left unchanged.
customer_features_df = customer_features_df.select(
    *[
        col(name).cast(DoubleType()).alias(name) if name in numerical_features else col(name)
        for name in customer_features_df.columns
    ]
)

# Missing numerical values (e.g. customers with no transactions after the
# LEFT JOIN) become 0.0; missing categorical values become "Unknown".
customer_features_df = customer_features_df.fillna(0.0, subset=numerical_features)
customer_features_df = customer_features_df.fillna("Unknown", subset=categorical_features)


# Feature-processing pipeline: index -> one-hot encode -> assemble.

# One StringIndexer per categorical column, writing "<column>_indexed".
# handleInvalid="keep" is set on each indexer.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed", handleInvalid="keep")
    for name in categorical_features
]

# One OneHotEncoder per indexer, writing "<column>_indexed_encoded".
encoders = []
for string_indexer in indexers:
    indexed_col = string_indexer.getOutputCol()
    encoders.append(OneHotEncoder(inputCol=indexed_col, outputCol=indexed_col + "_encoded"))

# The assembled vector is the numerical columns followed by the encoded categoricals.
assembler_inputs = list(numerical_features)
assembler_inputs.extend(encoder.getOutputCol() for encoder in encoders)
vector_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
    handleInvalid="keep",
)

# Stage order: all indexers, then all encoders, then the assembler.
feature_pipeline = Pipeline(stages=[*indexers, *encoders, vector_assembler])

# Fit the feature stages on the full customer feature table, then apply them
# to that same table to add the "features" vector column.
fitted_feature_pipeline = feature_pipeline.fit(customer_features_df)
transformed_features_df = fitted_feature_pipeline.transform(customer_features_df)

# Drop rows whose label (credit_score) is null.
ml_data_credit_risk_regression = transformed_features_df.where(col("label").isNotNull())


In [0]:
# Train and Evaluate Machine Learning Regression Model

# 70/30 train/test split with a fixed seed.
split_weights = [0.7, 0.3]
credit_risk_splits = ml_data_credit_risk_regression.randomSplit(split_weights, seed=42)
train_data_cr = credit_risk_splits[0]
test_data_cr = credit_risk_splits[1]

# Each count() triggers a Spark job; report the training size first, then the test size.
train_row_count = train_data_cr.count()
print(f"\nCredit Risk Regression Training data count: {train_row_count}")
test_row_count = test_data_cr.count()
print(f"Credit Risk Regression Test data count: {test_row_count}")

# Linear regression estimator configuration: up to 10 iterations,
# regParam 0.1, and elasticNetParam 0.0.
lr_reg_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "maxIter": 10,
    "regParam": 0.1,
    "elasticNetParam": 0.0,
}
lr_reg = LinearRegression(**lr_reg_params)

# Fit on the training split only.
print("\n--- Training Linear Regression Model for Credit Risk ---")
lr_reg_model = lr_reg.fit(train_data_cr)
print("Credit Risk Regression Model training complete.")

# Score the held-out test split; the model adds a "prediction" column.
predictions_cr_reg = lr_reg_model.transform(test_data_cr)

print("\n--- Predictions on Credit Risk Test Data (first 10 rows) ---")
prediction_preview = predictions_cr_reg.select("customer_id", "label", "prediction")
prediction_preview.show(10, truncate=False)

# Both evaluators compare "prediction" against "label"; only the metric differs.
evaluator_columns = {"predictionCol": "prediction", "labelCol": "label"}

# Root-mean-squared error on the test split.
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
rmse = evaluator_rmse.evaluate(predictions_cr_reg)
print(f"\nCredit Risk Model RMSE on test data: {rmse}")

# Coefficient of determination on the test split.
evaluator_r2 = RegressionEvaluator(metricName="r2", **evaluator_columns)
r2 = evaluator_r2.evaluate(predictions_cr_reg)
print(f"Credit Risk Model R-squared on test data: {r2}")

# Score every labelled customer (training and test rows alike) so that all of
# them can be ranked by predicted credit score.
final_predictions_cr_reg_df = lr_reg_model.transform(ml_data_credit_risk_regression)

# Select relevant columns and sort ascending by prediction.
# A stray bare token after the orderBy call previously made this whole cell a
# SyntaxError; it is restored here as the comment it appears to have been.
ranked_customers_df = final_predictions_cr_reg_df.select(
    col("customer_id"),
    col("age"),
    col("location"),
    col("annual_income"),
    col("dti"),
    col("ltv"),
    col("label").alias("actual_credit_score"), # The actual credit score from the table
    col("prediction").alias("predicted_credit_score") # The predicted credit score
).orderBy(col("predicted_credit_score").asc())  # lowest predicted score (highest risk) first

print("\n--- Customers Ranked from Highest Risk (Lowest Predicted Credit Score) to Lowest Risk ---")
ranked_customers_df.show(truncate=False)

In [0]:
# Persist the ranked customers as a Delta table in the same database,
# replacing any existing contents (mode "overwrite").
# NOTE(review): the ordering applied upstream may not be preserved when the
# table is read back - confirm whether consumers re-sort by predicted_credit_score.
ranked_customers_output_table_name = "customers_ranked_by_credit_risk"
ranked_customers_writer = ranked_customers_df.write.format("delta").mode("overwrite")
ranked_customers_writer.saveAsTable(f"{database_name}.{ranked_customers_output_table_name}")

print(f"\nCustomers ranked by credit risk saved to '{database_name}.{ranked_customers_output_table_name}'.")