In [0]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.functions import avg, col, count, date_trunc, lit, sum, when
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.window import Window


In [0]:
# Names used to build the fully qualified "<database>.<table>" identifiers
# that the rest of the notebook reads from and writes to.
database_name = "banking_database"
customer_table_name, transactions_table_name = "customers", "transactions"

In [0]:
# --- Customer data -----------------------------------------------------------
# Columns the rest of the notebook expects in the customer table:
#   id (LongType), first_name (StringType), last_name (StringType),
#   age (IntegerType), location (StringType), annual_income (DoubleType),
#   dti (DoubleType), ltv (DoubleType), credit_score (IntegerType)
# A failure is reported and then re-raised so the notebook stops here instead
# of continuing without its input.
try:
    spark_customer_df = (
        spark.read
        .format("delta")
        .table(f"{database_name}.{customer_table_name}")
    )
except Exception as e:
    print(f"Error loading customer table: {e}")
    raise
else:
    print(f"Customer table '{database_name}.{customer_table_name}' loaded successfully.")

# --- Transactions data -------------------------------------------------------
# Columns the rest of the notebook expects in the transactions table:
#   customer_id (LongType), transaction_date (TimestampType),
#   amount (DoubleType), recipient (StringType), device_type (StringType)
try:
    spark_transactions_df = (
        spark.read
        .format("delta")
        .table(f"{database_name}.{transactions_table_name}")
    )
except Exception as e:
    print(f"Error loading transactions table: {e}")
    raise
else:
    print(f"Transactions table '{database_name}.{transactions_table_name}' loaded successfully.")

In [0]:
# Prepare Data for K-Means Clustering

# Average monthly spending per customer.
# Step 1: total the transaction amounts inside each calendar month.
# Step 2: average those monthly totals per customer.
# Averaging the raw `amount` column directly would yield the mean size of a
# single transaction, not a monthly spending figure. Months in which a customer
# has no transactions produce no row and are therefore not counted as
# zero-spend months.
monthly_spending_df = (
    spark_transactions_df
    .groupBy(
        "customer_id",
        date_trunc("month", col("transaction_date")).alias("spending_month"),
    )
    .agg(sum("amount").alias("monthly_total"))  # `sum` is pyspark.sql.functions.sum
)

customer_spending_aggregates = monthly_spending_df.groupBy("customer_id").agg(
    avg("monthly_total").alias("Average_Monthly_Spending_Calculated")
)

# Left join so customers without any transactions are kept; their spending
# value is null at this point and is filled with 0.0 below.
customers_with_spending_df = spark_customer_df.join(
    customer_spending_aggregates,
    spark_customer_df.id == customer_spending_aggregates.customer_id,
    "left"
).select(
    spark_customer_df["*"],
    customer_spending_aggregates["Average_Monthly_Spending_Calculated"]
)

# Keep the identifying columns plus the two clustering features, both cast to
# double. Only the spending feature is null-filled; rows with a null
# Annual_Income are left as-is here.
clustering_data_df = customers_with_spending_df.select(
    "id",
    "first_name",
    "last_name",
    col("annual_income").cast(DoubleType()).alias("Annual_Income"),
    col("Average_Monthly_Spending_Calculated").cast(DoubleType()).alias("Average_Monthly_Spending")
).na.fill(0.0, subset=["Average_Monthly_Spending"])


In [0]:
# Combine the two numeric features into one vector column. With
# handleInvalid="skip", rows containing an invalid (e.g. null) feature value
# are dropped rather than raising an error.
clustering_assembler = VectorAssembler(
    inputCols=["Annual_Income", "Average_Monthly_Spending"],
    outputCol="features_clustering",
    handleInvalid="skip",
)

# Scale each feature to unit standard deviation. Mean-centering is switched
# off (withMean=False), so the features are rescaled but not shifted.
scaler = StandardScaler(
    inputCol="features_clustering",
    outputCol="scaled_features_clustering",
    withStd=True,
    withMean=False,
)

In [0]:
# Train K-Means Clustering Model
# Four clusters on the scaled feature vector; the fixed seed makes the
# centroid initialization repeatable between runs.
kmeans = KMeans(featuresCol="scaled_features_clustering", k=4, seed=42)

# Chain the stages so that fit/transform apply them in order:
# assemble features -> scale -> cluster.
clustering_stages = [clustering_assembler, scaler, kmeans]
clustering_pipeline = Pipeline(stages=clustering_stages)

# Fit the whole pipeline on the prepared customer features.
print("\n--- Training K-Means Clustering Model ---")
kmeans_model = clustering_pipeline.fit(clustering_data_df)
print("K-Means Model training complete.")

In [0]:
# Assign Clusters and Interpret Categories

# Run the fitted pipeline; this adds the cluster id in the "prediction" column.
clustered_customers_df = kmeans_model.transform(clustering_data_df)

# Per-cluster mean of each feature, shown so the clusters can be interpreted.
print("\n--- Average Income and Spending per Cluster (for interpretation) ---")
cluster_mean_columns = [
    avg("Annual_Income").alias("Avg_Annual_Income"),
    avg("Average_Monthly_Spending").alias("Avg_Monthly_Spending"),
]
cluster_summary = (
    clustered_customers_df
    .groupBy("prediction")
    .agg(*cluster_mean_columns)
    .orderBy("prediction")
)
cluster_summary.show()

In [0]:
# Compute both overall averages in a single aggregation (one Spark job instead
# of two separate agg/collect round trips over the same DataFrame).
overall_avg_row = clustered_customers_df.agg(
    avg("Annual_Income").alias("overall_avg_income"),
    avg("Average_Monthly_Spending").alias("overall_avg_spending"),
).first()
overall_avg_income = overall_avg_row["overall_avg_income"]
overall_avg_spending = overall_avg_row["overall_avg_spending"]

print(f"\nOverall Average Annual Income: {overall_avg_income:.2f}")
print(f"Overall Average Monthly Spending: {overall_avg_spending:.2f}")

# Per-customer quadrant label: each customer is compared against the overall
# averages, independently of the cluster they were assigned to.
# NOTE: the next cell assigns `customer_segments_df` again, so the labelling
# produced here is replaced before the result is written out.
is_high_income = col("Annual_Income") >= overall_avg_income
is_high_spending = col("Average_Monthly_Spending") >= overall_avg_spending

customer_segments_df = clustered_customers_df.withColumn(
    "customer_category",
    when(is_high_income & is_high_spending, lit("High Income High Spenders"))
    .when(is_high_income & ~is_high_spending, lit("High Income Low Spenders"))
    .when(~is_high_income & is_high_spending, lit("Low Income High Spenders"))
    .otherwise(lit("Low Income Low Spenders"))
)

In [0]:
# Label every cluster from its own statistics instead of hard-coding which
# cluster id means what. K-Means numbers its clusters arbitrarily, so a fixed
# "0 -> High Income High Spenders" mapping is only right by coincidence and can
# silently become wrong whenever the data or the model changes.
#
# A cluster is "High" on a dimension when its mean is at or above the overall
# mean (`overall_avg_income` / `overall_avg_spending` from the previous cell).
# Two clusters may legitimately receive the same label.
cluster_profile_rows = (
    clustered_customers_df
    .groupBy("prediction")
    .agg(
        avg("Annual_Income").alias("cluster_avg_income"),
        avg("Average_Monthly_Spending").alias("cluster_avg_spending"),
    )
    .collect()  # one row per cluster (at most k rows), safe to bring to the driver
)

category_column = None
for cluster_profile in cluster_profile_rows:
    income_level = "High" if cluster_profile["cluster_avg_income"] >= overall_avg_income else "Low"
    spending_level = "High" if cluster_profile["cluster_avg_spending"] >= overall_avg_spending else "Low"
    cluster_label = lit(f"{income_level} Income {spending_level} Spenders")
    is_this_cluster = col("prediction") == cluster_profile["prediction"]
    if category_column is None:
        category_column = when(is_this_cluster, cluster_label)
    else:
        category_column = category_column.when(is_this_cluster, cluster_label)

# Fallback for any unmapped clusters (and for the degenerate case of no rows).
if category_column is None:
    category_column = lit("Uncategorized")
else:
    category_column = category_column.otherwise(lit("Uncategorized"))

customer_segments_df = clustered_customers_df.withColumn(
    "customer_category",
    category_column
)

In [0]:
# Persist the labelled customers as a Delta table in the same database.
# mode("overwrite") replaces the table contents on every run.
customer_segmentation_output_table_name = "customer_income_spending_segments"
(
    customer_segments_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{database_name}.{customer_segmentation_output_table_name}")
)
