In [0]:
# Imports
import pandas as pd
from pyspark.sql.functions import col, sum, avg, count, when, lit, hour
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array

In [0]:
# Catalog coordinates of the Delta tables used by this notebook:
# the database, then the customer and transactions table names inside it.
database_name = "banking_database"
customer_table_name, transactions_table_name = "customers", "transactions"

In [0]:
# Load the customer and transactions Delta tables.
# Both reads share the same reporting / re-raise behaviour, so it lives in one helper.


def _load_delta_table(table_name, label):
    """Read `<database_name>.<table_name>` as a Delta table.

    `label` is the lowercase table kind used in the printed messages
    (e.g. "customer"). On any failure the error and a hint are printed and
    the original exception is re-raised so the notebook stops at this cell.
    """
    qualified_name = f"{database_name}.{table_name}"
    try:
        loaded_df = spark.read.format("delta").table(qualified_name)
        print(f"{label.capitalize()} table '{qualified_name}' loaded successfully.")
        return loaded_df
    except Exception as e:
        print(f"Error loading {label} table: {e}")
        print("Please ensure the database and table exist and are accessible with the correct schema.")
        raise


# Expected Customer Table Schema:
# - id (LongType)
# - first_name (StringType)
# - last_name (StringType)
# - age (IntegerType)
# - location (StringType)
# - annual_income (DoubleType)
# - dti (DoubleType)
# - ltv (DoubleType)
# - credit_score (IntegerType)
spark_customer_df = _load_delta_table(customer_table_name, "customer")

# Expected Transactions Table Schema:
# - customer_id (LongType)
# - transaction_date (TimestampType)
# - amount (DoubleType)
# - recipient (StringType)
# - device_type (StringType)
spark_transactions_df = _load_delta_table(transactions_table_name, "transactions")

In [0]:
# Join transactions to their customer and derive per-customer and
# per-transaction (rolling window) features in a single SQL pass.

# Expose both DataFrames to Spark SQL under stable view names.
spark_customer_df.createOrReplaceTempView("all_customers")
spark_transactions_df.createOrReplaceTempView("all_transactions")
print("\nTemporary views 'all_customers' and 'all_transactions' created.")

# Plain (non-f) string: the query has no Python placeholders to interpolate.
# The inner JOIN keeps only transactions whose customer_id matches a customer id.
features_df = spark.sql("""
    SELECT
        t.customer_id,
        t.transaction_date,
        t.amount,
        t.recipient,
        t.device_type,
        c.age,
        c.location,
        c.annual_income,
        c.dti,
        c.ltv,
        c.credit_score,
        -- Global customer aggregates from transactions
        SUM(t.amount) OVER (PARTITION BY t.customer_id) AS total_amount_per_customer,
        AVG(t.amount) OVER (PARTITION BY t.customer_id) AS avg_amount_per_customer,
        MAX(t.amount) OVER (PARTITION BY t.customer_id) AS max_amount_per_customer,
        COUNT(1) OVER (PARTITION BY t.customer_id) AS num_transactions_per_customer,
        -- Rolling window features for fraud detection (transaction-level)
        COUNT(1) OVER (
            PARTITION BY t.customer_id
            ORDER BY t.transaction_date
            RANGE BETWEEN INTERVAL 1 HOUR PRECEDING AND CURRENT ROW
        ) AS transaction_count_last_hour,
        SUM(t.amount) OVER (
            PARTITION BY t.customer_id
            ORDER BY t.transaction_date
            RANGE BETWEEN INTERVAL 1 HOUR PRECEDING AND CURRENT ROW
        ) AS transaction_sum_last_hour,
        COUNT(1) OVER (
            PARTITION BY t.customer_id
            ORDER BY t.transaction_date
            RANGE BETWEEN INTERVAL 24 HOURS PRECEDING AND CURRENT ROW
        ) AS transaction_count_last_day,
        SUM(t.amount) OVER (
            PARTITION BY t.customer_id
            ORDER BY t.transaction_date
            RANGE BETWEEN INTERVAL 24 HOURS PRECEDING AND CURRENT ROW
        ) AS transaction_sum_last_day,
        HOUR(t.transaction_date) AS transaction_hour_of_day
    FROM
        all_transactions t
    JOIN
        all_customers c ON t.customer_id = c.id -- Join on customer_id from transactions and id from customers
    ORDER BY
        t.customer_id, t.transaction_date
""")

print("\n--- Features DataFrame (first 10 rows) ---")
# Pass the row count explicitly: show() defaults to 20 rows, which would
# contradict the "first 10 rows" banner printed above.
features_df.show(10, truncate=False)

In [0]:
# Attach a synthetic fraud label: a transaction is labelled 1.0 as soon as any
# one of the hand-written rules below matches, otherwise 0.0.

# Rule thresholds
DUMMY_FRAUD_AMOUNT_THRESHOLD = 2000.0
DUMMY_FRAUD_UNUSUAL_RECIPIENT = "Unknown"
DUMMY_FRAUD_UNUSUAL_DEVICE = "Unknown"
DUMMY_FRAUD_HIGH_FREQ_HOUR = 4
DUMMY_FRAUD_HIGH_SUM_HOUR = 2500.0
DUMMY_FRAUD_UNUSUAL_HOUR_START = 0
DUMMY_FRAUD_UNUSUAL_HOUR_END = 6

hour_of_day = col("transaction_hour_of_day")

# Rules are evaluated in this order; each one maps to label 1.0.
fraud_rules = [
    col("amount") >= DUMMY_FRAUD_AMOUNT_THRESHOLD,
    col("recipient") == DUMMY_FRAUD_UNUSUAL_RECIPIENT,
    col("device_type") == DUMMY_FRAUD_UNUSUAL_DEVICE,
    col("transaction_count_last_hour") >= DUMMY_FRAUD_HIGH_FREQ_HOUR,
    col("transaction_sum_last_hour") >= DUMMY_FRAUD_HIGH_SUM_HOUR,
    # Mid-sized amount inside the [start, end) hour-of-day window
    (hour_of_day >= DUMMY_FRAUD_UNUSUAL_HOUR_START)
    & (hour_of_day < DUMMY_FRAUD_UNUSUAL_HOUR_END)
    & (col("amount") > 1000),
]

# Fold the rules into a single when(...).when(...)...otherwise(...) expression.
label_expr = when(fraud_rules[0], lit(1.0))
for rule in fraud_rules[1:]:
    label_expr = label_expr.when(rule, lit(1.0))

data_for_ml_df = features_df.withColumn("label", label_expr.otherwise(lit(0.0)))

print("\n--- Data with Dummy 'label' for ML Training (first 10 rows) ---")
preview_columns = ["customer_id", "transaction_date", "amount", "recipient", "device_type", "label"]
data_for_ml_df.select(*preview_columns).show(10, truncate=False)

In [0]:

# Turn the labelled transactions into a single 'features' vector column.

# Numerical and categorical inputs for the ML model
numerical_features = [
    "amount", "age", "annual_income", "dti", "ltv", "credit_score",
    "total_amount_per_customer", "avg_amount_per_customer", "max_amount_per_customer",
    "num_transactions_per_customer", "transaction_count_last_hour", "transaction_sum_last_hour",
    "transaction_count_last_day", "transaction_sum_last_day", "transaction_hour_of_day"
]
categorical_features = ["location", "recipient", "device_type"]

# Cast every numerical feature to double and replace nulls with 0.
# Done as ONE projection plus ONE fillna: calling withColumn/fillna once per
# column in a loop adds a plan node per call and can bloat the query plan.
# Column order and names are preserved.
data_for_ml_df = data_for_ml_df.select(
    *[
        col(name).cast(DoubleType()).alias(name) if name in numerical_features else col(name)
        for name in data_for_ml_df.columns
    ]
).fillna(0, subset=numerical_features)

# StringIndexer per categorical feature; "keep" puts unseen/invalid values
# in an extra index instead of failing.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed", handleInvalid="keep")
    for name in categorical_features
]

# OneHotEncoder for each indexed categorical feature
encoders = [
    OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=indexer.getOutputCol() + "_encoded")
    for indexer in indexers
]

# Assemble numerical columns and encoded categoricals into one vector column
assembler_inputs = numerical_features + [encoder.getOutputCol() for encoder in encoders]
vector_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features", handleInvalid="keep")

# Pipeline order matters: index -> encode -> assemble
feature_pipeline = Pipeline(stages=indexers + encoders + [vector_assembler])

# Fit the feature pipeline on the labelled data and transform that same data
ml_data = feature_pipeline.fit(data_for_ml_df).transform(data_for_ml_df)

print("\n--- ML Data with 'features' Vector Column (first 5 rows) ---")
ml_data.select("customer_id", "amount", "label", "features").show(5, truncate=False)

In [0]:
# Train a logistic regression on a 70/30 split and report AUC and accuracy.

# Split data into training and test sets (fixed seed for repeatability)
train_data, test_data = ml_data.randomSplit([0.7, 0.3], seed=42)
print(f"\nTraining data count: {train_data.count()}")
print(f"Test data count: {test_data.count()}")

# Initialize Logistic Regression model (elasticNetParam=0.0 -> pure L2 penalty)
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.1, elasticNetParam=0.0)

# Train the model
print("\n--- Training Logistic Regression Model ---")
lr_model = lr.fit(train_data)
print("Model training complete.")

# Make predictions on the held-out test data
predictions = lr_model.transform(test_data)

predictions.select("customer_id", "amount", "label", "prediction", "probability").show(10, truncate=False)

# Ranking quality: area under the ROC curve, computed from the raw scores
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"\nArea Under ROC (AUC) on test data: {auc}")

# Accuracy is NOT a BinaryClassificationEvaluator metric (it only supports
# areaUnderROC / areaUnderPR), so it is computed from the hard 'prediction'
# column with MulticlassClassificationEvaluator instead.
evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Accuracy on test data: {accuracy}")

In [0]:
# Score every transaction with the trained model and surface the ones
# predicted as fraudulent.

final_predictions_df = lr_model.transform(ml_data)

# Select relevant columns and rename 'prediction' to 'is_fraudulent_predicted' for clarity.
# 'probability' is an ML Vector column, which Spark SQL cannot index directly;
# convert it to an array first, then take element 1 (probability of label 1.0).
fraud_results_df = final_predictions_df.select(
    col("customer_id"),
    col("transaction_date"),
    col("amount"),
    col("recipient"),
    col("device_type"),
    col("label").alias("actual_fraud_label"),
    col("prediction").cast("boolean").alias("is_fraudulent_predicted"),
    vector_to_array(col("probability"))[1].alias("fraud_probability"),
)

print("\n--- Final Fraud Predictions on All Data (ordered by fraud probability) ---")
fraud_results_df.orderBy(col("fraud_probability").desc()).show(truncate=False)

print("\n--- Transaction Records Most Susceptible to Fraud (Predicted by ML Model) ---")
# Keep only predicted-fraud rows, highest probability first.
# The boolean column is used directly as the filter predicate.
susceptible_transactions_df = (
    fraud_results_df
    .filter(col("is_fraudulent_predicted"))
    .orderBy(col("fraud_probability").desc())
)
susceptible_transactions_df.show(truncate=False)

In [0]:
# Persist the predicted-fraud transactions as a Delta table in the same
# database, overwriting whatever the table previously contained.
ml_fraud_output_table_name = "ml_susceptible_fraud_transactions"
fraud_output_writer = susceptible_transactions_df.write.format("delta").mode("overwrite")
fraud_output_writer.saveAsTable(f"{database_name}.{ml_fraud_output_table_name}")
