In [0]:
from delta.tables import *
from pyspark.sql.types import*
from pyspark.sql.functions import* 
from pyspark.sql.window import Window

In [0]:
# Pipeline configuration: one dict per job. Each entry names the two input
# CSV files, how to join and aggregate them, and where/how to save the result.
data = [
    {
        "customers":"/Volumes/rbc/rbcschema/raw/customers.csv",
        "transactions":"/Volumes/rbc/rbcschema/raw/transactions.csv",
        "join_type":"inner",
        "groupBy_key":"customer_id",
        "avg_key":"amount",
        "sum_key":"amount",
        "transformations":["join", "avg", "sum","rank"],
        "save_format":"delta",
        "destination":"/Volumes/rbc/rbcschema/curated/customer_transaction.delta"
    }
]

# Explicit schema so "transformations" is stored as an array of strings
# rather than whatever type inference would pick.
schema = StructType([
    StructField("customers", StringType(), True),
    StructField("transactions", StringType(), True),
    StructField("join_type", StringType(), True),
    StructField("groupBy_key", StringType(), True),
    StructField("avg_key", StringType(), True),
    StructField("sum_key", StringType(), True),
    StructField("transformations",ArrayType(StringType()), True),
    StructField("save_format", StringType(), True),
    StructField("destination", StringType(), True)
])

# Build the DataFrame that is written below. Previously `df` was never
# defined in this notebook, so the cell failed with NameError on a fresh kernel.
df = spark.createDataFrame(data, schema)

# 'header' is a CSV option and has no meaning for JSON, so it is not set here.
# mode('overwrite') keeps the cell re-runnable once the config path exists.
df.write.format('json').mode('overwrite').save("/Volumes/rbc/rbcschema/raw/config_file")

In [0]:
# Initialize Spark session
# spark = SparkSession.builder.appName("CustomerTransactions").getOrCreate()

# Define the CustomerTransactions class
class CustomerTransactions:
    """Stateless helpers for the customer/transaction pipeline.

    Every method is a ``@staticmethod`` that takes DataFrames (or a path) and
    returns a new DataFrame; the class holds no state of its own. The methods
    rely on the notebook-global ``spark`` session and on the names brought in
    by the star-imports from ``pyspark.sql.functions`` at the top of the
    notebook.
    """

    @staticmethod
    def read_data(path):
        """Load a CSV file whose first line is a header.

        Args:
            path: Location of the CSV file.

        Returns:
            The loaded DataFrame, or ``None`` if loading raised. The error is
            printed rather than re-raised so the caller can skip this input.
        """
        try:
            # No inferSchema option is set, so column types are left to
            # Spark's CSV defaults (presumably all strings - verify if
            # numeric typing matters downstream).
            return spark.read.format('csv').option('header', True).load(path)
        except Exception as e:
            print(f"Error occurred while reading file {path}: {e}")
            return None

    @staticmethod
    def joining(customers, transactions, join_type, on="customer_id"):
        """Join the customers and transactions frames.

        Args:
            customers: Left-hand DataFrame.
            transactions: Right-hand DataFrame.
            join_type: Join type passed through as ``how`` (e.g. ``"inner"``).
            on: Column name (or list of names) to join on. Defaults to
                ``"customer_id"``, the key that was previously hard-coded.

        Returns:
            The joined DataFrame.
        """
        return customers.join(transactions, on=on, how=join_type)

    @staticmethod
    def avgTransaction(joined_df, avg_key, groupBy_key):
        """Average ``avg_key`` per ``groupBy_key``.

        Returns:
            A DataFrame with the grouping column(s) and
            ``avg_transaction_amount``.
        """
        grouped = joined_df.groupBy(groupBy_key)
        return grouped.agg(avg(avg_key).alias("avg_transaction_amount"))

    @staticmethod
    def totalTransaction(joined_df, sum_key, groupBy_key):
        """Sum ``sum_key`` per ``groupBy_key``.

        Returns:
            A DataFrame with the grouping column(s) and
            ``total_transaction_amount``.
        """
        grouped = joined_df.groupBy(groupBy_key)
        # `sum` here is pyspark.sql.functions.sum (star-imported at the top of
        # the notebook), which shadows Python's builtin sum.
        return grouped.agg(sum(sum_key).alias("total_transaction_amount"))

    @staticmethod
    def rankingCustomers(total_aggregated_df, order_col="total_transaction_amount"):
        """Add a ``rank`` column ordered by ``order_col`` descending.

        Args:
            total_aggregated_df: DataFrame that contains ``order_col``.
            order_col: Column to rank by. Defaults to
                ``"total_transaction_amount"``, the column that was
                previously hard-coded.

        Returns:
            The input DataFrame with an extra ``rank`` column. Ties share a
            rank because ``rank()`` (not ``dense_rank``/``row_number``) is used.
        """
        # The window has no partitionBy: the ranking is global across all rows.
        window_rank = Window.orderBy(desc(order_col))
        return total_aggregated_df.withColumn("rank", rank().over(window_rank))

# Instantiate the class (all methods are static; the instance is just a handle)
cus_instance = CustomerTransactions()

# Read the config JSON file
config_df = spark.read.format('json').load("/Volumes/rbc/rbcschema/raw/config_file")

# Each config row describes one independent job: read two CSVs, join them,
# optionally aggregate and rank, then save the result.
for row in config_df.collect():
    customers_path = row["customers"]
    transactions_path = row["transactions"]
    join_type = row["join_type"]
    groupBy_key = row["groupBy_key"]
    avg_key = row["avg_key"]
    sum_key = row["sum_key"]
    transformations = row["transformations"]  # Already an array

    # Read customers and transactions data
    customers_df = cus_instance.read_data(customers_path)
    transactions_df = cus_instance.read_data(transactions_path)

    # read_data returns None on failure; compare against None explicitly
    # instead of relying on the truthiness of a DataFrame object.
    if customers_df is None or transactions_df is None:
        print(f"Skipping processing for {customers_path} and {transactions_path} due to errors")
        continue

    # Every later step consumes the joined frame, so without "join" there is
    # nothing to compute or save for this row.
    if "join" not in transformations:
        continue

    joined_df = cus_instance.joining(customers_df, transactions_df, join_type)

    # Compute each requested aggregate from the JOINED frame. Chaining them
    # (sum on top of the avg output) fails because the avg output no longer
    # contains the column that sum needs.
    aggregate_dfs = []
    if "avg" in transformations:
        aggregate_dfs.append(cus_instance.avgTransaction(joined_df, avg_key, groupBy_key))
    if "sum" in transformations:
        aggregate_dfs.append(cus_instance.totalTransaction(joined_df, sum_key, groupBy_key))

    if aggregate_dfs:
        # Both aggregates are grouped by the same key on the same input, so
        # they are merged on that key into one row per group.
        # NOTE(review): a group whose key is null would not match itself in
        # this equi-join - confirm the key column is never null.
        result_df = aggregate_dfs[0]
        for extra_df in aggregate_dfs[1:]:
            result_df = result_df.join(extra_df, on=groupBy_key, how="inner")
    else:
        # No aggregation requested: the joined frame itself is the result.
        result_df = joined_df

    if "rank" in transformations:
        # rankingCustomers orders by total_transaction_amount, which only
        # exists when the "sum" step ran.
        if "total_transaction_amount" in result_df.columns:
            result_df = cus_instance.rankingCustomers(result_df)
        else:
            print(f"Skipping rank for {customers_path}: it requires the 'sum' transformation")

    # Save the final result
    save_path = row["destination"]
    save_format = row["save_format"]
    result_df.write.format(save_format).mode("overwrite").save(save_path)
    print(f"Saved result to {save_path} in {save_format} format")
