In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, substring, sum as spark_sum, when
from pyspark.sql.window import Window

### Initialize Spark session

In [None]:
# Obtain the session for this job: getOrCreate() hands back an already-active
# session when there is one, otherwise it starts a new one under this app name.
spark = (
    SparkSession.builder
    .appName("ECOM_Orders_Transformations")
    .getOrCreate()
)

### Load the data from the Data Lake

In [None]:
# Read the raw order files as CSV. The first line of each file is taken as the
# header, and column types are inferred from the data rather than declared.
ecom_order_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("path_to_ECOM_ORDER_files")
)

## Task 1: Identifying Customers Who Recently Placed Orders
### Assuming 'OrderDate' column exists and is in the format 'yyyy-MM-dd'

In [None]:
# Keep only the orders whose OrderDate is on or after the cutoff.
recent_orders_df = ecom_order_df.where(
    ecom_order_df["OrderDate"] >= '2023-01-01'  # Adjust the date as needed
)

## Task 2: Identify Top-Spending Customers
### Assuming 'TotalAmount' column exists

In [None]:
# Total spend per customer, highest spenders first (one row per CustomerID).
# The Spark aggregate (pyspark.sql.functions.sum, imported as spark_sum) is
# required here: Python's builtin sum() would try to iterate the string
# "TotalAmount" and raise TypeError before any Spark plan is built.
top_spending_customers_df = (
    ecom_order_df
    .groupBy("CustomerID")
    .agg(spark_sum("TotalAmount").alias("TotalSpending"))
    .orderBy(desc("TotalSpending"))
)

## Task 3: Identify Customers with Missing City Information

In [None]:
# Order rows for which the City column is null.
customers_missing_city_df = ecom_order_df.where(ecom_order_df["City"].isNull())

## Task 4: Analyzing Most Frequent Zip Code Prefixes
### Assuming 'ZipCode' column exists and prefix is first 3 digits

In [None]:
# Tag every order row with the first three characters of its ZipCode.
# NOTE(review): the file is loaded with inferSchema=True, so an all-digit
# ZipCode column may be parsed as a number and lose leading zeros before this
# prefix is taken -- confirm the inferred type of ZipCode.
zip_code_prefix_df = ecom_order_df.withColumn(
    "ZipCodePrefix",
    col("ZipCode").substr(1, 3),
)

# Number of order rows per prefix, most frequent prefix first.
rows_per_prefix = zip_code_prefix_df.groupBy("ZipCodePrefix").agg(
    count("*").alias("Count")
)
frequent_zip_code_prefix_df = rows_per_prefix.sort(desc("Count"))

## Task 5: Identifying Customers with Recent Purchases
### Assuming 'PurchaseDate' column exists

In [None]:
# Keep only the rows whose PurchaseDate is on or after the cutoff.
recent_purchases_df = ecom_order_df.where(
    ecom_order_df["PurchaseDate"] >= '2023-01-01'  # Adjust the date as needed
)

### Combine all the results into one dataframe for loading

In [None]:
# Combine the task results into one frame with exactly one row per input order.
#
# Joining the order table to filtered copies of itself on CustomerID (as was
# done before) is many-to-many: it multiplies rows and duplicates every non-key
# column, and the zip-prefix frequency table has no CustomerID column to join
# on at all. Instead, each task result is first reduced to one row per join key
# and then attached with a left join, so the order rows are never multiplied
# and every output column name is unique.

# Per-customer counts of the rows selected by tasks 1, 3 and 5.
recent_order_counts_df = recent_orders_df.groupBy("CustomerID").agg(
    count("*").alias("RecentOrderCount")
)
missing_city_counts_df = customers_missing_city_df.groupBy("CustomerID").agg(
    count("*").alias("MissingCityOrderCount")
)
recent_purchase_counts_df = recent_purchases_df.groupBy("CustomerID").agg(
    count("*").alias("RecentPurchaseCount")
)

# The prefix frequency table is keyed by ZipCodePrefix; give its generic
# "Count" column a descriptive name before it is merged with the others.
zip_prefix_counts_df = frequent_zip_code_prefix_df.withColumnRenamed(
    "Count", "ZipCodePrefixCount"
)

final_df = (
    zip_code_prefix_df  # order rows plus their ZipCodePrefix column
    .join(top_spending_customers_df, "CustomerID", "left")
    .join(recent_order_counts_df, "CustomerID", "left")
    .join(missing_city_counts_df, "CustomerID", "left")
    .join(recent_purchase_counts_df, "CustomerID", "left")
    .join(zip_prefix_counts_df, "ZipCodePrefix", "left")
    # A customer absent from a filtered set has no matching row; report 0
    # instead of null for those counts.
    .fillna(
        0,
        subset=["RecentOrderCount", "MissingCityOrderCount", "RecentPurchaseCount"],
    )
)

### Save the final transformed data back to Data Lake or directly to SQL Database

In [None]:
# Persist the combined result as CSV, with a header row of column names.
csv_writer = final_df.write.option("header", True)
csv_writer.csv("path_to_transformed_data")

### Stop Spark session

In [None]:
# Shut the session down as the last step, after the write above has been issued.
spark.stop()