In [0]:
# Load the `customer_transactions` table as a DataFrame and preview its first five rows.
customers = spark.read.table("customer_transactions")
customers.show(n=5)

+-----------+---+------+---------+-------------+--------------+-------------+------------------+----------------+------------------------+-----------------+-------+--------------------+------------------+
|customer_id|age|gender|  country|annual_income|spending_score|num_purchases|avg_purchase_value|membership_years|website_visits_per_month|cart_abandon_rate|churned|       feedback_text|last_purchase_date|
+-----------+---+------+---------+-------------+--------------+-------------+------------------+----------------+------------------------+-----------------+-------+--------------------+------------------+
|          1| 37|  Male|  Germany|        85886|            14|           18|              41.2|               6|                      20|             0.95|      0|Very satisfied wi...|        2025-06-22|
|          2| 40|  Male|    India|        41041|             4|           10|             31.73|               4|                      29|             0.21|      0|Good quality and

In [0]:
# Print each column's name, data type, and nullability for the customers DataFrame.
customers.printSchema()


root
 |-- customer_id: long (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- annual_income: long (nullable = true)
 |-- spending_score: long (nullable = true)
 |-- num_purchases: long (nullable = true)
 |-- avg_purchase_value: double (nullable = true)
 |-- membership_years: long (nullable = true)
 |-- website_visits_per_month: long (nullable = true)
 |-- cart_abandon_rate: double (nullable = true)
 |-- churned: long (nullable = true)
 |-- feedback_text: string (nullable = true)
 |-- last_purchase_date: date (nullable = true)



In [0]:
# Preview a narrow slice of the data: five columns, first ten rows.
(
    customers
    .select("country", "age", "annual_income", "avg_purchase_value", "churned")
    .show(n=10)
)


+---------+---+-------------+------------------+-------+
|  country|age|annual_income|avg_purchase_value|churned|
+---------+---+-------------+------------------+-------+
|  Germany| 37|        85886|              41.2|      0|
|    India| 40|        41041|             31.73|      0|
|Australia| 69|       143869|             65.96|      0|
|       UK| 30|        87261|             51.87|      0|
|       UK| 69|       110678|             59.64|      0|
|Australia| 28|        90330|             35.63|      1|
|    India| 40|        27302|             38.99|      0|
|   Brazil| 21|        39171|             48.84|      0|
|    India| 47|        54766|             35.37|      0|
|    India| 42|        47550|             47.16|      0|
+---------+---+-------------+------------------+-------+
only showing top 10 rows


In [0]:
# Number of rows whose cart_abandon_rate is strictly greater than 0.8.
(
    customers
    .where("cart_abandon_rate > 0.8")
    .count()
)


1922

In [0]:
from pyspark.sql.functions import avg

# Mean of `churned` per country, displayed from highest to lowest.
# NOTE(review): `churned` looks like a 0/1 flag in the previewed rows, which
# would make this mean the fraction of churned customers — confirm.
(
    customers
    .groupBy("country")
    .agg(avg("churned").alias("churn_rate"))
    .sort("churn_rate", ascending=False)
    .show()
)


+------------+-------------------+
|     country|         churn_rate|
+------------+-------------------+
|          UK|0.12833168805528133|
|      France|          0.1203125|
|       Japan|0.11717171717171718|
|South Africa|  0.116751269035533|
|     Germany|0.11210317460317461|
|      Brazil|0.11024390243902439|
|       India| 0.1089238845144357|
|         USA|0.10046044370029301|
|   Australia|  0.097953216374269|
|      Canada| 0.0966183574879227|
+------------+-------------------+



In [0]:
# Mean of `avg_purchase_value` per country, displayed from highest to lowest.
# NOTE(review): this is an unweighted mean of per-row averages; it does not
# weight by num_purchases — confirm that is the intended metric.
(
    customers
    .groupBy("country")
    .agg(avg("avg_purchase_value").alias("avg_spend"))
    .sort("avg_spend", ascending=False)
    .show()
)


+------------+------------------+
|     country|         avg_spend|
+------------+------------------+
|       Japan| 51.62406060606057|
|   Australia| 51.35679824561404|
|      Canada| 51.18106280193238|
|     Germany|50.899444444444505|
|         USA| 50.89614064462121|
|          UK| 50.61978282329713|
|      Brazil|41.286819512195194|
|South Africa|  41.2365989847716|
|       India|41.073700787401556|
|      France| 40.74535937499992|
+------------+------------------+



In [0]:
from pyspark.sql.functions import avg

# Compute the per-country mean of `churned` and save it as the table
# `churn_by_country`, overwriting any existing data under that name.
(
    customers
    .groupBy("country")
    .agg(avg("churned").alias("churn_rate"))
    .write
    .saveAsTable("churn_by_country", mode="overwrite")
)

# Read the saved table back and display it (row order is not sorted here).
spark.read.table("churn_by_country").show()


+------------+-------------------+
|     country|         churn_rate|
+------------+-------------------+
|     Germany|0.11210317460317461|
|       Japan|0.11717171717171718|
|      Canada| 0.0966183574879227|
|South Africa|  0.116751269035533|
|      France|          0.1203125|
|   Australia|  0.097953216374269|
|         USA|0.10046044370029301|
|          UK|0.12833168805528133|
|       India| 0.1089238845144357|
|      Brazil|0.11024390243902439|
+------------+-------------------+

