In [0]:
# Load the `gold.customer_summary` table as the working DataFrame for this notebook.
# NOTE(review): `spark` is not defined in the visible cells — presumably the
# SparkSession injected by the notebook runtime; confirm.
df = spark.read.table("gold.customer_summary")

In [0]:
# Print the column names, data types and nullability of `df` as a tree,
# as a quick sanity check of what the table load returned.
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- total_customers: long (nullable = true)
 |-- avg_income: double (nullable = true)



In [0]:
# Summary statistics (count, mean, stddev, min, max) for the avg_income column only.
df.describe("avg_income").show()

+-------+------------------+
|summary|        avg_income|
+-------+------------------+
|  count|                10|
|   mean| 84300.74305165041|
| stddev|25365.281742966054|
|    min| 53936.98687664042|
|    max|106217.04242424242|
+-------+------------------+



In [0]:
import pyspark.sql.functions as F

# Show the mean avg_income per country, sorted from highest to lowest.
# NOTE(review): this is an unweighted mean of the stored avg_income values; if a
# country can ever have more than one row, confirm that ignoring total_customers
# is intended.
(
    df.groupBy("country")
    .agg(F.mean("avg_income").alias("avg_income"))
    .orderBy(F.desc("avg_income"))
    .show()
)

+------------+------------------+
|     country|        avg_income|
+------------+------------------+
|       Japan|106217.04242424242|
|   Australia|106057.34649122808|
|         USA|103614.36040184177|
|      Canada|103189.84299516908|
|     Germany|102271.41964285714|
|          UK|102161.52912142152|
|      Brazil|55385.631219512194|
|      France|     55375.9109375|
|South Africa| 54797.36040609137|
|       India| 53936.98687664042|
+------------+------------------+



In [0]:
from pyspark.sql import functions as F

# Derive three per-row features on top of the original columns:
#   income_log           - natural log of (avg_income + 1)
#   customers_per_income - total_customers divided by (avg_income + 1)
#   country_length       - character length of the country name
# NOTE(review): the "+ 1" presumably keeps the log and the division defined when
# avg_income is 0 — confirm that is the intent.
income_plus_one = F.col("avg_income") + 1

features_df = (
    df.withColumn("income_log", F.log(income_plus_one))
    .withColumn("customers_per_income", F.col("total_customers") / income_plus_one)
    .withColumn("country_length", F.length("country"))
)

# Display the country alongside the newly derived feature columns.
features_df.select(
    ["country", "income_log", "customers_per_income", "country_length"]
).show()

+------------+------------------+--------------------+--------------+
|     country|        income_log|customers_per_income|country_length|
+------------+------------------+--------------------+--------------+
|     Germany|11.535395312881255|0.009856029646311396|             7|
|       Japan|11.573249264360534|0.004660225218828029|             5|
|      Canada|11.544335397425835|0.008023967785966854|             6|
|South Africa|10.911415552891635|0.007189996143683946|            12|
|      France| 10.92191801578136|0.011557163250263684|             6|
|   Australia| 11.57174466025494|0.006449280255906805|             9|
|         USA|11.548440864244183| 0.02305642706578411|             3|
|          UK|11.534320246874353|0.009915572849572235|             2|
|       India|10.895590274466674| 0.02825466963543679|             5|
|      Brazil|10.922093529887317|0.018506270871352475|             6|
+------------+------------------+--------------------+--------------+



In [0]:
# Correlation coefficient between avg_income and total_customers across the rows
# of `df` (Pearson is the default method). Left as the cell's last expression so
# the value is displayed.
df.corr("avg_income", "total_customers")

0.13217741289874924