In [0]:
from pyspark.sql.functions import col

# Source table for this notebook.
df = spark.read.table("default.gold_country_year")

# Two inclusive year windows over the same table: `recent` feeds the
# growth-rate calculation, `historical` feeds the average and the
# standard-deviation aggregates further down.
year = col("year")
recent = df.where((year >= 2021) & (year <= 2024))
historical = df.where((year >= 2018) & (year <= 2024))


In [0]:
# NOTE: `min`, `max` and `pow` shadow the Python builtins for the rest of the
# notebook; they are kept as-is because other cells may rely on these names.
from pyspark.sql.functions import min, max, pow, struct, when

# Per-country endpoints of the `recent` window.
#
# The start/end values must be the values observed IN the first and last
# year, not the smallest and largest value seen anywhere in the window
# (which would force end_value >= start_value and hide every decline).
# Structs compare field by field, so min/max over struct(year, value)
# selects the row with the earliest/latest year and carries its value along.
# Assumes one row per country and year; with duplicates the tie is broken
# by value -- TODO confirm against the table's grain.
cagr = (
    recent.groupBy("country_code", "country_name", "region")
          .agg(
              min(struct("year", "total_value_usd")).alias("first_obs"),
              max(struct("year", "total_value_usd")).alias("last_obs")
          )
          .select(
              "country_code",
              "country_name",
              "region",
              col("first_obs.year").alias("start_year"),
              col("last_obs.year").alias("end_year"),
              col("first_obs.total_value_usd").alias("start_value"),
              col("last_obs.total_value_usd").alias("end_value")
          )
)

# CAGR = (end_value / start_value) ** (1 / years) - 1.
# The rate is left null when it is undefined: a country with a single year
# in the window has years == 0 (division by zero), and a non-positive
# start value gives a meaningless or NaN ratio.
cagr = cagr.withColumn(
    "years",
    col("end_year") - col("start_year")
).withColumn(
    "cagr_3y",
    when(
        (col("years") > 0) & (col("start_value") > 0),
        pow(col("end_value") / col("start_value"), 1 / col("years")) - 1
    )
)


In [0]:
from pyspark.sql.functions import avg

# Mean of total_value_usd per country over the `historical` window,
# exposed as `avg_export_value`.
size = historical.groupBy("country_code").agg(
    avg(col("total_value_usd")).alias("avg_export_value")
)


In [0]:
from pyspark.sql.functions import stddev

# Spread of total_value_usd per country over the `historical` window,
# computed with Spark's `stddev` aggregate and exposed as `volatility`.
volatility = historical.groupBy("country_code").agg(
    stddev(col("total_value_usd")).alias("volatility")
)


In [0]:
# Combine the three per-country frames on their shared key. Inner joins
# (the default, spelled out here) keep only countries present in all three.
emerging = cagr.join(size, on="country_code", how="inner")
emerging = emerging.join(volatility, on="country_code", how="inner")


In [0]:
from pyspark.sql.functions import when

# The score is the sum of three terms:
#   * half of the growth rate,
#   * a flat 0.3 when the average export value is below 2e11,
#   * a flat 0.2 when the volatility is below 5e10.
# A null comparison does not match `when`, so it falls through to 0.
growth_term = col("cagr_3y") * 0.5
small_market_bonus = when(col("avg_export_value") < 2e11, 0.3).otherwise(0)
low_volatility_bonus = when(col("volatility") < 5e10, 0.2).otherwise(0)

emerging = emerging.withColumn(
    "emerging_score",
    growth_term + small_market_bonus + low_volatility_bonus,
)


In [0]:
from pyspark.sql.functions import desc

# Highest score first.
ranked = emerging.sort(col("emerging_score").desc())


In [0]:
# Persist the published columns as a Delta table, replacing any previous
# contents of default.gold_emerging_markets.
(
    ranked
    .select(
        "country_code", "country_name", "region",
        "cagr_3y", "avg_export_value", "volatility",
        "emerging_score",
    )
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.gold_emerging_markets")
)
