In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, avg, count, when, rank
from pyspark.sql.window import Window

# Iniciar la sesión de Spark
spark = SparkSession.builder \
    .appName("Proyecto de felicidad mundial - Big Data Processing - Bootcamp Big Data, IA y ML") \
    .getOrCreate()

# Cargar los datos
file_location = "/FileStore/tables/world_happiness_report_2021-1.csv"
file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

df_2021 = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df_2021.limit(5))

file_location = "/FileStore/tables/world_happiness_report.csv"

df_historical = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df_historical.limit(5))


Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,-0.098,0.186,2.43,1.446,1.106,0.741,0.691,0.124,0.481,3.253
Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,0.03,0.179,2.43,1.502,1.108,0.763,0.686,0.208,0.485,2.868
Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,0.025,0.292,2.43,1.566,1.079,0.816,0.653,0.204,0.413,2.839
Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,0.16,0.673,2.43,1.482,1.172,0.772,0.698,0.293,0.17,2.967
Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,0.175,0.338,2.43,1.501,1.079,0.753,0.647,0.302,0.384,2.798


Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882,0.518,0.258
Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85,0.584,0.237
Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275
Afghanistan,2011,3.832,7.62,0.521,51.92,0.496,0.162,0.731,0.611,0.267
Afghanistan,2012,3.783,7.705,0.521,52.24,0.531,0.236,0.776,0.71,0.268


In [0]:
# EJERCICIO 1: Buscar el país más feliz del 2021

happiest_country_2021 = df_2021.select("Country name", "Ladder score")\
    .orderBy(col("Ladder score").desc())\
    .first()

print(f'El país más feliz del 2021 fue: {happiest_country_2021["Country name"]} con un puntaje de felicidad de {happiest_country_2021["Ladder score"]}')


El país más feliz del 2021 fue: Finland con un puntaje de felicidad de 7.842


In [0]:
# EJERCICIO 2: Agrupar por continente (Regional indicator) y encontrar el país más feliz en cada uno

happiest_country_by_continent_2021 = df_2021.alias("r2021").groupBy("r2021.Regional indicator")\
    .agg(max("r2021.Ladder score").alias("Max Ladder Score"))\
    .join(df_2021.alias("r2021_join"), (col("r2021_join.Ladder score") == col("Max Ladder Score")) & \
          (col("r2021_join.Regional indicator") == col("r2021.Regional indicator")))\
    .select(col("r2021_join.Regional indicator"), "r2021_join.Country name", "r2021_join.Ladder score")


happiest_country_by_continent_2021.show()


+--------------------+--------------------+------------+
|  Regional indicator|        Country name|Ladder score|
+--------------------+--------------------+------------+
|      Western Europe|             Finland|       7.842|
|North America and...|         New Zealand|       7.277|
|Middle East and N...|              Israel|       7.157|
|Latin America and...|          Costa Rica|       7.069|
|Central and Easte...|      Czech Republic|       6.965|
|           East Asia|Taiwan Province o...|       6.584|
|      Southeast Asia|           Singapore|       6.377|
|Commonwealth of I...|          Uzbekistan|       6.179|
|  Sub-Saharan Africa|           Mauritius|       6.049|
|          South Asia|               Nepal|       5.269|
+--------------------+--------------------+------------+



In [0]:
# EJERCICIO 3: Encontrar el país que más veces ocupó el primer lugar

# Definimos un WindowSpec para particionar los datos por año y ordenarlos por la puntuación de Life Ladder en orden descendente
windowSpec = Window.partitionBy("year").orderBy(col("Life Ladder").desc())

# Usamos la función rank() para asignar un rango basado en la puntuación de Life Ladder dentro de cada año
ranked = df_historical.withColumn("rank", rank().over(windowSpec))

# Filtramos para mantener solo las filas donde rank es 1 (los más felices cada año)
top_countries_each_year = ranked.filter(col("rank") == 1)

most_happy_counts = top_countries_each_year.groupBy("Country name").count().orderBy(col("count").desc())
most_happy_counts.show()

top_country_record = most_happy_counts.first()  

# Extraemos el nombre del país y el conteo del registro
top_country_name = top_country_record["Country name"]
times_top = top_country_record["count"]

print(f'El país que más veces ha ocupado el primer lugar es: {top_country_name}, con {times_top} veces.')



+------------+-----+
|Country name|count|
+------------+-----+
|     Denmark|    7|
|     Finland|    6|
|      Norway|    1|
| Switzerland|    1|
|      Canada|    1|
+------------+-----+

El país que más veces ha ocupado el primer lugar es: Denmark, con 7 veces.


In [0]:
# EJERCICIO 4: Encontrar el país con el mayor GDP en 2020

country_with_highest_gdp_2020 = df_historical.filter(df_historical["year"] == 2020)\
    .orderBy(col("Log GDP per capita").desc())\
    .first()

happiness_rank_2021 = df_2021.filter(df_2021["Country name"] == country_with_highest_gdp_2020["Country name"])\
    .select("Country name", "Ladder score")\
    .first()

print(f'El país con el mayor GDP en 2020 fue: {country_with_highest_gdp_2020["Country name"]}, y su puesto de felicidad en 2021 fue el: {round(float(happiness_rank_2021["Ladder score"]))}')


El país con el mayor GDP en 2020 fue: Ireland, y su puesto de felicidad en 2021 fue el: 7


In [0]:
# EJERCICIO 5: Calcular el GDP promedio mundial para 2020 y 2021

avg_gdp_2020 = df_historical.filter(df_historical["year"] == 2020)\
    .agg(avg("Log GDP per capita").alias("AvgGDP 2020")).first()

avg_gdp_2021 = df_2021.agg(avg("Logged GDP per capita").alias("Avg GDP 2021")).first()

percentage_change = ((avg_gdp_2021["Avg GDP 2021"] - avg_gdp_2020["AvgGDP 2020"]) / avg_gdp_2020["AvgGDP 2020"]) * 100

change_description = "un aumento" if percentage_change > 0 else "una disminución"
percentage_change_gdp_rounded = round(percentage_change, 2)

print(f'El GDP promedio mundial ha experimentado {change_description} de {percentage_change_gdp_rounded}% de 2020 a 2021.')


El GDP promedio mundial ha experimentado una disminución de -3.27% de 2020 a 2021.


In [0]:
# EJERCICIO 6: Identificar el país con la mayor expectativa de vida en 2021, su valor en 2019, y el promedio en los últimso 5 años

country_with_highest_life_expectancy_2021 = df_2021.orderBy(col("Healthy life expectancy").desc()).first()

# Buscamos la expectativa de vida de ese país en 2019
life_expectancy_2019 = df_historical.filter((df_historical["Country name"] == country_with_highest_life_expectancy_2021["Country name"]) & (df_historical["year"] == 2019))\
    .select("Country name", "Healthy life expectancy at birth").first()

print(f'El país con la mayor expectativa de vida es: {country_with_highest_life_expectancy_2021["Country name"]} con {round(float(country_with_highest_life_expectancy_2021["Healthy life expectancy"]),1)} años. En 2019, tenía una expectativa de vida de {round(float(life_expectancy_2019["Healthy life expectancy at birth"]),1)} años.')

# Filtramos los datos para incluir los últimos 5 años
max_year = df_historical.agg(max(col("year")).cast("int")).first()[0]
filtered_data = df_historical.filter(col("year") > max_year - 5)

# Calculamos el promedio de la expectativa de vida saludable ('Healthy life expectancy at birth') para cada país
average_expectancy = filtered_data.groupBy("Country name").agg(avg("Healthy life expectancy at birth").alias("Average Life Expectancy"))
highest_expectancy_country = average_expectancy.orderBy(col("Average Life Expectancy").desc()).first()

average_expectancy.orderBy(col("Average Life Expectancy").desc()).limit(1).show()

print(f'El país con la mayor expectativa de vida en promedio durante los últimos 5 años es: {highest_expectancy_country["Country name"]} con un promedio de {highest_expectancy_country["Average Life Expectancy"]:.1f} años.')



El país con la mayor expectativa de vida es: Singapore con 77.0 años. En 2019, tenía una expectativa de vida de 77.1 años.
+------------+-----------------------+
|Country name|Average Life Expectancy|
+------------+-----------------------+
|   Singapore|                  76.65|
+------------+-----------------------+

El país con la mayor expectativa de vida en promedio durante los últimos 5 años es: Singapore con un promedio de 76.7 años.
