In [1]:
# Imports

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

In [2]:
# Creamos la sesion de spark

spark = SparkSession.builder\
        .appName("miSesion")\
        .config("spark.sql.shuffle.partitions", "5")\
        .getOrCreate()

In [14]:
# Leemos el dataframe de 2021

df_2021 = spark.read.csv("datasets\world-happiness-report-2021.csv", header=True, inferSchema=True)
df_2021.show(5)
df_2021.printSchema()

+------------+------------------+------------+------------------------------+------------+------------+---------------------+--------------+-----------------------+----------------------------+----------+-------------------------+------------------------+--------------------------------+----------------------------+-------------------------------------+------------------------------------------+------------------------+---------------------------------------+-------------------+
|Country name|Regional indicator|Ladder score|Standard error of ladder score|upperwhisker|lowerwhisker|Logged GDP per capita|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|Ladder score in Dystopia|Explained by: Log GDP per capita|Explained by: Social support|Explained by: Healthy life expectancy|Explained by: Freedom to make life choices|Explained by: Generosity|Explained by: Perceptions of corruption|Dystopia + residual|
+------------+------------------

In [3]:
# Leemos el dataframe historico

df_historic = spark.read.csv("datasets\world-happiness-report.csv", header=True, inferSchema=True)
df_historic.show(5)
df_historic.printSchema()

+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
|Country name|year|Life Ladder|Log GDP per capita|Social support|Healthy life expectancy at birth|Freedom to make life choices|Generosity|Perceptions of corruption|Positive affect|Negative affect|
+------------+----+-----------+------------------+--------------+--------------------------------+----------------------------+----------+-------------------------+---------------+---------------+
| Afghanistan|2008|      3.724|              7.37|         0.451|                            50.8|                       0.718|     0.168|                    0.882|          0.518|          0.258|
| Afghanistan|2009|      4.402|              7.54|         0.552|                            51.2|                       0.679|      0.19|                     0.85|          0.584|          0.237|
| Afghanistan|2

## ¿Cual es el la puntuacion com mayor grado de felicidad por region?

In [8]:
df_grouped = df_2021.groupBy(F.col("Regional indicator"))\
                .agg(F.max(F.col("Ladder score")).alias("Max Ladder score"))\
                .orderBy(F.col("Max Ladder score"), ascending=False)

df_grouped.show()

+--------------------+----------------+
|  Regional indicator|Max Ladder score|
+--------------------+----------------+
|      Western Europe|           7.842|
|North America and...|           7.277|
|Middle East and N...|           7.157|
|Latin America and...|           7.069|
|Central and Easte...|           6.965|
|           East Asia|           6.584|
|      Southeast Asia|           6.377|
|Commonwealth of I...|           6.179|
|  Sub-Saharan Africa|           6.049|
|          South Asia|           5.269|
+--------------------+----------------+



## ¿Cual es el pais con mayor indice de felicidad?

In [13]:
df_max = df_2021.orderBy(F.col("Ladder score"), ascending=False)
df_max = df_max.select(F.col("Country name"), F.col("Regional indicator"), F.col("Ladder score")).first()
print(df_max)

Row(Country name='Finland', Regional indicator='Western Europe', Ladder score=7.842)


## ¿Cuál es el país que más veces ocupó el primer lugar en todos los años?

In [34]:
# Creamos la ventana para agrupar por años
window = Window.partitionBy(F.col("year")).orderBy(F.col("Life Ladder").desc())

# Creamos el ranking
df_ranked = df_historic.withColumn("rank", F.rank().over(window))

# Nos quedamos con el primero para cada año
df_ranked_1 = df_ranked.filter(F.col("rank") == 1)

# Contamos
df_rakned_1_count = df_ranked_1.groupBy(F.col("Country name")).count()

# Imprimimos dataframe final
df_rakned_1_count.orderBy(F.col("count").desc()).show()

+------------+-----+
|Country name|count|
+------------+-----+
|     Denmark|    7|
|     Finland|    6|
|      Canada|    1|
| Switzerland|    1|
|      Norway|    1|
+------------+-----+



## ¿Qué puesto de Felicidad por año tiene el país con mayor GDP per capita del 2020?

In [19]:
# Extraemos el pais con mayor GDP del 2020

pais_gdp = df_historic.filter(F.col("year") == 2020)\
           .orderBy(F.col("Log GDP per capita"), ascending=False)\
           .first()

pais = pais_gdp["Country name"]

# Creamos el ranking por año y filtramos por nuestro pais

windowCountry = Window.partitionBy(F.col("Country name"))\
                    .orderBy(F.col("Life Ladder").desc())
                    

target_country = df_historic.withColumn("rank", F.rank().over(windowCountry))\
                            .filter(F.col("Country name") == pais)\
                            .select(F.col("Country name"), F.col("year"), F.col("Life Ladder"), F.col("Log GDP per capita"), F.col("rank"))\
                            .orderBy(F.col("year"), asceding=False).show()

+------------+----+-----------+------------------+----+
|Country name|year|Life Ladder|Log GDP per capita|rank|
+------------+----+-----------+------------------+----+
|     Ireland|2006|      7.144|            10.972|   4|
|     Ireland|2008|      7.568|            10.929|   1|
|     Ireland|2009|      7.046|            10.866|   6|
|     Ireland|2010|      7.257|            10.879|   2|
|     Ireland|2011|      7.007|            10.878|  10|
|     Ireland|2012|      6.965|            10.876|  11|
|     Ireland|2013|       6.76|            10.884|  14|
|     Ireland|2014|      7.018|            10.959|   9|
|     Ireland|2015|       6.83|            11.174|  13|
|     Ireland|2016|      7.041|            11.199|   7|
|     Ireland|2017|       7.06|            11.266|   5|
|     Ireland|2018|      6.962|            11.332|  12|
|     Ireland|2019|      7.255|            11.371|   3|
|     Ireland|2020|      7.035|            11.323|   8|
+------------+----+-----------+-----------------

¿En que porcentaje ha variado a nivel mundial el GDP promedio del 2020 respecto al 2019 y determinar si aumentó o disminuyó?

In [32]:
# Sacamos GDP promedio para 2020 y 2021

def gdp_promedio(year:int) -> float:
    num_countries = df_historic.filter(F.col("year") == year).count()
    sum_gdp = df_historic.filter(F.col("year") == year)\
                         .agg(F.sum(F.col("Log GDP per capita")))\
                         .collect()[0][0]
    return sum_gdp / num_countries

gdp_prom_2020 = gdp_promedio(2020)
gdp_prom_2019 = gdp_promedio(2019)

pct = ((gdp_prom_2020 / gdp_prom_2019) - 1) * 100

print(f"El porcentaje a nivel mundial dee GDP per capita ha variado del 2020 al 2021 en un {round(pct, 2)}%")

El porcentaje a nivel mundial dee GDP per capita ha variado del 2020 al 2021 en un -0.54%


¿Cuál es el país con mayor expectativa de vida y qué valor tenía en el año 2019?