In [0]:
import requests
from pyspark.sql.functions import * 
from pyspark.sql.types import *

In [0]:
# Source of the dataset: the life-expectancy grapher CSV export; the query
# string below is part of the request and is kept exactly as-is.
url = "https://ourworldindata.org/grapher/life-expectancy.csv?v=1&csvType=full&useColumnShortNames=true"

# Where the downloaded copy is stored; Spark reads it back from the same path.
csv_path = "/Volumes/workspace/default/tmp_customer/life-expectancy.csv"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=60)

# Stop here when the download did not succeed. Later cells rely on
# df_life_expectancy, so carrying on without it would only surface as a
# confusing NameError further down the notebook.
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code: {response.status_code}")

# Write the raw bytes exactly as served. Text mode would re-encode the payload
# with the platform's default encoding, which can corrupt non-ASCII characters.
with open(csv_path, "wb") as f:
    f.write(response.content)

# Read the CSV file: the first row is the header and column types are inferred.
df_life_expectancy = spark.read.csv(csv_path, header=True, inferSchema=True)

# Display the DataFrame
display(df_life_expectancy)

In [0]:
# Print the column names and the types Spark inferred when loading the CSV.
df_life_expectancy.printSchema()

In [0]:
# Show only the rows whose "code" column equals "USA".
df_life_expectancy.where(col("code") == "USA").display()

In [0]:

# Fetch the metadata JSON that accompanies the life-expectancy chart.
metadata_response = requests.get(
    "https://ourworldindata.org/grapher/life-expectancy.metadata.json?v=1&csvType=full&useColumnShortNames=true",
    timeout=60,  # do not let the cell hang forever on an unresponsive server
)
# Fail with a clear HTTP error instead of a JSON decode error on an error page.
metadata_response.raise_for_status()
metadata = metadata_response.json()

In [0]:
# Print each entry of the "descriptionKey" list attached to the
# life_expectancy_0 column, one entry per line.
life_expectancy_column = metadata["columns"]["life_expectancy_0"]
for good_to_know in life_expectancy_column["descriptionKey"]:
    print(good_to_know)

In [0]:
# Show the whole metadata object as the cell's output for inspection.
metadata

In [0]:
# Keep only the rows for the two years being compared: 1980 and 2010.
comparison_years = (1980, 2010)
filtered_df = df_life_expectancy.where(col("year").isin(*comparison_years))

filtered_df.show()

In [0]:
# Pivot the data to get 1980 and 2010 life expectancy side by side for each entity.
# The pivot values are listed explicitly: Spark then skips the extra pass that
# discovers the distinct years, and the "1980" and "2010" columns always exist.
pivot_df = (
    filtered_df
    .groupBy("entity")
    .pivot("year", [1980, 2010])
    .agg(first("life_expectancy_0"))
)

pivot_df.show()

In [0]:
# Pivot the data to get 1980 and 2010 life expectancy side by side for each entity.
# Explicit pivot values avoid the extra job Spark runs to find the distinct
# years and guarantee that both the "1980" and "2010" columns are present.
pivot_df = (
    filtered_df
    .groupBy("entity")
    .pivot("year", [1980, 2010])
    .agg(first("life_expectancy_0"))
)
display(pivot_df)

In [0]:
# Add an "increase" column: the 2010 value minus the 1980 value for each row.
life_expectancy_gain = pivot_df["2010"] - pivot_df["1980"]
pivot_df = pivot_df.withColumn("increase", life_expectancy_gain)
display(pivot_df)

In [0]:
# Rank rows by "increase", largest first, and keep the first ten.
# NOTE(review): rows are grouped by "entity", which may include non-country
# aggregates — confirm before presenting these as countries.
top_10_increase = pivot_df.orderBy("increase", ascending=False).limit(10)
display(top_10_increase)

In [0]:
# No query parameters: request the CSV endpoint as-is.
params = {}

response = requests.get(
    "https://ourworldindata.org/grapher/life-expectancy.csv",
    params=params,
    timeout=60,  # do not let the cell hang forever on an unresponsive server
)
# Surface HTTP errors here instead of silently storing an error page in `data`.
response.raise_for_status()
data = response.text

In [0]:
# Preview only the first lines of the downloaded CSV text: the full payload can
# be very long, and dumping it all floods the cell output.
PREVIEW_LINES = 10
print("\n".join(data.splitlines()[:PREVIEW_LINES]))

In [0]:
# Request the values endpoint with a single "country" query parameter.
# NOTE(review): the accepted format of the "country" value is not visible
# here — confirm against the API documentation.
params = {"country": "France"}
response = requests.get(
    "https://ourworldindata.org/grapher/life-expectancy.values.json",
    params=params,
    timeout=60,  # do not let the cell hang forever on an unresponsive server
)
# Fail with a clear HTTP error instead of a JSON decode error on an error page.
response.raise_for_status()
data = response.json()

In [0]:
# Show the parsed JSON response as the cell's output.
data