In [0]:
import requests
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType


In [0]:
# Start of the fetch window: 1 January of the year twenty years before today,
# formatted as dd/MM/yyyy for the API's `dataInicial` query parameter.
start_date = (
    (datetime.today() - relativedelta(years=20))
    .replace(day=1, month=1)
    .strftime("%d/%m/%Y")
)

# (connect, read) timeouts in seconds: without them requests waits forever on
# an unresponsive endpoint and the cell never finishes.
response = requests.get(
    f'https://api.bcb.gov.br/dados/serie/bcdata.sgs.10844/dados?formato=json&dataInicial={start_date}',
    timeout=(10, 60),
)

# Explicit check instead of `assert`: assertions are removed when Python runs
# with -O, which would let a failed request flow into the DataFrame below.
if response.status_code != 200:
    raise RuntimeError(f'Error: {response.status_code}, Response: {response.text}')

ipca_data = response.json()

# Spark cannot infer a schema from an empty collection; fail here with a
# message that points at the real cause rather than at schema inference.
if not ipca_data:
    raise ValueError(f'API returned no records for dataInicial={start_date}')

spark = SparkSession.builder.getOrCreate()

# Schema (column names and types) is inferred from the JSON records.
df_ipca = spark.createDataFrame(ipca_data)

In [0]:
# Give the columns their analytical types: `valor` becomes a double and
# `data` is parsed from dd/MM/yyyy text into a date.
df_ipca = (
    df_ipca
    .withColumn("valor", col("valor").cast(DoubleType()))
    # to_date already yields a date column, so no additional cast is needed.
    .withColumn("data", to_date(col("data"), "dd/MM/yyyy"))
)

# Preview the first rows, then print summary statistics.
df_ipca.show()
df_ipca.describe().show()

In [0]:
# Persist the DataFrame as a Delta table, replacing any existing contents
# of db_analytics.dataset_ipca.
(
    df_ipca.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("db_analytics.dataset_ipca")
)

In [0]:
%sql
-- Read back the table this notebook writes (db_analytics.dataset_ipca) to
-- check the load; the earlier target, api_data_delta, is not created here.
SELECT * FROM db_analytics.dataset_ipca;