In [0]:
# Database creation
def create_database(nome_db):
    """
    Create the database named ``nome_db`` when it does not exist yet.

    The statement is executed through ``spark``, which is not a parameter and
    is taken from the surrounding (notebook) scope. ``nome_db`` is
    interpolated into the SQL text as-is.
    """

    create_stmt = f"CREATE DATABASE IF NOT EXISTS {nome_db}"
    spark.sql(create_stmt)

In [0]:
def to_silver_calendar(path_bronze_calendar, path_silver_calendar):
    """
    Convert the Bronze calendar Parquet data into the Silver layer.

    Reads the Parquet data at ``path_bronze_calendar``, converts its columns
    to typed values and writes the result twice, both in overwrite mode:
    as Parquet to ``path_silver_calendar`` (transactional Silver) and as the
    Delta table ``{nome_db}.silver_calendar`` (analytical Silver).

    ``spark`` and ``nome_db`` are not parameters; both are read from the
    surrounding (notebook) scope.
    """

    bronze = spark.read.parquet(path_bronze_calendar)

    # "t" / "f" flags become booleans; any other value becomes null.
    available_flag = (when(col("available") == "t", True)
                      .when(col("available") == "f", False)
                      .otherwise(None))

    # Strip the "$" sign and the "," separators before the numeric cast.
    price_without_symbol = regexp_replace(col("price"), "\\$", "")
    price_digits = regexp_replace(price_without_symbol, ",", "")

    silver = bronze.withColumn("listing_id", col("listing_id").cast(LongType()))
    silver = silver.withColumn("date", to_date("date", "yyyy-MM-dd"))
    silver = silver.withColumn("available", available_flag)
    silver = silver.withColumn("price", price_digits.cast("float"))
    for nights_column in ("minimum_nights", "maximum_nights"):
        silver = silver.withColumn(nights_column, col(nights_column).cast("integer"))

    # Transactional Silver
    silver.write.mode("overwrite").parquet(path_silver_calendar)

    # Analytical Silver
    (silver.write
           .format("delta")
           .mode("overwrite")
           .saveAsTable(f"{nome_db}.silver_calendar"))

In [0]:
def to_silver_listings(path_bronze_listings, path_silver_listings):
    """
    Convert the Bronze listings Parquet data into the Silver layer.

    Reads the Parquet data at ``path_bronze_listings``, replaces the "N/A"
    sentinel with null, strips currency/percent decoration from the columns
    that carry it and converts the columns to typed values. The result is
    written twice, both in overwrite mode: as Parquet to
    ``path_silver_listings`` (transactional Silver) and as the Delta table
    ``{nome_db}.silver_listings`` (analytical Silver).

    ``spark`` and ``nome_db`` are not parameters; both are read from the
    surrounding (notebook) scope. Columns that appear in none of the lists
    below are kept with the type they have in the Bronze data.
    """

    # Columns whose values carry "$", ",", " " or "%" that must go before casting.
    tratar_antes = ["host_response_rate", "host_acceptance_rate", "price"]

    numericos_int = ["host_id", "host_response_rate", "host_acceptance_rate","host_listings_count", "host_total_listings_count", 
                    "accommodates", "bedrooms", "beds", "minimum_nights", "maximum_nights", "minimum_minimum_nights", 
                    "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "availability_30", 
                    "availability_60", "availability_90", "availability_365", "number_of_reviews", "number_of_reviews_ltm", 
                    "number_of_reviews_l30d", "calculated_host_listings_count", "calculated_host_listings_count_entire_homes", 
                    "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms"]

    numericos_float = ["latitude", "longitude", "bathrooms", "price", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm", 
                    "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", 
                    "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]

    numericos_long = ["id", "scrape_id"]

    colunas_data = ["last_scraped", "host_since", "calendar_last_scraped", "first_review", "last_review"]

    booleanos = ["host_is_superhost", "host_has_profile_pic", "host_identity_verified", "has_availability", "instant_bookable"]

    df_listings = spark.read.parquet(path_bronze_listings)

    # Null out the "N/A" sentinel FIRST and only on string-typed columns.
    # Doing it after the conversions would compare integer/float/date/boolean
    # columns with the string "N/A": Spark then has to coerce the literal,
    # which can never match and may raise when ANSI SQL mode is enabled.
    # Clearing it up front also keeps the sentinel away from the casts below.
    colunas_texto = [nome for nome, tipo in df_listings.dtypes if tipo == "string"]
    for i in colunas_texto:
        df_listings = df_listings.withColumn(i, when(col(i) == "N/A", None).otherwise(col(i)))

    # Remove "$", ",", " " and "%" in a single pass per column.
    for i in tratar_antes:
        df_listings = df_listings.withColumn(i, regexp_replace(col(i), "[$,% ]", ""))

    # Numeric conversions: integer, float and long columns.
    conversoes_numericas = (
        (numericos_int, IntegerType()),
        (numericos_float, FloatType()),
        (numericos_long, LongType()),
    )
    for colunas, tipo in conversoes_numericas:
        for i in colunas:
            df_listings = df_listings.withColumn(i, col(i).cast(tipo))

    # Date conversion
    for i in colunas_data:
        df_listings = df_listings.withColumn(i, to_date(i, "yyyy-MM-dd"))

    # Boolean conversion: "t" -> True, "f" -> False, anything else -> null.
    for i in booleanos:
        df_listings = df_listings.withColumn(
            i,
            when(col(i) == "t", True)
            .when(col(i) == "f", False)
            .otherwise(None),
        )

    # Transactional Silver
    df_listings.write.mode("overwrite").parquet(path_silver_listings)

    # Analytical Silver
    df_listings.write.format("delta").mode("overwrite").saveAsTable(f"{nome_db}.silver_listings")

In [0]:
def to_silver_reviews(path_bronze_reviews, path_silver_reviews):
    """
    Convert the Bronze reviews Parquet data into the Silver layer.

    Reads the Parquet data at ``path_bronze_reviews``, drops the
    ``adjusted_price`` column, converts the identifier columns to long and
    ``date`` to a timestamp, then writes the result twice, both in overwrite
    mode: as Parquet to ``path_silver_reviews`` (transactional Silver) and as
    the Delta table ``{nome_db}.silver_reviews`` (analytical Silver).

    ``spark`` and ``nome_db`` are not parameters; both are read from the
    surrounding (notebook) scope.
    """

    # NOTE(review): "adjusted_price" is dropped here although nothing else in
    # this function uses a price column — confirm it is expected in this dataset.
    reviews = spark.read.parquet(path_bronze_reviews).drop("adjusted_price")

    # Identifier columns are all converted to long.
    for id_column in ("listing_id", "id", "reviewer_id"):
        reviews = reviews.withColumn(id_column, col(id_column).cast(LongType()))

    # NOTE(review): parsed as a timestamp (not a date) from a "yyyy-MM-dd"
    # pattern — confirm that a timestamp is the intended type.
    reviews = reviews.withColumn("date", to_timestamp("date", "yyyy-MM-dd"))

    # Transactional Silver
    reviews.write.mode("overwrite").parquet(path_silver_reviews)

    # Analytical Silver
    (reviews.write
            .format("delta")
            .mode("overwrite")
            .saveAsTable(f"{nome_db}.silver_reviews"))
