In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from sqlalchemy import create_engine
#from delta import *
import pyspark

from pyspark.sql.types import DateType, IntegerType, LongType, FloatType, BooleanType

from pyspark.sql.functions import col, sum

from pyspark.sql.functions import regexp_replace

#from delta import DeltaTable  # Necessário apenas se for usar operações específicas

### 0. Inicia Sessão Spark

In [3]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
print(spark.version)

3.5.0


### 1. Ler do parquet

#### 1.1 Reviews

In [4]:
reviews_parquet = spark.read.parquet('datawarehouse/bronze/bronze_reviews/')

In [5]:
reviews_parquet.count()

773134

In [6]:
reviews_parquet.printSchema()

root
 |-- listing_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [7]:
reviews_parquet.show(3)

+----------+------------------+----------+-----------+-------------+--------------------+
|listing_id|                id|      date|reviewer_id|reviewer_name|            comments|
+----------+------------------+----------+-----------+-------------+--------------------+
|     17878|         135000108|2017-03-02|   11601165|         Luis|Very nice apartme...|
|    238802|902182527354705841|2023-05-29|  485370553|       Camila|Excelente todo. M...|
|     25026|546355719674637771|2022-01-23|  102090531|      Stéfani|Estadia perfeita!...|
+----------+------------------+----------+-----------+-------------+--------------------+
only showing top 3 rows



In [8]:
# Lista das colunas que você quer converter
colunas = ["listing_id", "id", "reviewer_id"]

for coluna in colunas:
    reviews_parquet = reviews_parquet.withColumn(coluna, reviews_parquet[coluna].cast(LongType()))

In [9]:
# Converte a coluna "coluna_data" para o tipo de data
reviews_parquet = reviews_parquet.withColumn("date", reviews_parquet["date"].cast(DateType()))

In [10]:
# Verifica o esquema para confirmar a conversão
reviews_parquet.printSchema()  # Confirma o esquema atualizado

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: long (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [11]:
# Conta a frequência de cada valor na coluna "coluna_exemplo"
reviews_parquet.groupBy("listing_id").count().show(n=5)

+----------+-----+
|listing_id|count|
+----------+-----+
|   1280058|  404|
|   3079210|  346|
|  14912877|  178|
|  19359961|   69|
|  23899787|  152|
+----------+-----+
only showing top 5 rows



In [12]:
null_counts = reviews_parquet.select([sum(col(c).isNull().cast("int")).alias(c) for c in reviews_parquet.columns])
null_counts.show(vertical=True)

-RECORD 0------------
 listing_id    | 0   
 id            | 0   
 date          | 0   
 reviewer_id   | 0   
 reviewer_name | 0   
 comments      | 0   



In [13]:
# Contabiliza linhas duplicadas
reviews_parquet.groupBy(reviews_parquet.columns).count().filter("count > 1").count()

0

In [14]:
# Contabiliza linhas distintas, sem contar as duplicadas
reviews_parquet.distinct().count()

773134

In [15]:
reviews_parquet = reviews_parquet.distinct()

In [16]:
# Conta os valores únicos na coluna "coluna_exemplo"
num_valores_unicos = reviews_parquet.select("listing_id").distinct().count()
print(f"Número de valores únicos: {num_valores_unicos}")

Número de valores únicos: 26218


In [17]:
# Conta os valores únicos na coluna "coluna_exemplo"
num_valores_unicos = reviews_parquet.select("id").distinct().count()
print(f"Número de valores únicos: {num_valores_unicos}")

Número de valores únicos: 773134


In [18]:
# Conta os valores únicos na coluna "coluna_exemplo"
num_valores_unicos = reviews_parquet.select("reviewer_id").distinct().count()
print(f"Número de valores únicos: {num_valores_unicos}")

Número de valores únicos: 616044


In [19]:
reviews_parquet.count()

773134

In [20]:
# Passar para a camada Silver
reviews_parquet.write.parquet("datawarehouse/silver/silver_reviews", mode="overwrite")

In [22]:
# Salvar dados como Delta Lake
reviews_parquet.write.format("delta").mode("overwrite").save("deltalake/silver/silver_reviews")

# Carregar dados Delta
#df_delta = spark.read.format("delta").load("/path/to/delta_table")

#### 1.2 Listings

In [26]:
listings = spark.read.parquet('datawarehouse/bronze/bronze_listings/')

In [27]:
# Conta os valores nulos em cada coluna
null_counts = listings.select([sum(col(c).isNull().cast("int")).alias(c) for c in listings.columns])
null_counts.show(vertical=True)

-RECORD 0---------------------------------------------
 id                                           | 0     
 listing_url                                  | 0     
 scrape_id                                    | 0     
 last_scraped                                 | 0     
 source                                       | 0     
 name                                         | 0     
 description                                  | 1163  
 neighborhood_overview                        | 17648 
 picture_url                                  | 0     
 host_id                                      | 0     
 host_url                                     | 0     
 host_name                                    | 0     
 host_since                                   | 0     
 host_location                                | 6795  
 host_about                                   | 18111 
 host_response_time                           | 0     
 host_response_rate                           | 0     
 host_acce

In [28]:
listings.count()

34664

In [29]:
# Remove o caractere "$" e converte para float
listings = listings.withColumn("price", regexp_replace("price", "\\$", "").cast(FloatType()))
listings = listings.withColumn("host_response_rate", regexp_replace("host_response_rate", "\\%", "").cast(FloatType()))
listings = listings.withColumn("host_acceptance_rate", regexp_replace("host_acceptance_rate", "\\%", "").cast(FloatType()))

In [30]:
# Lista das colunas que você quer converter

colunas = ["id","scrape_id", "host_id"]              
     
for coluna in colunas:
    listings = listings.withColumn(coluna, listings[coluna].cast(LongType()))


colunas = ["minimum_nights","maximum_nights","minimum_minimum_nights",
          "maximum_minimum_nights","minimum_maximum_nights","maximum_maximum_nights",
          "availability_30", "availability_60", "availability_90", "availability_365",
           "accommodates", "calculated_host_listings_count", "calculated_host_listings_count_entire_homes",
           "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms",
           "number_of_reviews", "number_of_reviews_ltm", "number_of_reviews_l30d", 
           "host_listings_count", "host_total_listings_count", "bathrooms", "bedrooms", "beds"]              
     
for coluna in colunas:
    listings = listings.withColumn(coluna, listings[coluna].cast(IntegerType()))


# Lista das colunas que você quer converter
colunas = ["has_availability", "host_is_superhost", "host_has_profile_pic","host_identity_verified",
          "instant_bookable"]

for coluna in colunas:
    listings = listings.withColumn(coluna, listings[coluna].cast(BooleanType()))


# Lista das colunas que você quer converter
colunas = ["last_scraped", "host_since", "calendar_last_scraped"]

for coluna in colunas:
    listings = listings.withColumn(coluna, listings[coluna].cast(DateType()))

# Lista das colunas que você quer converter
colunas = ["minimum_nights_avg_ntm", "maximum_nights_avg_ntm", "latitude", "longitude"]

for coluna in colunas:
    listings = listings.withColumn(coluna, listings[coluna].cast(FloatType()))

In [31]:
# Conta a frequência de cada valor na coluna "coluna_exemplo"
listings.groupBy("host_is_superhost").count().orderBy("count", ascending=False).show()

+-----------------+-----+
|host_is_superhost|count|
+-----------------+-----+
|            false|23534|
|             true|10262|
|             NULL|  868|
+-----------------+-----+



In [32]:
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: long (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: float (nullable = true)
 |-- host_acceptance_rate: float (nullable = true)
 |-- host_is_superhost: boolean (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: integer

In [33]:
listings.show(1, vertical=True)

-RECORD 0------------------------------------------------------------
 id                                           | 96478                
 listing_url                                  | https://www.airbn... 
 scrape_id                                    | 20240627045056       
 last_scraped                                 | 2024-06-28           
 source                                       | city scrape          
 name                                         | APARTAMENT IN BOT... 
 description                                  | LINDO APARTAMENTO... 
 neighborhood_overview                        | bRes,resturantes,... 
 picture_url                                  | https://a0.muscac... 
 host_id                                      | 530471               
 host_url                                     | https://www.airbn... 
 host_name                                    | Hugo                 
 host_since                                   | 2011-04-23           
 host_location      

In [35]:
listings.distinct().count() == listings.count()

True

In [36]:
listings.count()

34664

In [37]:
listings.filter(listings.id.isNull()).count()

0

In [38]:
listings.filter(listings.id.isNull()).show(n=2, vertical=True)
#listings.filter("id is NULL").show()
#listings.filter(col("id").isNull()).show()

(0 rows)



In [39]:
## DROPAR COLUNAS COM ID NULL
listings.filter(listings.id.isNotNull()).count()

34664

In [40]:
# Passar para a camada Silver
listings.write.parquet("datawarehouse/silver/silver_listings", mode="overwrite")

In [41]:
# Salvar dados como Delta Lake
listings.write.format("delta").mode("overwrite").save("deltalake/silver/silver_listings")

# Carregar dados Delta
#df_delta = spark.read.format("delta").load("/path/to/delta_table")


#### 1.3 Calendar

In [42]:
calendar_parquet = spark.read.parquet('datawarehouse/bronze/bronze_calendar/')

In [43]:
calendar_parquet.count()

12652371

In [44]:
calendar_parquet.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- maximum_nights: integer (nullable = true)



In [45]:
calendar_parquet.show(n=3, vertical=True)

-RECORD 0--------------------
 listing_id     | 297908     
 date           | 2024-06-27 
 available      | f          
 price          | $250.00    
 adjusted_price | NULL       
 minimum_nights | 2          
 maximum_nights | 1125       
-RECORD 1--------------------
 listing_id     | 17878      
 date           | 2024-06-28 
 available      | f          
 price          | $350.00    
 adjusted_price | NULL       
 minimum_nights | 5          
 maximum_nights | 28         
-RECORD 2--------------------
 listing_id     | 17878      
 date           | 2024-06-29 
 available      | f          
 price          | $350.00    
 adjusted_price | NULL       
 minimum_nights | 5          
 maximum_nights | 28         
only showing top 3 rows



In [46]:
# Remove o caractere "$" e converte para float
calendar_parquet = calendar_parquet.withColumn("price", regexp_replace("price", "\\$", "").cast(FloatType()))

In [47]:
colunas = ["listing_id"]              
     
for coluna in colunas:
    calendar_parquet = calendar_parquet.withColumn(coluna, calendar_parquet[coluna].cast(LongType()))


colunas = ["minimum_nights","maximum_nights"]              
     
for coluna in colunas:
    calendar_parquet = calendar_parquet.withColumn(coluna, calendar_parquet[coluna].cast(IntegerType()))


# Lista das colunas que você quer converter
colunas = ["available"]

for coluna in colunas:
    calendar_parquet = calendar_parquet.withColumn(coluna, calendar_parquet[coluna].cast(BooleanType()))


# Lista das colunas que você quer converter
colunas = ["date"]

for coluna in colunas:
    calendar_parquet = calendar_parquet.withColumn(coluna, calendar_parquet[coluna].cast(DateType()))

In [48]:
calendar_parquet.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: boolean (nullable = true)
 |-- price: float (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- maximum_nights: integer (nullable = true)



In [49]:
# Conta os valores nulos em cada coluna
null_counts = calendar_parquet.select([sum(col(c).isNull().cast("int")).alias(c) for c in calendar_parquet.columns])
null_counts.show(vertical=True)

-RECORD 0------------------
 listing_id     | 0        
 date           | 0        
 available      | 0        
 price          | 1570963  
 adjusted_price | 12652371 
 minimum_nights | 12       
 maximum_nights | 12       



In [50]:
calendar_parquet.count()

12652371

In [51]:
calendar_parquet.distinct().count()

12652371

In [52]:
# Passar para a camada Silver
calendar_parquet.write.parquet("datawarehouse/silver/silver_calendar", mode="overwrite")

In [53]:
# Salvar dados como Delta Lake
calendar_parquet.write.format("delta").mode("overwrite").save("deltalake/silver/silver_calendar")

In [54]:
spark.stop()