In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from sqlalchemy import create_engine

## 0. Inicia Sessão Spark

In [64]:
# Stop any Spark session that is still active from a previous run so the next
# cell starts from a clean one. The name `spark` is only assigned in the
# following cell, so on a fresh kernel (Restart & Run All) a bare
# `spark.stop()` raises NameError; look the session up through SparkSession
# instead and stop it only when one exists.
_previous_session = SparkSession.getActiveSession()
if _previous_session is not None:
    _previous_session.stop()

In [65]:
# Create the Spark session used by every cell below.
# To tune resources, chain .config(...) calls before .getOrCreate(), e.g.:
#     .config("spark.executor.memory", "2g")
#     .config("spark.driver.memory", "2g")
#     .config("spark.sql.shuffle.partitions", "200")
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

# Leave the session as the cell's last expression so its details are
# displayed, confirming that it is active.
spark


In [66]:
# Define the database connection details (currently disabled).
# NOTE(review): if this is re-enabled, read the user/password from the
# environment instead of embedding them in the URL.
#engine = create_engine("postgresql://user:password@db:5432/mydatabase")

## 1. Ler os arquivos gz

### 1.1 Listings data

In [123]:
# Read the gzipped listings CSV. The double quote is used both as the quote
# and as the escape character, and multiLine is enabled so a quoted field may
# span several physical lines. inferSchema is not set for this read.
raw_listings = (
    spark.read
    .options(
        header=True,
        delimiter=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
        encoding="UTF-8",
    )
    .csv("raw/listings.csv.gz")
)

In [124]:
# Preview a few listing columns with show()'s default settings
# (pass vertical=True to show() for a one-field-per-line layout).
(
    raw_listings
    .select("id", "name", "description", "reviews_per_month")
    .show()
)

+------+--------------------+--------------------+-----------------+
|    id|                name|         description|reviews_per_month|
+------+--------------------+--------------------+-----------------+
| 17878|Very Nice 2Br in ...|Please note that ...|             1.88|
| 25026|Beautiful Modern ...|**Fully renovated...|             1.70|
|220377|Suíte Casal (banh...|The apartment is ...|             0.03|
| 35764|COPACABANA SEA BR...|Our newly renovat...|             2.85|
|223073|Modern Loft 1 • I...|READ THE FREQUENT...|             3.04|
|230989|Nice Flat in Copa...|Very nice studio,...|             1.05|
|231105|            10 Suíte|Verano Stay - Rio...|             0.90|
| 48305|Bright 6bed Penth...|Enter Bossa Nova'...|             1.10|
|231497|Nice big studio i...|This is a big stu...|             0.57|
|231516|Apartment on Copa...|Special location ...|             0.19|
| 48901|Large /equipped 4...|Spacious 4 bedroo...|             0.28|
|236991|PRAIA DE COPACABA...|Cozy,

In [126]:
# Print the column names and types of the listings DataFrame.
# Other inspection calls tried here and left disabled:
#print(raw_listings.summary())
#raw_listings.describe()
raw_listings.printSchema()

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_cou

In [127]:
# Number of rows in the listings DataFrame.
raw_listings.count()

34664

In [128]:
# Duplicate check for the listings data: count the rows that remain after
# dropDuplicates() and compare that with the total row count.
unique_rows = raw_listings.dropDuplicates().count()
total_rows = raw_listings.count()

# The difference between the two counts is the number of duplicated rows.
duplicate_count = total_rows - unique_rows
print(f"Número de linhas duplicadas: {duplicate_count}")

Número de linhas duplicadas: 0


In [129]:
# Listings with duplicate rows removed (no column subset is given to
# dropDuplicates()).
raw_listings_nodupk = raw_listings.dropDuplicates()

In [130]:
# Row count of the de-duplicated listings DataFrame.
raw_listings_nodupk.count()

34664

In [131]:
# Write the de-duplicated listings as Parquet under bronze/, using the
# "overwrite" save mode.
(
    raw_listings_nodupk.write
    .mode("overwrite")
    .parquet("bronze/bronze_listings")
)

### 1.2 Reviews data

In [135]:
# Load the compressed reviews CSV directly. The double quote is used both as
# the quote and as the escape character, and multiLine is enabled so a quoted
# field may span several physical lines.
# Simpler variant kept for reference (disabled):
#     spark.read.csv("raw/reviews.csv.gz", header=True, inferSchema=True)
raw_reviews = (
    spark.read
    .options(
        header=True,
        delimiter=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
        encoding="UTF-8",
    )
    .csv("raw/reviews.csv.gz")
)

In [136]:
# Print the column names and types of the reviews DataFrame.
raw_reviews.printSchema()

root
 |-- listing_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [137]:
# Preview the reviews DataFrame with show()'s default settings.
raw_reviews.show()

+----------+-------------------+----------+-----------+-------------+--------------------+
|listing_id|                 id|      date|reviewer_id|reviewer_name|            comments|
+----------+-------------------+----------+-----------+-------------+--------------------+
|     17878|              64852|2010-07-15|     135370|          Tia|This apartment is...|
|     17878|              76744|2010-08-11|      10206|         Mimi|we had a really g...|
|     17878|              91074|2010-09-06|      80253|          Jan|Staying in Max ap...|
|     17878|             137528|2010-11-12|     230449|        Orene|In general very g...|
|     17878|             147594|2010-12-01|     219338|        David|The apt was nice ...|
|     17878|             152368|2010-12-12|     266847|         Armi|At Copacabana apa...|
|     17878|             155565|2010-12-19|     243287|     Jonathan|A great apartment...|
|     17878|             179219|2011-02-07|     172558|         Anna|Apartment is exac...|

In [138]:
# Number of rows in the reviews DataFrame.
raw_reviews.count()

773134

In [139]:
# Duplicate check for the reviews data: count the rows that remain after
# dropDuplicates() and compare that with the total row count.
unique_rows = raw_reviews.dropDuplicates().count()
total_rows = raw_reviews.count()

# The difference between the two counts is the number of duplicated rows.
duplicate_count = total_rows - unique_rows
print(f"Número de linhas duplicadas: {duplicate_count}")

Número de linhas duplicadas: 0


In [140]:
# Reviews with duplicate rows removed (no column subset is given to
# dropDuplicates()).
raw_reviews_nodupk = raw_reviews.dropDuplicates()

In [141]:
# Row count of the de-duplicated reviews DataFrame.
raw_reviews_nodupk.count()

773134

In [142]:
# Write the de-duplicated reviews as Parquet under bronze/, using the
# "overwrite" save mode.
(
    raw_reviews_nodupk.write
    .mode("overwrite")
    .parquet("bronze/bronze_reviews")
)

### 1.3 Calendar data

In [143]:
# Load the compressed calendar CSV directly, treating the first line as the
# header and asking Spark to infer the column types.
# NOTE(review): this file is read from output/raw/ while the listings and
# reviews inputs are read from raw/ — confirm the path is intentional.
raw_calendar = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("output/raw/calendar.csv.gz")
)

In [144]:
# Preview the calendar DataFrame with show()'s default settings.
raw_calendar.show()

+----------+----------+---------+-------+--------------+--------------+--------------+
|listing_id|      date|available|  price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+-------+--------------+--------------+--------------+
|    297908|2024-06-27|        f|$250.00|          NULL|             2|          1125|
|     17878|2024-06-28|        f|$350.00|          NULL|             5|            28|
|     17878|2024-06-29|        f|$350.00|          NULL|             5|            28|
|     17878|2024-06-30|        f|$350.00|          NULL|             5|            28|
|     17878|2024-07-01|        f|$350.00|          NULL|             5|            28|
|     17878|2024-07-02|        f|$350.00|          NULL|             5|            28|
|     17878|2024-07-03|        f|$350.00|          NULL|             5|            28|
|     17878|2024-07-04|        f|$350.00|          NULL|             5|            28|
|     17878|2024-07-05|        f|$350.00|  

In [145]:
# Print the column names and the types inferred for the calendar DataFrame.
raw_calendar.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- maximum_nights: integer (nullable = true)



In [146]:
# Number of rows in the calendar DataFrame.
raw_calendar.count()

12652371

In [147]:
# Duplicate check for the calendar data: count the rows that remain after
# dropDuplicates() and compare that with the total row count.
unique_rows = raw_calendar.dropDuplicates().count()
total_rows = raw_calendar.count()

# The difference between the two counts is the number of duplicated rows.
duplicate_count = total_rows - unique_rows
print(f"Número de linhas duplicadas: {duplicate_count}")

Número de linhas duplicadas: 0


In [148]:
# De-duplication of the calendar data is left disabled; the raw DataFrame is
# what gets written to the bronze layer in the next cell.
#raw_calendar_nodupk = raw_calendar.dropDuplicates()
#raw_calendar_nodupk.count()

In [149]:
# Write the calendar data (as read, without a de-duplication step) as Parquet
# under bronze/, using the "overwrite" save mode.
(
    raw_calendar.write
    .mode("overwrite")
    .parquet("bronze/bronze_calendar")
)

In [150]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()

In [151]:
# Safety net: look up whichever Spark session is still registered as active
# and stop it if there is one.
active_session = SparkSession.getActiveSession()
if active_session is not None:
    active_session.stop()

In [40]:
# Upload the data to PostgreSQL (currently disabled).
# NOTE(review): `df_pandas_listings` and `engine` are not defined in the
# visible part of this notebook — both must exist before re-enabling this.
#df_pandas_listings.to_sql("listings", engine, if_exists="replace", index=False)