In [0]:
# Reference snippets, kept commented out so they never run with the notebook.
# Create a folder:
# dbutils.fs.mkdirs(path)

# Remove a folder/file (recursively):
# dbutils.fs.rm(path, recurse=True)

In [0]:
%run
./98_entrypoints

In [0]:
# Bibliotecas
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Extraindo Dados Brutos

In [0]:

# Input paths and their output paths, paired by position:
# raw_files[k] is written to bronze_files[k] by the loop in this cell.
# NOTE(review): the path_* names are not defined in this notebook —
# presumably they come from the %run of ./98_entrypoints; confirm.
raw_files = [
    path_listings,
    path_calendar,
    path_reviews,
]
bronze_files = [
    path_bronze_listings,
    path_bronze_calendar,
    path_bronze_reviews,
]

def raw_viz(dataframe):
    """Print a quick overview of ``dataframe`` and render it.

    Prints one line with the row and column counts, then calls
    ``printSchema()`` and ``display()`` on the object, in that order.

    Args:
        dataframe: Object exposing ``count()``, ``columns``,
            ``printSchema()`` and ``display()`` (expected to be a Spark
            DataFrame in this notebook).

    Returns:
        None.
    """
    # Row count is evaluated before the column count, as in the summary line.
    row_total = dataframe.count()
    column_total = len(dataframe.columns)
    print(f"Linhas: {row_total} - Colunas: {column_total}")

    dataframe.printSchema()
    dataframe.display()

# For every (raw path, bronze path) pair: load the raw data, show an overview,
# drop fully duplicated rows, then hand the result to to_bronze.
# NOTE(review): get_raw / to_bronze are not defined in this notebook —
# presumably provided by the %run of ./98_entrypoints; confirm their contracts.
for raw_path, bronze_path in zip(raw_files, bronze_files):
    df = get_raw(raw_path)
    raw_viz(df)
    df = df.dropDuplicates()
    to_bronze(df, bronze_path)

# Limpeza Listings

In [0]:
# Read the Parquet data stored at path_bronze_listings and render it
# for a first visual inspection.
df_listings = spark.read.parquet(path_bronze_listings)
df_listings.display()

In [0]:
# Column names in alphabetical order (cell output), to make them easy to scan.
sorted(df_listings.columns)

# Limpeza Calendar

In [0]:
# Read the Parquet data stored at path_bronze_calendar and render it
# for a first visual inspection.
df_calendar = spark.read.parquet(path_bronze_calendar)
df_calendar.display()

In [0]:
# Count the nulls in every column of df_calendar: each expression adds 1 per
# null value and 0 otherwise, aliased back to the column's own name.
# `sum` here is the pyspark.sql.functions aggregate made visible by the
# wildcard import (it shadows the Python builtin).
calendar_null_exprs = [
    sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in df_calendar.columns
]
df_calendar.select(calendar_null_exprs).display()

In [0]:
# Evaluates to True/False as the cell output; it does not fail the run.
# NOTE(review): 12652371 is a hard-coded expected row count — presumably tied
# to the dataset snapshot used during development; confirm or make it a constant.
df_calendar.count() == 12652371

In [0]:
# The adjusted_price column contains only nulls, so it is dropped.
# Note: this rebinds df_calendar, so later cells see the frame without it.
df_calendar = df_calendar.drop("adjusted_price")
df_calendar.display()

## Limpezas e Ajustes de Dados

* listing_id - alterar para tipo LongType (a conversão para integer gera muitos valores nulos; LongType aceita um intervalo maior de dígitos)
* date - alterar para tipo timestamp (data e hora)
* available - alterar t para True e f para False. Alterar tipo para booleano
* price - remover cifrão e vírgulas. Alterar tipo para float
* minimum_nights - alterar para tipo integer
* maximum_nights - alterar para tipo integer

As conversões serão feitas e depois o schema vai ser informado como um todo.


In [0]:
# Type normalisation for the calendar data. Each expression below replaces the
# column of the same name, so column order is unchanged.

# Identifier as a 64-bit integer.
listing_id_long = col("listing_id").cast(LongType())

# Parse the "yyyy-MM-dd" text into a timestamp.
date_as_timestamp = to_timestamp("date", "yyyy-MM-dd")

# "t" -> True, "f" -> False, anything else -> null.
available_flag = (
    when(col("available") == "t", True)
    .when(col("available") == "f", False)
    .otherwise(None)
)

# Strip the dollar sign first, then the commas, then cast to float.
price_without_dollar = regexp_replace(col("price"), "\\$", "")
price_without_commas = regexp_replace(price_without_dollar, ",", "")
price_as_float = price_without_commas.cast("float")

min_nights_int = col("minimum_nights").cast("integer")
max_nights_int = col("maximum_nights").cast("integer")

df_new = (
    df_calendar
    .withColumn("listing_id", listing_id_long)
    .withColumn("date", date_as_timestamp)
    .withColumn("available", available_flag)
    .withColumn("price", price_as_float)
    .withColumn("minimum_nights", min_nights_int)
    .withColumn("maximum_nights", max_nights_int)
)

df_new.display()


In [0]:
# Null count per column of df_calendar (the frame before the type casts).
# NOTE(review): presumably the baseline to compare against the same counts
# on df_new, to spot values that a cast turned into null — confirm.
baseline_null_exprs = [
    sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in df_calendar.columns
]
df_calendar.select(baseline_null_exprs).display()

In [0]:
# Null count per column of df_new: 1 is added for each null value, 0 otherwise,
# and every aggregate keeps the name of the column it summarises.
converted_null_exprs = [
    sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
    for column_name in df_new.columns
]
df_new.select(converted_null_exprs).display()

# Limpeza Reviews

In [0]:
# Read the Parquet data stored at path_bronze_reviews and render it
# for a first visual inspection.
df_reviews = spark.read.parquet(path_bronze_reviews)
df_reviews.display()

In [0]:
# List the column names of df_reviews (cell output).
df_reviews.columns

In [0]:
# Cast the identifier columns of the reviews data to LongType, parse `date`
# ("yyyy-MM-dd") into a timestamp, then drop every row that still contains a
# null in any column.
#
# Fix: the original cell called `df_news.dropna()`. `df_news` is never defined
# (NameError on run), and dropna() returns a new DataFrame rather than
# modifying in place, so the result has to be kept for the filter to apply.
df_new = (
    df_reviews
    .withColumn("listing_id", col("listing_id").cast(LongType()))
    .withColumn("id", col("id").cast(LongType()))
    .withColumn("date", to_timestamp("date", "yyyy-MM-dd"))
    .withColumn("reviewer_id", col("reviewer_id").cast(LongType()))
    .dropna()
)

df_new.display()