In [0]:
# Fazer Pasta
# dbutils.fs.mkdirs(path)

# Remover Pasta/Arquivo
# dbutils.fs.rm(path, recurse=True)

In [0]:
%run
./98_entrypoints

Collecting great_expectations
  Obtaining dependency information for great_expectations from https://files.pythonhosted.org/packages/04/2c/e01bf8bfab363e064eea8709afe2298275e5d7b5e582ab141f9093919d4e/great_expectations-1.2.2-py3-none-any.whl.metadata
  Downloading great_expectations-1.2.2-py3-none-any.whl.metadata (8.5 kB)
Collecting altair<5.0.0,>=4.2.1 (from great_expectations)
  Obtaining dependency information for altair<5.0.0,>=4.2.1 from https://files.pythonhosted.org/packages/18/62/47452306e84d4d2e67f9c559380aeb230f5e6ca84fafb428dd36b96a99ba/altair-4.2.2-py3-none-any.whl.metadata
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Collecting jinja2>=2.10 (from great_expectations)
  Obtaining dependency information for jinja2>=2.10 from https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl.metadata
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting jsonschema>=2.5.1 (

In [0]:
# Bibliotecas
from pyspark.sql.functions import *
from pyspark.sql.types import *
import great_expectations as gx
from datetime import date

# Extraindo Dados Brutos

## Calendar

In [0]:
df_raw_calendar = (spark.read.
                   option("multiline", "true").
                   option("quote", '"').
                   option("escape", '"').
                   csv(path_raw_calendar, header=True))
df_raw_calendar = df_raw_calendar.dropDuplicates()
df_raw_calendar.display()

In [0]:
df_raw_calendar.count()

In [0]:
df_raw_calendar.write.mode("overwrite").parquet(path_bronze_calendar)

## Listings

In [0]:
df_raw_listings = (spark.read.
               option("multiline", "true").
               option("quote", '"').
               option("escape", '"').
               csv(path_raw_listings, header=True))
df_raw_listings = df_raw_listings.dropDuplicates()
df_raw_listings.display()

In [0]:
df_raw_listings.count()

In [0]:
df_raw_listings.write.mode("overwrite").parquet(path_bronze_listings)

## Reviews

In [0]:
df_raw_reviews = (spark.read.
               option("multiline", "true").
               option("quote", '"').
               option("escape", '"').
               csv(path_raw_reviews, header=True))
df_raw_reviews = df_raw_reviews.dropDuplicates()
df_raw_reviews.display()

In [0]:
df_raw_reviews.count()

In [0]:
df_raw_reviews.write.mode("overwrite").parquet(path_bronze_reviews)

# Limpeza Listings

In [0]:
df_listings = spark.read.parquet(path_bronze_listings)
df_listings.display()

In [0]:
# Colunas para remover strings: "\\$","\\%", ",", " "
tratar_antes = ["host_response_rate", "host_acceptance_rate", "price"]


numericos_int = ["host_id", "host_response_rate", "host_acceptance_rate" ,"host_listings_count", "host_total_listings_count", 
                 "accommodates", "bedrooms", "beds", "minimum_nights", "maximum_nights", "minimum_minimum_nights", 
                 "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "availability_30", 
                 "availability_60", "availability_90", "availability_365", "number_of_reviews", "number_of_reviews_ltm", 
                 "number_of_reviews_l30d", "calculated_host_listings_count", "calculated_host_listings_count_entire_homes", 
                 "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms"]

numericos_float = ["latitude", "longitude", "bathrooms", "price", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm", 
                   "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", 
                   "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]

numericos_long = ["id", "scrape_id"]

colunas_data = ["last_scraped", "host_since", "calendar_last_scraped", "first_review", "last_review"]

booleanos = ["host_is_superhost", "host_has_profile_pic", "host_identity_verified", "has_availability", "instant_bookable"]

listas = ["host_verifications", "amenities"]

Iteração nas colunas para tratamento:

* tratar_antes - remover "%", "$" "," e " "
* numericos_int - Transformar int
* numericos_float - Transformar para float
* numericos_long - Transformar para LongType (como int gera muitos números nulos)
* colunas_data - Convertido para formato "yyyy-MM-dd"
* booleanos - Convertido para Booleano
* listas - Mantido como string
* Colunas não informadas foram mantidas como string

In [0]:
df_new = df_listings

In [0]:
# Removendo caracteres

for i in tratar_antes:
    df_new = (df_new
            .withColumn(i, regexp_replace(col(i), "\\$", ""))
            .withColumn(i, regexp_replace(col(i), ",", ""))
            .withColumn(i, regexp_replace(col(i), " ", ""))
            .withColumn(i, regexp_replace(col(i), "\\%", "")))

In [0]:
# Tratamento colunas Integer
for i in numericos_int:
    df_new = df_new.withColumn(i, col(i).cast(IntegerType()))

In [0]:
# Transformar em Float
for i in numericos_float:
    df_new = df_new.withColumn(i, col(i).cast(FloatType()))

In [0]:
# Transformar em LongType
for i in numericos_long:
    df_new = df_new.withColumn(i, col(i).cast(LongType()))

In [0]:
# Transformar em Data
for i in colunas_data:
    df_new = df_new.withColumn(i, to_date(i, "yyyy-MM-dd"))

In [0]:
# Transformar em Booleanos
for i in booleanos:
    df_new = (df_new.withColumn(i, 
                                when(col(i) == "t", True).
                                when(col(i) == "f", False).
                                otherwise(None)
                                ))

In [0]:
df_listings.filter(df_listings.scrape_id == "N/A").count()

In [0]:
df_listings.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in booleanos]).display()

In [0]:
df_new.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in booleanos]).display()

# Limpeza Calendar

In [0]:
df_calendar = spark.read.parquet(path_bronze_calendar)
df_calendar.display()

In [0]:
df_calendar.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_calendar.columns]).display()

In [0]:
df_calendar.count() == 12652371

## Limpezas e Ajustes de Dados

* listing_id - alterar para tipo LongType (Para integer gera muitos números nulos, LongType aceita um range maior de digitos)
* date - alterar para tipo datetime
* available - alterar t para True e f para False. Alterar tipo para booleano
* price - remover cifraão e virgulas. Alterar tipo para float
* minimum_nights - alterar para tipo integer
* maximum_nights - alterar para tipo integer

As conversões serão feitas e depois o schema vai ser informado como um todo.


In [0]:
df_new = (df_calendar
               .withColumn("listing_id", col("listing_id").cast(LongType()))
               .withColumn("date", to_date("date", "yyyy-MM-dd"))
               .withColumn("available", 
                           when(col("available") == "t", True).
                           when(col("available") == "f", False).
                           otherwise(None))
               .withColumn("price", regexp_replace(col("price"), "\\$", ""))
               .withColumn("price", regexp_replace(col("price"), ",", ""))
               .withColumn("price", col("price").cast("float"))
               .withColumn("minimum_nights", col("minimum_nights").cast("integer"))
               .withColumn("maximum_nights", col("maximum_nights").cast("integer")))

df_new.display()


In [0]:
df_calendar.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_calendar.columns]).display()

In [0]:
df_new.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_new.columns]).display()

# Limpeza Reviews

In [0]:
df_reviews = spark.read.parquet(path_bronze_reviews)
df_reviews.display()

In [0]:
df_reviews.columns

In [0]:
df_new = (df_reviews
               .withColumn("listing_id", col("listing_id").cast(LongType()))
               .withColumn("id", col("id").cast(LongType()))
               .withColumn("date", to_timestamp("date", "yyyy-MM-dd"))
               .withColumn("reviewer_id", col("reviewer_id").cast(LongType())))

df_new.display()

In [0]:
df_reviews.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_reviews.columns]).display()

In [0]:
df_new.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_new.columns]).display()

# Great Expectations

In [0]:
df_calendar = spark.read.parquet(path_silver_calendar)
df_listings = spark.read.parquet(path_silver_listings)
df_reviews = spark.read.parquet(path_silver_reviews)

In [0]:
# Calendar

# Contextos
context_calendar = gx.get_context()

data_source_calendar = context_calendar.data_sources.add_spark("calendar_data_source")

# Data Assets
data_asset_calendar = data_source_calendar.add_dataframe_asset(name="calendar_data_asset")

# Definição de Batches
batch_definition_calendar = data_asset_calendar.add_batch_definition_whole_dataframe("calendar_batch_definition")

# Batches
batch_calendar = batch_definition_calendar.get_batch(batch_parameters={"dataframe": df_calendar})

# Suite
suite_calendar = gx.ExpectationSuite(name="suite_calendar")
suite_calendar = context.suites.add(suite_calendar)

# Colunas

In [0]:
df_listings.select(numericos_float).display()

latitude,longitude,bathrooms,price,minimum_nights_avg_ntm,maximum_nights_avg_ntm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
-22.94187,-43.18707,1.0,718.0,3.0,90.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,0.01
-22.97995,-43.19024,1.0,288.0,3.1,215.7,4.69,4.91,4.67,4.84,4.87,4.91,4.73,0.34
-22.9763,-43.19739,1.5,80.0,5.0,200.0,4.67,5.0,5.0,5.0,4.67,4.33,4.67,0.17
-22.97265,-43.35797,2.0,336.0,2.0,30.0,4.33,5.0,3.83,5.0,5.0,4.33,4.17,0.05
-23.00982,-43.29118,0.5,3286.0,2.0,1125.0,4.98,4.98,4.98,5.0,4.96,4.93,4.8,0.66
-22.949724,-43.181953,2.0,300.0,3.2,1125.0,4.81,4.7,4.8,5.0,5.0,4.86,4.71,0.17
-22.96736,-43.17821,1.5,371.0,3.0,30.0,4.72,4.78,4.78,4.94,4.94,4.94,4.33,0.15
-22.9647,-43.17796,1.0,195.0,2.0,1125.0,4.74,4.83,4.79,4.91,4.88,4.82,4.64,1.22
-22.98008,-43.19198,1.0,400.0,4.0,1125.0,4.96,4.91,4.91,5.0,5.0,4.7,4.65,0.19
-22.9322,-43.24107,2.0,100.0,10.0,50.0,,,,,,,,


In [0]:
# Listings

# Contextos
context_listings = gx.get_context()

data_source_listings = context_listings.data_sources.add_spark("listings_data_source")

# Data Assets
data_asset_listings = data_source_listings.add_dataframe_asset(name="listings_data_asset")

# Definição de Batches
batch_definition_listings = data_asset_listings.add_batch_definition_whole_dataframe("listings_batch_definition")

# Batches
batch_listings = batch_definition_listings.get_batch(batch_parameters={"dataframe": df_listings})

# Suite
suite_listings = gx.ExpectationSuite(name="suite_listings")
suite_listings = context_listings.suites.add(suite_listings)

# Colunas

numericos_int = ["host_id", "host_response_rate", "host_listings_count", "host_total_listings_count", 
                 "accommodates", "bedrooms", "beds", "minimum_nights", "maximum_nights", "minimum_minimum_nights", 
                 "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "availability_30", 
                 "availability_60", "availability_90", "availability_365", "number_of_reviews", "number_of_reviews_ltm", 
                 "number_of_reviews_l30d", "calculated_host_listings_count", "calculated_host_listings_count_entire_homes", 
                 "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms"]

numericos_float = ["latitude", "longitude", "bathrooms", "price", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm", 
                   "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", 
                   "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]

numericos_long = ["id", "scrape_id"]

colunas_data = ["last_scraped", "host_since", "calendar_last_scraped", "first_review", "last_review"]

booleanos = ["host_is_superhost", "host_has_profile_pic", "host_identity_verified", "has_availability", "instant_bookable"]

listas = ["host_verifications", "amenities"]

string_lista = [x for x in df_listings.columns if x not in numericos_int + numericos_float + numericos_long + colunas_data + booleanos + listas]

# Validações
for coluna in numericos_int:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="IntegerType")
  suite_listings.add_expectation(expectation)

for coluna in numericos_float:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="FloatType")
  suite_listings.add_expectation(expectation)

for coluna in numericos_long:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="LongType")
  suite_listings.add_expectation(expectation)

for coluna in colunas_data:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="DateType")
  suite_listings.add_expectation(expectation)

for coluna in booleanos:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="BooleanType")
  suite_listings.add_expectation(expectation)

for coluna in listas:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="ListType")
  suite_listings.add_expectation(expectation)

for coluna in string_lista:
  expectation = gx.expectations.ExpectColumnValuesToBeOfType(column=coluna, type_="StringType")
  suite_listings.add_expectation(expectation)

# Maior ou igual a 0
for i in numericos_long + numericos_int + [x for x in numericos_float if x not in ["latitude", "longitude"]]:
  expectation = gx.expectations.ExpectColumnValuesToBeBetween(column=i, min_value=0, strict_min=True)
  suite_listings.add_expectation(expectation)

# Rates 0-100
for i in ["host_response_rate", "host_acceptance_rate"]:
  expectation = gx.expectations.ExpectColumnValuesToBeBetween(column=i, min_value=0, max_value=100)
  suite_listings.add_expectation(expectation)

# Rates 0-5
for i in ["review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]:
  expectation = gx.expectations.ExpectColumnValuesToBeBetween(column=i, min_value=0, max_value=5)
  suite_listings.add_expectation(expectation)

# Datas entre o ano de lançamento do AirBnB e a data atual
today = date.today()
air_bnb_release = date(2008,1,1)
for i in colunas_data:
  expectation = gx.expectations.ExpectColumnValuesToBeBetween(column=i, min_value=air_bnb_release, max_value=today)
  suite_listings.add_expectation(expectation)

# Valores Booleanos
for i in booleanos:
  expectation = gx.expectations.ExpectColumnValuesToBeInSet(column=i, value_set=[True, False])
  suite_listings.add_expectation(expectation)

# Latitude
expectation = gx.expectations.ExpectColumnValuesToBeBetween(column="latitude", min_value=-90, max_value=90)
suite_listings.add_expectation(expectation)

# Longitude
expectation = gx.expectations.ExpectColumnValuesToBeBetween(column="longitude", min_value=-180, max_value=180)
suite_listings.add_expectation(expectation)

validation_definition_listings = gx.ValidationDefinition(data=batch_definition_listings, suite=suite_listings, name="validacao_listings")

validation_definition_listings.run(batch_parameters={"dataframe":df_listings})

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpwbezbcf_' for ephemeral docs site
  self.comm = Comm(**args)


Calculating Metrics:   0%|          | 0/521 [00:00<?, ?it/s]

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "batch_id": "listings_data_source-listings_data_asset",
          "column": "host_id",
          "type_": "IntegerType"
        },
        "meta": {},
        "id": "e96e8179-4592-4302-8c7b-f8ce0c6857f8"
      },
      "result": {
        "observed_value": "IntegerType"
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "listings_data_source-listings_data_asset",
          "column": "host_id",
          "min_value": 0.0,
          "strict_min": true
        },
        "meta": {},
        "id": "b72602e6-2a16-4bd1-ba13-229a5ed5a3e9"
    

In [0]:
# Contextos
context_reviews = gx.get_context()

data_source_reviews = context.data_sources.add_spark("reviews_data_source")

# Data Assets
data_asset_reviews = data_source_reviews.add_dataframe_asset(name="reviews_data_asset")

# Definição de Batches
batch_definition_reviews = data_asset_reviews.add_batch_definition_whole_dataframe("reviews_batch_definition")

# Batches
batch_reviews = batch_definition_reviews.get_batch(batch_parameters={"dataframe": df_reviews})

# Suite
suite_reviews = gx.ExpectationSuite(name="suite_listings")
suite_listings = context.suites.add(suite_listings)