## Criando diretório Bronze IOT

### Criei um volume (`/Volumes/workspace/default/delta`), já que no free tier o acesso a `/mnt/` é negado

In [0]:
%py
# Criando diretório Bronze
# Definir caminhos base
# BRONZE_PATH = "/mnt/delta/bronze"
# BRONZE_PATH = "/mnt/delta/bronze"
BRONZE_PATH = "/Volumes/workspace/default/delta/bronze/telemetria/iot"
SILVER_PATH = "/Volumes/workspace/default/delta/silver/telemetria/iot"
SOURCE_PATH = "dbfs:/databricks-datasets/"

# Criar diretórios se não existirem
dbutils.fs.mkdirs(BRONZE_PATH)
dbutils.fs.mkdirs(SILVER_PATH)

## Verificando os dados de teste disponíveis

In [0]:
# List the entries directly under SOURCE_PATH to see which sample datasets are available.
display(dbutils.fs.ls(SOURCE_PATH))


### Dataset de IoT

In [0]:
%fs
ls /databricks-datasets/iot

### Consultando os arquivos e salvando em delta parquet

In [0]:
# Read the IoT devices sample file (JSON) into a DataFrame and preview it.
fold = "dbfs:/databricks-datasets/iot/iot_devices.json"
df_iot = spark.read.json(fold)
display(df_iot)

In [0]:
# Print the schema of the IoT DataFrame.
df_iot.printSchema()
# Columns may contain null values.

In [0]:
# Preview the raw IoT DataFrame.
# NOTE(review): the load cell already calls display(df_iot); this cell looks redundant.
display(df_iot)

In [0]:
# Descriptive statistics for the numeric variables.
df_iot.describe("battery_level", "c02_level", "humidity", "temp").show()


In [0]:
from pyspark.sql.functions import regexp_extract

# Split device_name at its last hyphen:
#   prefix -> everything before the last "-"
#   code   -> the trailing run of non-hyphen characters
# regexp_extract yields an empty string when the pattern does not match,
# so a name without any hyphen gets an empty prefix.
df = (
    df_iot
    .withColumn("prefix", regexp_extract("device_name", "^(.*)-[^-]+$", 1))
    .withColumn("code", regexp_extract("device_name", "([^-]+)$", 1))
)

display(df.select("device_name", "prefix", "code"))


In [0]:
from pyspark.sql.functions import avg

# Average battery level per cca3 value (presumably a 3-letter country code),
# shown from the highest average to the lowest.
battery_country = (
    df
    .groupBy("cca3")
    .agg(avg("battery_level").alias("avg_battery"))
)
display(battery_country.sort("avg_battery", ascending=False))


In [0]:
from itertools import combinations

# Pairwise correlation between the sensor readings.
# corr(a, b) equals corr(b, a), and every df.stat.corr call runs its own Spark
# job, so each unordered pair is evaluated exactly once instead of twice.
cols = ["temp", "c02_level", "humidity"]
for c1, c2 in combinations(cols, 2):
    corr = df.stat.corr(c1, c2)
    print(f"Correlação entre {c1} e {c2}: {corr:.2f}")


In [0]:
# Show the readings together with their coordinates so they can be viewed on a map.
# Original note: Databricks recognizes latitude/longitude automatically.
# NOTE(review): confirm — the map visualization may have to be selected manually.
display(df.select("latitude", "longitude", "temp", "c02_level", "battery_level"))


In [0]:
from pyspark.sql.functions import col, mean, stddev

# Flag readings whose temperature lies more than `threshold` standard
# deviations away from the overall mean temperature.
threshold = 2  # number of standard deviations

# A global aggregate always yields exactly one row.
stats = df.agg(
    mean(col("temp")).alias("mean_temp"),
    stddev(col("temp")).alias("std_temp"),
).first()
mean_temp = stats["mean_temp"]
std_temp = stats["std_temp"]

# between() is inclusive on both ends, so its negation is exactly
# "strictly above the upper bound or strictly below the lower bound".
lower_bound = mean_temp - threshold * std_temp
upper_bound = mean_temp + threshold * std_temp
df_anomalies = df.withColumn(
    "is_anomaly",
    ~col("temp").between(lower_bound, upper_bound),
)

anomaly_columns = ["device_id", "device_name", "temp", "latitude", "longitude"]
display(df_anomalies.filter("is_anomaly = true").select(*anomaly_columns))


In [0]:
from pyspark.sql.functions import from_unixtime, to_date

# Derive a calendar date from `timestamp`. The value is divided by 1000 before
# being handed to from_unixtime (which takes seconds), i.e. it is treated as
# epoch milliseconds.
df_time = df.withColumn(
    "date",
    to_date(from_unixtime(df["timestamp"] / 1000)),
)

# Average temperature and CO2 level per day, in chronological order.
# `avg` comes from the import in the earlier battery-by-country cell.
agg_time = (
    df_time
    .groupBy("date")
    .agg(
        avg("temp").alias("avg_temp"),
        avg("c02_level").alias("avg_c02"),
    )
)
display(agg_time.orderBy("date"))


In [0]:
# Preview the DataFrame that now carries the derived `date` column.
display(df_time)

In [0]:
# Save the raw IoT DataFrame to the bronze layer in Delta format.
# - Each table must have its own directory.
# - The data is written by path only: .saveAsTable(...) would register the
#   table in the catalog, which is not a good idea for the bronze layer.
# - No partitioning is applied.
# NOTE(review): the target directory is "flight/registro_voos" although the
# DataFrame holds IoT device data — looks like a leftover from a flights
# notebook; confirm with downstream readers before renaming.
(
    df_iot.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(f"{BRONZE_PATH}/flight/registro_voos")
)


In [0]:

# Read the airport codes text file.
# SOURCE_PATH already ends with "/", so the trailing slash is stripped before
# joining; otherwise the path would contain "//" and only work if the
# filesystem layer happens to normalize it.
file_path_fly_code = f"{SOURCE_PATH.rstrip('/')}/flights/airport-codes-na.txt"

# .csv() works for any delimited text file; here the separator is a tab.
# The first line is used as the header, column types are inferred, and
# leading/trailing whitespace around values is ignored.
df_fly_code = (
    spark.read
    .option("header", "true")
    .option("sep", "\t")
    .option("inferSchema", "true")
    .option("ignoreLeadingWhiteSpace", "true")
    .option("ignoreTrailingWhiteSpace", "true")
    .csv(file_path_fly_code)
)

# Show the DataFrame.
display(df_fly_code)

In [0]:
# Save the airport codes DataFrame to the bronze layer in Delta format,
# replacing whatever was previously stored at that location.
# NOTE(review): BRONZE_PATH is configured under ".../telemetria/iot", so this
# flights table lands inside the IoT tree — confirm that this is intended.
(
    df_fly_code.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(f"{BRONZE_PATH}/flight/codigo_voos")
)
