In [15]:
# Importando biblitoecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, \
    StringType, IntegerType, DoubleType, LongType
from pyspark.sql.functions import col, expr
import os 
from warnings import filterwarnings
filterwarnings("ignore")

# Criando objeto de sessão
spark = SparkSession\
    .builder\
    .appName("filtrando-registros")\
    .getOrCreate()

# Definindo variáveis de diretório
home_path = os.path.expanduser('~')
data_path = os.path.join(home_path, 'dev/panini-tech-lab/data')
flights_path = os.path.join(data_path, 'flights-data/summary-data/csv/2015-summary.csv')
iot_path = os.path.join(data_path, 'iot-devices/iot_devices.json')

# Definindo schema para o arquivo CSV a ser lido
flights_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de destino dos vôos contabilizados"}),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True, metadata={"description": "País de origem dos vôos contabilizados"}),
    StructField("count", IntegerType(), nullable=True, metadata={"description": "Contagem total de vôos entre os países de origem e de destino do registro"})
])

iot_schema = StructType([
    StructField("device_id", IntegerType(), nullable=False),
    StructField("device_name", StringType(), nullable=True),
    StructField("ip", StringType(), nullable=True),
    StructField("cca2", StringType(), nullable=True),
    StructField("cca3", StringType(), nullable=True),
    StructField("cn", StringType(), nullable=True),
    StructField("latitude", DoubleType(), nullable=True),
    StructField("longitude", DoubleType(), nullable=True),
    StructField("scale", StringType(), nullable=True),
    StructField("temp", IntegerType(), nullable=True),
    StructField("humidity", IntegerType(), nullable=True),
    StructField("battery_level", StringType(), nullable=True),
    StructField("c02_level", IntegerType(), nullable=True),
    StructField("lcd", StringType(), nullable=True),
    StructField("timestamp", LongType(), nullable=False)
])

# Lendo dados
df_flights = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(flights_path)


df_iot = spark.read.format("json")\
    .schema(iot_schema).load(iot_path)

# Criando tabelas temporárias
df_flights.createOrReplaceTempView("tbl_flights")
df_iot.createOrReplaceTempView("tbl_iot")

In [16]:
# Verificando dados de vôos
df_flights.printSchema()
df_flights.show(5)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [3]:
# Verificando dados de dispositivos iot
df_iot.printSchema()
df_iot.show(5)

root
 |-- device_id: integer (nullable = true)
 |-- device_name: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- cca2: string (nullable = true)
 |-- cca3: string (nullable = true)
 |-- cn: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- scale: string (nullable = true)
 |-- temp: integer (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- battery_level: string (nullable = true)
 |-- c02_level: integer (nullable = true)
 |-- lcd: string (nullable = true)
 |-- timestamp: long (nullable = true)

+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+----+--------+-------------+---------+------+-------------+
|device_id|         device_name|           ip|cca2|cca3|           cn|latitude|longitude|  scale|temp|humidity|battery_level|c02_level|   lcd|    timestamp|
+---------+--------------------+-------------+----+----+-------------+--------+---------+-------+-

In [4]:
# Vôos com origem e destino nos EUA
df_american_flights = df_flights.select(
    col("ORIGIN_COUNTRY_NAME").alias("origem"),
    col("DEST_COUNTRY_NAME").alias("destino"),
    col("count").alias("quantidade")
).where(expr("DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME"))

# Visualizando novo conjunto
df_american_flights.show(5)

+-------------+-------------+----------+
|       origem|      destino|quantidade|
+-------------+-------------+----------+
|United States|United States|    370002|
+-------------+-------------+----------+



In [5]:
# Vôos com origem e destino nos EUA
df_american_flights = df_flights.select(
    col("ORIGIN_COUNTRY_NAME").alias("origem"),
    col("DEST_COUNTRY_NAME").alias("destino"),
    col("count").alias("quantidade")
).where(col("DEST_COUNTRY_NAME") == col("ORIGIN_COUNTRY_NAME"))

# Visualizando novo conjunto
df_american_flights.show(5)

+-------------+-------------+----------+
|       origem|      destino|quantidade|
+-------------+-------------+----------+
|United States|United States|    370002|
+-------------+-------------+----------+



In [6]:
# Criando flag para vôos de mesma origem e destino
df_american_flights = df_flights.withColumn(
    "flag_within_country", expr("DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME")
).select("*").where(col("flag_within_country"))

# Visualizando dados
df_american_flights.show(5)

+-----------------+-------------------+------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|flag_within_country|
+-----------------+-------------------+------+-------------------+
|    United States|      United States|370002|               true|
+-----------------+-------------------+------+-------------------+



In [7]:
# Filtrando registros com SparkSQL
df_american_flights = spark.sql("""
    SELECT
        ORIGIN_COUNTRY_NAME AS pais_origem,
        DEST_COUNTRY_NAME AS pais_destino,
        count AS qtd_voos

    FROM tbl_flights

    WHERE ORIGIN_COUNTRY_NAME = DEST_COUNTRY_NAME
""")

# Visualizando dados
df_american_flights.show(5)

+-------------+-------------+--------+
|  pais_origem| pais_destino|qtd_voos|
+-------------+-------------+--------+
|United States|United States|  370002|
+-------------+-------------+--------+



In [17]:
# Ordenando vôos menos comuns
df_flights.orderBy("count").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [19]:
# Importando função
from pyspark.sql.functions import desc

# Ordenando principais registros de vôos
df_flights.orderBy(desc("count")).show(10)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
|    United States|     United Kingdom|  1970|
|            Japan|      United States|  1548|
|    United States|              Japan|  1496|
|          Germany|      United States|  1468|
+-----------------+-------------------+------+
only showing top 10 rows



In [20]:
# Top vôos com pelo menos 2000 registros
df_flights.where(expr("count > 2000")).orderBy(desc("count")).show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
+-----------------+-------------------+------+



In [26]:
# Top vôos com pelo menos 2000 registros
df_flights.where(expr("count > 2000")).orderBy(expr("count desc")).show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|   United Kingdom|      United States|  2025|
|           Mexico|      United States|  7140|
|    United States|             Mexico|  7187|
|           Canada|      United States|  8399|
|    United States|             Canada|  8483|
|    United States|      United States|370002|
+-----------------+-------------------+------+



In [27]:
spark.sql("""
    SELECT * FROM tbl_flights
    WHERE count >= 2000
    ORDER BY count DESC
""").show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
+-----------------+-------------------+------+

