In [7]:
# Importando bibliotecas e inicializando sessão Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

import os
from warnings import filterwarnings
# Silence every Python warning so the notebook output stays uncluttered
filterwarnings("ignore")

# Input location: the 2010 flights summary Parquet part file, reached by going
# three directory levels up from the current working directory. Each
# parent-directory hop is handed to os.path.join as its own component instead
# of being assembled through string repetition with a hard-coded "/".
data_path = os.path.join(
    os.path.pardir,
    os.path.pardir,
    os.path.pardir,
    'data/flights-data/summary-data/parquet/2010-summary.parquet/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet',
)

# Create the Spark session for this notebook, or reuse one that already exists
spark = SparkSession.builder.appName("transformacoes").getOrCreate()
spark

In [29]:
# Load the original dataset from the Parquet file
df1_flights = spark.read.parquet(data_path)

# Keep only the flights whose origin is the United States
df2_flights_eua = df1_flights.filter(
    df1_flights["ORIGIN_COUNTRY_NAME"] == "United States"
)

# Sort the filtered set by flight count, largest first
df3_flights_eua_ordered = df2_flights_eua.sort(desc("count"))

In [30]:
# Where is the data? Printing a DataFrame displays its column names and types
# rather than any rows.
for number, frame in enumerate(
    (df1_flights, df2_flights_eua, df3_flights_eua_ordered), start=1
):
    print(f'DataFrame {number}: {frame}')

# Are the DataFrames equal?
frames_are_equal = df1_flights == df2_flights_eua == df3_flights_eua_ordered
print(f'\nOs DataFrames são iguais? {frames_are_equal}')

DataFrame 1: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]
DataFrame 2: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]
DataFrame 3: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

Os DataFrames são iguais? False


In [32]:
# Preview the first five rows of the ordered result
df3_flights_eua_ordered.show(n=5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|348113|
|           Canada|      United States|  8271|
|           Mexico|      United States|  6200|
|   United Kingdom|      United States|  1629|
|          Germany|      United States|  1392|
+-----------------+-------------------+------+
only showing top 5 rows



In [38]:
# Show a five-row sample of each DataFrame under its own heading
titled_frames = [
    ('DataFrame 1: Dados originais', df1_flights),
    ('DataFrame 2: Vôos com origem nos Estados Unidos', df2_flights_eua),
    ('DataFrame 3: Principais destinos de vôos americanos', df3_flights_eua_ordered),
]
for title, frame in titled_frames:
    print(title)
    frame.show(5)

DataFrame 1: Dados originais
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

DataFrame 2: Vôos com origem nos Estados Unidos
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|           Guyana|      United States|   17|
+-----------------+-------------------+-----+
only showing top 5 rows

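DataFrame 3: Principais destinos de vôos americanos
+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|348113|
|           Canada|      United States|  8271|
|           Mexico|      United States|  6200|
|   United Kingdom|      United States|  1629|
|          Germany|      United States|  1392|
+-----------------+-------------------+------+
only showing top 5 rows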

In [39]:
# Inspect the physical execution plan built for the DataFrame API pipeline
df3_flights_eua_ordered.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#193L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#193L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#532]
      +- Filter (isnotnull(ORIGIN_COUNTRY_NAME#192) AND (ORIGIN_COUNTRY_NAME#192 = United States))
         +- FileScan parquet [DEST_COUNTRY_NAME#191,ORIGIN_COUNTRY_NAME#192,count#193L] Batched: true, DataFilters: [isnotnull(ORIGIN_COUNTRY_NAME#192), (ORIGIN_COUNTRY_NAME#192 = United States)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/hadoop/dev/panini-tech-lab/data/flights-data/summary-data/p..., PartitionFilters: [], PushedFilters: [IsNotNull(ORIGIN_COUNTRY_NAME), EqualTo(ORIGIN_COUNTRY_NAME,United States)], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:bigint>




In [52]:
# Register the raw DataFrame as a temporary view so Spark SQL can query it
df1_flights.createOrReplaceTempView('tbl_flights_raw')

# Top flights departing from the United States, expressed in SQL
top_us_departures_query = """
    SELECT
        *
    FROM tbl_flights_raw
    WHERE ORIGIN_COUNTRY_NAME = 'United States'
    ORDER BY count DESC
"""
df_sql = spark.sql(top_us_departures_query)

# Preview the first five rows of the query result
df_sql.show(n=5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|348113|
|           Canada|      United States|  8271|
|           Mexico|      United States|  6200|
|   United Kingdom|      United States|  1629|
|          Germany|      United States|  1392|
+-----------------+-------------------+------+
only showing top 5 rows



In [53]:
# Inspect the physical execution plan built for the Spark SQL query
df_sql.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#193L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#193L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#853]
      +- Filter (isnotnull(ORIGIN_COUNTRY_NAME#192) AND (ORIGIN_COUNTRY_NAME#192 = United States))
         +- FileScan parquet [DEST_COUNTRY_NAME#191,ORIGIN_COUNTRY_NAME#192,count#193L] Batched: true, DataFilters: [isnotnull(ORIGIN_COUNTRY_NAME#192), (ORIGIN_COUNTRY_NAME#192 = United States)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/hadoop/dev/panini-tech-lab/data/flights-data/summary-data/p..., PartitionFilters: [], PushedFilters: [IsNotNull(ORIGIN_COUNTRY_NAME), EqualTo(ORIGIN_COUNTRY_NAME,United States)], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:bigint>


