In [1]:
# Importando bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField,\
    IntegerType, StringType
from pyspark.sql.functions import col, expr
import os

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("joins-spark")
    .getOrCreate()
)

# Data directories, resolved relative to the current user's home folder
home_path = os.path.expanduser("~")
data_path = os.path.join(home_path, "dev/panini-tech-lab/data")
flights_path = os.path.join(data_path, "flights-data")

# Input files read below
summary_path = os.path.join(flights_path, "summary-data/csv/2015-summary.csv")
airports_codes_path = os.path.join(flights_path, "airport-codes-na/airport-codes-na.txt")
departure_delay_path = os.path.join(flights_path, "departure-delays/departuredelays.csv")


def _read_csv_with_header(path, sep=None):
    """Load a CSV file that has a header row, letting Spark infer column types.

    Args:
        path: Location of the file to load.
        sep: Optional field separator. When omitted, no separator option is
            set and the reader's own default applies.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    reader = (
        spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
    )
    if sep is not None:
        reader = reader.option("sep", sep)
    return reader.load(path)


# Flight summary data
df_flights = _read_csv_with_header(summary_path)

# Airport-code lookup table (tab-separated file)
df_air_codes = _read_csv_with_header(airports_codes_path, sep="\t")

# Flight departure data
df_departure = _read_csv_with_header(departure_delay_path)

22/08/15 21:18:01 WARN Utils: Your hostname, panini-ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.0.110 instead (on interface enp3s0)
22/08/15 21:18:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/15 21:18:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

In [2]:
# Inspect the flight summary data: schema first, then a 5-row sample
df_flights.printSchema()
df_flights.show(n=5)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [3]:
# Inspect the airport-code lookup: schema first, then a 5-row sample
df_air_codes.printSchema()
df_air_codes.show(n=5)

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [7]:
# Inspect the flight departure data: schema first, then a 15-row sample
df_departure.printSchema()
df_departure.show(n=15)

root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
|1061215|   -6|     602|   ABE|        ATL|
|1061725|   69|     602|   ABE|        ATL|
|1061230|    0|     369|   ABE|        DTW|
|1060625|   -3|     602|   ABE|        ATL|
|1070600|    0|     369|   ABE|        DTW|
+----

## Exemplos Práticos

### Enriquecendo os vôos com as cidades de origem e destino

In [26]:
# Enrich every departure record with the city of its origin and destination
# airports.
#
# The airport lookup is joined twice (once per role). Each copy, and the
# flight table itself, gets an explicit alias so that every join condition
# and every selected column names the exact relation it comes from instead
# of relying on Spark resolving a column object that belongs to a different
# DataFrame than the one being joined.
origin_airports = df_air_codes.alias("origin_airport")
dest_airports = df_air_codes.alias("dest_airport")

df_departure_airports = (
    df_departure.alias("flight")
    # Left joins keep flights whose airport code is absent from the lookup;
    # the corresponding city column is then null.
    .join(
        other=origin_airports,
        on=(col("flight.origin") == col("origin_airport.IATA")),
        how="left",
    )
    .join(
        other=dest_airports,
        on=(col("flight.destination") == col("dest_airport.IATA")),
        how="left",
    )
    .select(
        col("flight.date"),
        col("flight.delay"),
        col("flight.distance"),
        col("flight.origin").alias("airport_origin"),
        col("origin_airport.City").alias("city_origin"),
        col("flight.destination").alias("airport_dest"),
        col("dest_airport.City").alias("city_destination"),
    )
)

# Show a 5-row sample of the enriched result
df_departure_airports.show(5)

+-------+-----+--------+--------------+-----------+------------+----------------+
|   date|delay|distance|airport_origin|city_origin|airport_dest|city_destination|
+-------+-----+--------+--------------+-----------+------------+----------------+
|1011245|    6|     602|           ABE|  Allentown|         ATL|         Atlanta|
|1020600|   -8|     369|           ABE|  Allentown|         DTW|         Detroit|
|1021245|   -2|     602|           ABE|  Allentown|         ATL|         Atlanta|
|1020605|   -4|     602|           ABE|  Allentown|         ATL|         Atlanta|
|1031245|   -4|     602|           ABE|  Allentown|         ATL|         Atlanta|
+-------+-----+--------+--------------+-----------+------------+----------------+
only showing top 5 rows



### Aeroportos com maior média de atraso

In [33]:
# Average departure delay and number of flights per origin airport, ordered
# from the highest average delay down. The left join keeps airports that are
# missing from the lookup table; their City is null.
df_most_delayed = (
    df_departure
    .join(
        df_air_codes,
        df_departure["origin"] == df_air_codes["IATA"],
        "left",
    )
    .groupBy("origin", "City")
    .agg(
        expr("round(avg(delay), 2) AS avg_delay"),
        expr("count(1) AS qtd_voos"),
    )
    .orderBy(col("avg_delay").desc())
)

# Show the ten airports with the highest average delay
df_most_delayed.show(n=10)



+------+-------------+---------+--------+
|origin|         City|avg_delay|qtd_voos|
+------+-------------+---------+--------+
|   GUM|         null|    33.88|      90|
|   LSE|    La Crosse|    26.53|     154|
|   MQT|    Marquette|    23.87|      77|
|   EGE|         Vail|    20.57|     877|
|   ROA|      Roanoke|    19.89|     470|
|   MDW|      Chicago|    19.66|   20056|
|   BTV|   Burlington|    18.72|     788|
|   ORD|      Chicago|    18.59|   64228|
|   IAD|Washington DC|     18.4|   14136|
|   SCE|State College|    17.92|     167|
+------+-------------+---------+--------+
only showing top 10 rows



                                                                                

### Rotas mais comuns entre cidades

In [45]:
# Step 1: attach the origin airport's lookup row to every departure.
# The left join keeps departures whose origin code is not in the lookup.
df_city_origin = df_departure.join(
    df_air_codes,
    df_departure["origin"] == df_air_codes["IATA"],
    "left",
)

# Step 2: keep only the origin city and the destination airport code
df_city_origin_select = df_city_origin.select(
    expr("City AS city_origin"),
    expr("destination"),
)

# Step 3: attach the destination airport's lookup row (left join again)
df_city_dest = df_city_origin_select.join(
    df_air_codes,
    df_city_origin_select["destination"] == df_air_codes["IATA"],
    "left",
)

# Step 4: keep only the two city names
df_city_dest_select = df_city_dest.select(
    expr("city_origin"),
    expr("City AS city_dest"),
)

# Step 5: count flights per (origin city, destination city) pair
df_city_grouped = (
    df_city_dest_select
    .groupBy("city_origin", "city_dest")
    .agg(expr("count(1) AS count"))
)

# Step 6: most frequent routes first
df_city_ordered = df_city_grouped.orderBy(col("count").desc())

# Show the ten most frequent city-to-city routes
df_city_ordered.show(n=10)



+--------------+--------------+-----+
|   city_origin|     city_dest|count|
+--------------+--------------+-----+
|        Dallas|       Houston| 4597|
|       Houston|        Dallas| 4572|
| San Francisco|   Los Angeles| 3232|
|   Los Angeles| San Francisco| 3198|
|     Las Vegas|   Los Angeles| 3016|
|   Los Angeles|     Las Vegas| 2964|
|      New York|       Atlanta| 2954|
|       Atlanta|      New York| 2951|
|Honolulu, Oahu|          null| 2895|
|          null|Honolulu, Oahu| 2869|
+--------------+--------------+-----+
only showing top 10 rows



                                                                                