In [6]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

In [3]:
# Assemble the spark-submit arguments as a list so each setting sits on its own
# line, then publish them through PYSPARK_SUBMIT_ARGS before the session is built.
# NOTE(review): the S3A access/secret keys are hard-coded here; consider sourcing
# them from the environment or a secret store before sharing this notebook.
_submit_args = [
    # S3A (MinIO) object-store access
    "--conf spark.hadoop.fs.s3a.endpoint=http://minio-ml-workshop:9000",
    "--conf spark.hadoop.fs.s3a.access.key=minio",
    "--conf spark.hadoop.fs.s3a.secret.key=minio123",
    "--conf spark.hadoop.fs.s3a.path.style.access=true",
    "--conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
    "--conf spark.hadoop.fs.s3a.multipart.size=104857600",
    # Extra jars: the S3A filesystem and the PostgreSQL JDBC driver
    "--packages org.apache.hadoop:hadoop-aws:3.2.0,org.postgresql:postgresql:42.3.3",
    # The master host comes from SPARK_CLUSTER; a missing variable raises KeyError
    "--master spark://{}:7077".format(os.environ['SPARK_CLUSTER']),
    "pyspark-shell",
]
# Single-space separated, with the same trailing space as the original literal.
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join(_submit_args) + " "

# Create (or reuse) the Spark application session
spark = (
    SparkSession.builder
    .appName("Enrich flights data")
    .getOrCreate()
)

In [9]:
# Columns that together identify a flight record.
# NOTE(review): no longer passed to the JDBC reader (see below); kept defined
# because cells outside this view may still reference it.
keys = ["year", "month", "day", "flight_number"]

# Read the flights table from PostgreSQL over JDBC.
#
# The original passed the misspelled option "paritionColumn" with a Python list,
# so no partitioned read was configured. Spark's JDBC source needs a single
# numeric/date/timestamp column in `partitionColumn`, together with `lowerBound`,
# `upperBound` and `numPartitions`.
#
# `month` is an integer column in the flights schema. The bounds only define the
# partition stride and do not filter rows: values outside [1, 13) still land in
# the first or last partition. 12 partitions with bounds 1..13 gives a stride of
# one month per partition.
df_flights = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://pg-flights-data:5432/postgres")
    .option("dbtable", "flights")
    .option("user", "postgres")
    .option("password", "postgres")
    .option("driver", "org.postgresql.Driver")
    .option("fetchsize", 1000)
    .option("partitionColumn", "month")
    .option("lowerBound", 1)
    .option("upperBound", 13)
    .option("numPartitions", 12)
    .load()
)

# Lookup tables stored as CSV in the S3A bucket. Headers supply column names and
# types are inferred. "delimiter" was misspelled as "delimeter" for airlines.
df_airlines = (
    spark.read
    .options(delimiter=',', inferSchema='True', header='True')
    .csv("s3a://airport-data/airlines.csv")
)
df_airports = (
    spark.read
    .options(delimiter=',', inferSchema='True', header='True')
    .csv("s3a://airport-data/airports.csv")
)

df_flights.printSchema()
df_airlines.printSchema()
df_airports.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- destination_airport: string (nullable = true)
 |-- scheduled_departure: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- departure_delay: integer (nullable = true)
 |-- taxi_out: integer (nullable = true)
 |-- wheels_off: string (nullable = true)
 |-- scheduled_time: integer (nullable = true)
 |-- elapsed_time: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- wheels_on: string (nullable = true)
 |-- taxi_in: integer (nullable = true)
 |-- scheduled_arrival: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- arrival_delay: integer (nullable =

In [10]:
def _with_prefix(df, prefix):
    """Return `df` with every column renamed to `prefix` + its original name.

    `toDF` builds a new projection, so the same source frame can be prefixed
    more than once and each result has its own distinct columns.
    """
    return df.toDF(*[f"{prefix}{name}" for name in df.columns])


# The airports table is joined twice (origin and destination). Joining the same
# DataFrame object twice with `df_airports.IATA_CODE` in both conditions makes
# the second condition ambiguous, and leaves several identically named columns
# (e.g. IATA_CODE) in the result. Each lookup therefore gets its own uniquely
# prefixed copy.
airlines_lookup = _with_prefix(df_airlines, "airline_")
origin_lookup = _with_prefix(df_airports, "origin_airport_")
destination_lookup = _with_prefix(df_airports, "destination_airport_")

# Inner joins, as in the original: flights without a matching airline or airport
# code are dropped. The lookup frames are broadcast to every executor.
# NOTE: `df_flights` is reassigned here, so re-run the load cell before
# re-running this one.
df_flights = (
    df_flights
    .join(
        broadcast(airlines_lookup),
        df_flights["airline"] == airlines_lookup["airline_IATA_CODE"],
    )
    .join(
        broadcast(origin_lookup),
        df_flights["origin_airport"] == origin_lookup["origin_airport_IATA_CODE"],
    )
    .join(
        broadcast(destination_lookup),
        df_flights["destination_airport"]
        == destination_lookup["destination_airport_IATA_CODE"],
    )
)

df_flights.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- destination_airport: string (nullable = true)
 |-- scheduled_departure: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- departure_delay: integer (nullable = true)
 |-- taxi_out: integer (nullable = true)
 |-- wheels_off: string (nullable = true)
 |-- scheduled_time: integer (nullable = true)
 |-- elapsed_time: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- wheels_on: string (nullable = true)
 |-- taxi_in: integer (nullable = true)
 |-- scheduled_arrival: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- arrival_delay: integer (nullable =