In [30]:
import pyspark
from pyspark.sql import SparkSession

## Build Spark Session (local master)

In [31]:
# Reuse the active SparkSession if there is one; otherwise start a local
# session ("local[*]" asks Spark to run locally using all available cores).
spark = SparkSession.builder.master("local[*]").appName("DataEngineeringZoomCamp").getOrCreate()

## Read DataFrame

### Schemaless Read

In [32]:
# One-time setup: uncomment to download and decompress
# fhv_tripdata_2019-10.csv, the file read by the cells below.
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
# !gunzip fhv_tripdata_2019-10.csv.gz && rm -fr fhv_tripdata_2019-10.csv.gz

In [33]:
# Read the raw CSV, taking column names from the first line. No schema is
# supplied and inference is not requested, so Spark loads every column as a string.
df = spark.read.option("header", True).csv("fhv_tripdata_2019-10.csv")

In [34]:
# Preview the first five rows as a text table.
df.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   NULL|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   NULL|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   NULL|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   NULL|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   NULL|                B00014|
+--------------------+------------------

### Schema Read

In [35]:
from pyspark.sql import types

# Explicit schema for the FHV trip CSV; every field is nullable.
# NOTE(review): the file header shown by the schemaless read has a seventh
# column, Affiliated_base_number, that is not declared here, and some header
# spellings differ in case (dropOff_datetime, PUlocationID, DOlocationID).
# Rows read with this schema come back with only these six fields, so the
# seventh column is dropped — confirm that this is intended.
schema = (
    types.StructType()
    .add("dispatching_base_num", types.StringType())
    .add("pickup_datetime", types.TimestampType())
    .add("dropoff_datetime", types.TimestampType())
    .add("PULocationID", types.IntegerType())
    .add("DOLocationID", types.IntegerType())
    .add("SR_Flag", types.IntegerType())
)

In [36]:
# Re-read the same CSV, this time applying the explicit schema so that
# timestamps and integer IDs are parsed into typed columns.
df = spark.read.schema(schema).option("header", True).csv("fhv_tripdata_2019-10.csv")

In [37]:
# Fetch the first five rows to the driver as a list of Row objects.
df.take(5)

[Row(dispatching_base_num='B00009', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 23), dropoff_datetime=datetime.datetime(2019, 10, 1, 0, 35), PULocationID=264, DOLocationID=264, SR_Flag=None),
 Row(dispatching_base_num='B00013', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 11, 29), dropoff_datetime=datetime.datetime(2019, 10, 1, 0, 13, 22), PULocationID=264, DOLocationID=264, SR_Flag=None),
 Row(dispatching_base_num='B00014', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 11, 43), dropoff_datetime=datetime.datetime(2019, 10, 1, 0, 37, 20), PULocationID=264, DOLocationID=264, SR_Flag=None),
 Row(dispatching_base_num='B00014', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 56, 29), dropoff_datetime=datetime.datetime(2019, 10, 1, 0, 57, 47), PULocationID=264, DOLocationID=264, SR_Flag=None),
 Row(dispatching_base_num='B00014', pickup_datetime=datetime.datetime(2019, 10, 1, 0, 23, 9), dropoff_datetime=datetime.datetime(2019, 10, 1, 0, 28, 27), PULocationID=264, DOLocationID

## Save as Parquet

In [38]:
# Redistribute the data into 6 partitions, then write it as Parquet,
# replacing anything already stored at the target path.
df.repartition(6).write.mode("overwrite").parquet("fhvtripdata/2019/10/")

### Read from Parquet

In [39]:
# Load the Parquet dataset written above back into a DataFrame.
df = spark.read.format("parquet").load("fhvtripdata/2019/10/")

In [40]:
# Print the column names, types and nullability of the DataFrame read back from Parquet.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)

