In [1]:
import pyspark
from pyspark.sql import SparkSession, types

# Start (or reuse) a Spark session running in local mode with the
# "local[*]" master and the application name 'test'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [None]:
# Column types are declared explicitly and handed to the CSV reader;
# every field is nullable.
schema = types.StructType([
    types.StructField('hvfhs_license_num', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
])

# Read the CSV, treating its first line as a header row.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('./fhvhv_tripdata_2021-01.csv')
)

# Quick sanity check of the first few parsed rows.
print(df.head(5))

# Repartition into 24 partitions before writing the Parquet copy.
df = df.repartition(24)

# The default save mode raises if the target directory already exists, which
# makes this cell fail on every re-run. Overwrite explicitly so the notebook
# survives "Restart & Run All".
df.write.mode('overwrite').parquet('fhvhv/2021/01/')

In [2]:
# Load the trips back from the Parquet directory and print the schema.
df = spark.read.parquet('fhvhv/2021/01/')
df.printSchema()

# Preview pickup/drop-off times and location IDs for rows whose
# hvfhs_license_num equals 'HV0003'.
(
    df
    .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show()
)

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-02 23:41:01|2021-01-02 23:44:14|          20|          20|
|2021-01-01 17:38:33|2021-01-01 17:42:16|         254|         254|
|2021-01-01 02:31:50|2021-01-01 02:42:34|         173|         260|
|2021-01-01 22:59:30|2021-01-01 23:13:00|          49|         226|
|2021-01-01 09:34:40|2021-01-01 09:38:49|         212|         213|
|2021-01-01 07:02:50|2021-01-01 07:20:06|         235|         116|
|2021-01-02 20:16:11|2021-01-02 20:35:53|         2

In [None]:
from pyspark.sql import functions as F

# Convert both timestamps to dates and preview them alongside the
# pickup/drop-off location IDs.
(
    df
    .select(
        F.to_date(df['pickup_datetime']).alias('pickup_date'),
        F.to_date(df['dropoff_datetime']).alias('dropoff_date'),
        'PULocationID',
        'DOLocationID',
    )
    .show()
)

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2021-01-01|  2021-01-01|         163|          48|
| 2021-01-01|  2021-01-01|         117|         201|
| 2021-01-02|  2021-01-02|          20|          20|
| 2021-01-01|  2021-01-01|         254|         254|
| 2021-01-01|  2021-01-01|         173|         260|
| 2021-01-01|  2021-01-01|          49|         226|
| 2021-01-01|  2021-01-01|         212|         213|
| 2021-01-01|  2021-01-01|         235|         116|
| 2021-01-02|  2021-01-02|         225|          97|
| 2021-01-01|  2021-01-01|          60|          51|
| 2021-01-02|  2021-01-02|         167|         116|
| 2021-01-01|  2021-01-01|          32|          20|
| 2021-01-02|  2021-01-02|         181|         249|
| 2021-01-01|  2021-01-01|         151|          24|
| 2021-01-02|  2021-01-02|         236|         246|
| 2021-01-02|  2021-01-02|         188|       

In [5]:
def crazy_stuff(base_num):
    """Encode a base number string as a prefixed hexadecimal code.

    The first character of ``base_num`` is discarded and the remainder is
    parsed as a base-10 integer. That integer is rendered as lowercase hex,
    zero-padded to at least three digits, and prefixed with:

    * ``s/`` when the integer is divisible by 7,
    * ``a/`` when it is divisible by 3 (but not by 7),
    * ``e/`` otherwise.

    Args:
        base_num: String such as ``'B02835'``, or ``None``.

    Returns:
        The encoded string, or ``None`` when ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    # A null cell reaches a Python UDF as None; propagate the null instead of
    # failing on the slice below with a TypeError.
    if base_num is None:
        return None

    num = int(base_num[1:])

    # Divisibility by 7 takes precedence over divisibility by 3.
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'

    return f'{prefix}/{num:03x}'

In [7]:
# Wrap the plain Python function as a Spark UDF that returns strings.
crazy_stuff_udf = F.udf(crazy_stuff, types.StringType())

# Add the date columns and the UDF-derived base_id, then preview the result.
(
    df
    .withColumn("pickup_date", F.to_date(df['pickup_datetime']))
    .withColumn("dropoff_date", F.to_date(df['dropoff_datetime']))
    .withColumn("base_id", crazy_stuff_udf(df['dispatching_base_num']))
    .select('base_id', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

+-------+-----------+------------+------------+------------+
|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------+-----------+------------+------------+------------+
|  e/9ce| 2021-01-01|  2021-01-01|         163|          48|
|  e/9ce| 2021-01-01|  2021-01-01|         117|         201|
|  s/b13| 2021-01-02|  2021-01-02|          20|          20|
|  e/b42| 2021-01-01|  2021-01-01|         254|         254|
|  e/acc| 2021-01-01|  2021-01-01|         173|         260|
|  e/b3b| 2021-01-01|  2021-01-01|          49|         226|
|  e/b38| 2021-01-01|  2021-01-01|         212|         213|
|  a/a7a| 2021-01-01|  2021-01-01|         235|         116|
|  e/b32| 2021-01-02|  2021-01-02|         225|          97|
|  e/b3c| 2021-01-01|  2021-01-01|          60|          51|
|  e/b32| 2021-01-02|  2021-01-02|         167|         116|
|  e/b3f| 2021-01-01|  2021-01-01|          32|          20|
|  e/9ce| 2021-01-02|  2021-01-02|         181|         249|
|  s/af0| 2021-01-01|  2