In [17]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types, functions as F

In [18]:
# Create (or reuse) the notebook's Spark session, running locally.
# The legacy `nanosAsLong` flag is switched on so that Parquet columns with
# nanosecond timestamps are surfaced as long integers (see the raw
# lpep_* values printed further down) rather than rejected.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .config("spark.sql.legacy.parquet.nanosAsLong", "true")
    .getOrCreate()
)

In [19]:
# Path fragments that later cells concatenate (root + service + optional
# output sub-folder). Each one keeps its trailing slash because the cells
# join them with plain string `+`.
data_folder = "taxi_data/"   # root of all taxi data
data_green = "green/"        # green-taxi sub-folder under the root
data_yellow = "yellow/"      # yellow-taxi sub-folder under the root
parquet = "pq/"              # sub-folder that receives the repartitioned parquet output

In [20]:
# Explicit schema applied when reading the yellow-taxi CSV files.
# Each entry is (column name, Spark type class); every field is nullable.
yellow_schema = types.StructType([
    types.StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("VendorID", types.IntegerType),
        ("tpep_pickup_datetime", types.TimestampType),
        ("tpep_dropoff_datetime", types.TimestampType),
        ("passenger_count", types.IntegerType),
        ("trip_distance", types.DoubleType),
        ("RatecodeID", types.IntegerType),
        ("store_and_fwd_flag", types.StringType),
        ("PULocationID", types.IntegerType),
        ("DOLocationID", types.IntegerType),
        ("payment_type", types.IntegerType),
        ("fare_amount", types.DoubleType),
        ("extra", types.DoubleType),
        ("mta_tax", types.DoubleType),
        ("tip_amount", types.DoubleType),
        ("tolls_amount", types.DoubleType),
        ("improvement_surcharge", types.DoubleType),
        ("total_amount", types.DoubleType),
        ("congestion_surcharge", types.DoubleType),
    )
])

# Shell escape: fetch fhvhv_tripdata_2021-01.parquet with wget (wget saves
# into the current working directory unless told otherwise).
# NOTE(review): no visible cell reads this file — confirm the download is still needed.
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-01.parquet

In [None]:
# Convert every monthly yellow-taxi CSV (2019-2020) into a 4-partition
# parquet dataset under <data_folder><data_yellow><parquet>.
years = [2019, 2020]
for year in years:
    for month in range(1, 13):
        file_name = f'yellow_tripdata_{year}-{month:02d}.csv.gz'
        pq_name = f'yellow_tripdata_{year}-{month:02d}.parquet'
        print(f'Converting {file_name} to {pq_name}')

        # The schema is given explicitly. .option("inferSchema", "true") would
        # also work, but it is very expensive in terms of computing and time.
        df_yellow = (
            spark.read
            .option("header", "true")
            .schema(yellow_schema)
            .csv(data_folder + data_yellow + file_name)
        )

        # mode("overwrite") keeps the cell re-runnable: with the default save
        # mode a second run fails because the output path already exists.
        (
            df_yellow
            .repartition(4)
            .write
            .mode("overwrite")
            .parquet(data_folder + data_yellow + parquet + pq_name)
        )


In [41]:
# Load all green-taxi parquet data found under <data_folder><data_green>.
# The former .option("header", "true") was dropped: it is a CSV reader option
# and has no effect on a parquet read.
# NOTE(review): the conversion cell further down writes its output beneath
# this same folder (green/pq/) — confirm that a re-run of this read does not
# pick those converted files up as well.
df_green = spark.read.parquet(data_folder + data_green)

df_green.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 1545405449000000000|  1545405537000000000|                 N|         1|         264|         264|              5|          0.0|        3.0|  0.5|    0.

In [42]:
# The lpep_* columns are loaded as epoch nanoseconds stored in longs (see the
# `nanosAsLong` session setting and the raw values shown above), so they are
# divided down to epoch seconds before being cast to timestamps.
# 1e9 is the same value as the previous `10e8`, which was easy to misread as 1e8.
NANOS_PER_SECOND = 1e9

# Only convert while the raw columns are still present. This makes the cell
# safe to re-run: a second run used to fail because the first run had already
# dropped the lpep_* columns from df_green.
if {'lpep_pickup_datetime', 'lpep_dropoff_datetime'} <= set(df_green.columns):
    df_green = (
        df_green
        .withColumn('pickup_datetime', F.to_timestamp(F.col('lpep_pickup_datetime') / NANOS_PER_SECOND))
        .withColumn('dropoff_datetime', F.to_timestamp(F.col('lpep_dropoff_datetime') / NANOS_PER_SECOND))
        .drop('lpep_pickup_datetime', 'lpep_dropoff_datetime')
    )

# Spot-check the converted columns.
df_green.select('dropoff_datetime', 'pickup_datetime').show(10)

+-------------------+-------------------+
|   dropoff_datetime|    pickup_datetime|
+-------------------+-------------------+
|2018-12-21 15:18:57|2018-12-21 15:17:29|
|2019-01-01 00:16:32|2019-01-01 00:10:16|
|2019-01-01 00:31:38|2019-01-01 00:27:11|
|2019-01-01 01:04:54|2019-01-01 00:46:20|
|2019-01-01 00:39:43|2019-01-01 00:19:06|
|2019-01-01 00:19:09|2019-01-01 00:12:35|
|2019-01-01 01:00:01|2019-01-01 00:47:55|
|2019-01-01 00:30:50|2019-01-01 00:12:47|
|2019-01-01 00:39:46|2019-01-01 00:16:23|
|2019-01-01 01:19:02|2019-01-01 00:58:02|
+-------------------+-------------------+
only showing top 10 rows



In [44]:
# Rewrite every monthly green-taxi parquet file (2019-2020) as a 4-partition
# parquet dataset under <data_folder><data_green><parquet>, converting the
# epoch-nanosecond lpep_* columns to proper timestamps on the way.
years = [2019, 2020]
for year in years:
    for month in range(1, 13):
        pq_name = f'green_tripdata_{year}-{month:02d}.parquet'
        print(f'Converting {pq_name} to partitioned file.')

        # No "header" option or schema inference here: those are CSV reader
        # settings, and the parquet read takes its schema from the file.
        df_green = spark.read.parquet(data_folder + data_green + pq_name)

        # Nanoseconds -> seconds; 1e9 equals the previous `10e8` divisor but
        # cannot be misread as 1e8.
        nanos_per_second = 1e9

        # mode('overwrite') keeps the cell re-runnable: with the default save
        # mode a second run fails because the output path already exists.
        (
            df_green
            .withColumn('pickup_datetime', F.to_timestamp(df_green["lpep_pickup_datetime"] / nanos_per_second))
            .withColumn('dropoff_datetime', F.to_timestamp(df_green["lpep_dropoff_datetime"] / nanos_per_second))
            .drop('lpep_pickup_datetime', 'lpep_dropoff_datetime')
            .repartition(4)
            .write
            .mode('overwrite')
            .parquet(data_folder + data_green + parquet + pq_name)
        )


Converting green_tripdata_2019-01.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-02.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-03.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-04.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-05.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-06.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-07.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-08.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-09.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-10.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-11.parquet to partitioned file.


                                                                                

Converting green_tripdata_2019-12.parquet to partitioned file.


                                                                                

Converting green_tripdata_2020-01.parquet to partitioned file.


                                                                                

Converting green_tripdata_2020-02.parquet to partitioned file.


                                                                                

Converting green_tripdata_2020-03.parquet to partitioned file.


                                                                                

Converting green_tripdata_2020-04.parquet to partitioned file.
Converting green_tripdata_2020-05.parquet to partitioned file.
Converting green_tripdata_2020-06.parquet to partitioned file.
Converting green_tripdata_2020-07.parquet to partitioned file.
Converting green_tripdata_2020-08.parquet to partitioned file.
Converting green_tripdata_2020-09.parquet to partitioned file.
Converting green_tripdata_2020-10.parquet to partitioned file.
Converting green_tripdata_2020-11.parquet to partitioned file.
Converting green_tripdata_2020-12.parquet to partitioned file.
