# Data Preprocessing

In [1]:
# import context manager: SparkSession
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build (or reuse) the notebook-wide Spark session, running locally on all cores.
# NOTE(review): with master "local[*]" the spark.executor.* settings below
# presumably have no effect and only spark.driver.memory matters — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("mllib_classifier")
    .config("spark.executor.memory", "20g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "3")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# SparkContext handle, needed by the RDD-based benchmark cell.
sc = spark.sparkContext

## Benchmark DF Approach

In [11]:
# Load the raw trips CSV; the first line supplies the column names.
# No schema is given, so every column comes back as a string.
data_file = spark.read.option("header", True).csv("/../../project/ds5559/Alice_Ed_Michael_Sam_project/BigTrips.csv")

In [12]:
# Show the column names and types of the raw CSV as loaded.
data_file.printSchema()

root
 |-- Trip ID: string (nullable = true)
 |-- Trip Start Timestamp: string (nullable = true)
 |-- Trip End Timestamp: string (nullable = true)
 |-- Trip Seconds: string (nullable = true)
 |-- Trip Miles: string (nullable = true)
 |-- Pickup Census Tract: string (nullable = true)
 |-- Dropoff Census Tract: string (nullable = true)
 |-- Pickup Community Area: string (nullable = true)
 |-- Dropoff Community Area: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Tip: string (nullable = true)
 |-- Additional Charges: string (nullable = true)
 |-- Trip Total: string (nullable = true)
 |-- Shared Trip Authorized: string (nullable = true)
 |-- Trips Pooled: string (nullable = true)
 |-- Pickup Centroid Latitude: string (nullable = true)
 |-- Pickup Centroid Longitude: string (nullable = true)
 |-- Pickup Centroid Location: string (nullable = true)
 |-- Dropoff Centroid Latitude: string (nullable = true)
 |-- Dropoff Centroid Longitude: string (nullable = true)
 |-- Dropoff 

Raw data file is ~12 gigabytes. Taking a 25% random sample would yield a ~3 gigabyte file. The commented-out line is for testing on much smaller subsamples; as written, the active line uses the full file.

In [13]:
# Working DataFrame for the rest of the preprocessing pipeline.
# For quick smoke tests, swap the assignment for the sampled variant:
#final_DF = data_file.sample(0.25).sample(0.0001)
final_DF = data_file

# Drop the old name so only `final_DF` refers to the data from here on.
del data_file

# Mark the DataFrame for caching; the returned DataFrame is the cell's output.
final_DF.cache()

DataFrame[Trip ID: string, Trip Start Timestamp: string, Trip End Timestamp: string, Trip Seconds: string, Trip Miles: string, Pickup Census Tract: string, Dropoff Census Tract: string, Pickup Community Area: string, Dropoff Community Area: string, Fare: string, Tip: string, Additional Charges: string, Trip Total: string, Shared Trip Authorized: string, Trips Pooled: string, Pickup Centroid Latitude: string, Pickup Centroid Longitude: string, Pickup Centroid Location: string, Dropoff Centroid Latitude: string, Dropoff Centroid Longitude: string, Dropoff Centroid Location: string]

In [14]:
# Convert the start/end timestamp strings into real timestamp columns,
# overwriting the original string columns under the same names.
# References:
#https://stackoverflow.com/questions/53304688/spark-date-format-mmm-dd-yyyy-hhmmss-am-to-timestamp-in-df
#https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
timestamp_pattern = "MM/dd/yyyy hh:mm:ss a"
for ts_col in ("Trip Start Timestamp", "Trip End Timestamp"):
    final_DF = final_DF.withColumn(ts_col, F.to_timestamp(F.col(ts_col), timestamp_pattern))

In [15]:
#https://stackoverflow.com/questions/49397966/in-pyspark-how-do-you-add-concat-a-string-to-a-column
# Build a "<day>-<month>" string key from the trip start (no zero padding,
# e.g. "25-12"), used later to select fixed-date holidays.
start_ts = F.col("Trip Start Timestamp")
day_month = F.concat(F.dayofmonth(start_ts), F.lit("-"), F.month(start_ts)).cast("string")
final_DF = final_DF.withColumn("Day_Month_str", day_month)


In [16]:
# Target Spark SQL type for each column that was read from the CSV as a string.
column_types = {
    "Trip Seconds": "integer",
    "Trip Miles": "double",
    "Pickup Community Area": "integer",
    "Dropoff Community Area": "integer",
    "Fare": "double",
    "Tip": "double",
    "Additional Charges": "double",
    "Trip Total": "double",
    "Shared Trip Authorized": "boolean",
    "Trips Pooled": "integer",
}

# Cast each column in place (same name, same position, new type).
for col_name, spark_type in column_types.items():
    final_DF = final_DF.withColumn(col_name, F.col(col_name).cast(spark_type))

In [17]:
# Binary label: 1 when the trip has a positive tip, 0 in every other case.
tipped = F.col("Tip") > 0
final_DF = final_DF.withColumn("Tip_Flag", F.when(tipped, 1).otherwise(0))

In [18]:
#https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types
# Derive calendar features from the trip timestamps. All are based on the
# start timestamp except Trip_End_Hour, which uses the end timestamp.
start_ts = F.col("Trip Start Timestamp")
end_ts = F.col("Trip End Timestamp")

final_DF = (
    final_DF
    .withColumn("Trip_Year", F.year(start_ts))
    .withColumn("Trip_Month", F.month(start_ts))
    .withColumn("Trip_WeekNumber", F.weekofyear(start_ts))
    .withColumn("Trip_DayofWeek", F.dayofweek(start_ts))
    .withColumn("Trip_Start_Hour", F.hour(start_ts))
    .withColumn("Trip_End_Hour", F.hour(end_ts))
    .withColumn("Date", F.to_date(start_ts))
)


In [19]:
# Replace the space-separated source column names with underscore names.
# "Fare" and "Tip" contain no spaces and are left as they are.
# NOTE(review): "Additional Charges" becomes "Additional_Charges_str" even
# though the column has been cast to double — confirm the suffix is intended.
renamed_columns = {
    "Trip ID": "Trip_ID",
    "Trip Start Timestamp": "Trip_Start_Timestamp",
    "Trip End Timestamp": "Trip_End_Timestamp",
    "Trip Seconds": "Trip_Seconds",
    "Trip Miles": "Trip_Miles",
    "Pickup Census Tract": "Pickup_Census_Tract",
    "Dropoff Census Tract": "Dropoff_Census_Tract",
    "Pickup Community Area": "Pickup_Community_Area",
    "Dropoff Community Area": "Dropoff_Community_Area",
    "Additional Charges": "Additional_Charges_str",
    "Trip Total": "Trip_Total",
    "Shared Trip Authorized": "Shared_Trip_Authorized",
    "Trips Pooled": "Trips_Pooled",
    "Pickup Centroid Latitude": "Pickup_Centroid_Latitude",
    "Pickup Centroid Longitude": "Pickup_Centroid_Longitude",
    "Pickup Centroid Location": "Pickup_Centroid_Location",
    "Dropoff Centroid Latitude": "Dropoff_Centroid_Latitude",
    "Dropoff Centroid Longitude": "Dropoff_Centroid_Longitude",
    "Dropoff Centroid Location": "Dropoff_Centroid_Location",
}

for old_name, new_name in renamed_columns.items():
    final_DF = final_DF.withColumnRenamed(old_name, new_name)

In [20]:
%%time
# Count the rows of the preprocessed DataFrame; %%time reports how long it takes.
final_DF.count()

CPU times: user 1.71 ms, sys: 616 µs, total: 2.33 ms
Wall time: 6.09 s


49108003

In [21]:
# Show the schema after casting and renaming, to verify the new names and types.
final_DF.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Trip_Start_Timestamp: timestamp (nullable = true)
 |-- Trip_End_Timestamp: timestamp (nullable = true)
 |-- Trip_Seconds: integer (nullable = true)
 |-- Trip_Miles: double (nullable = true)
 |-- Pickup_Census_Tract: string (nullable = true)
 |-- Dropoff_Census_Tract: string (nullable = true)
 |-- Pickup_Community_Area: integer (nullable = true)
 |-- Dropoff_Community_Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: double (nullable = true)
 |-- Additional_Charges_str: double (nullable = true)
 |-- Trip_Total: double (nullable = true)
 |-- Shared_Trip_Authorized: boolean (nullable = true)
 |-- Trips_Pooled: integer (nullable = true)
 |-- Pickup_Centroid_Latitude: string (nullable = true)
 |-- Pickup_Centroid_Longitude: string (nullable = true)
 |-- Pickup_Centroid_Location: string (nullable = true)
 |-- Dropoff_Centroid_Latitude: string (nullable = true)
 |-- Dropoff_Centroid_Longitude: string (nullable = true

In [22]:
%%time
final_DF.write.parquet("final_dataset.parquet")

CPU times: user 42.5 ms, sys: 38.4 ms, total: 80.9 ms
Wall time: 11min 14s


In [4]:
%%time
# Load the Parquet dataset at "final_dataset.parquet" into a DataFrame named `test`.
test = spark.read.parquet("final_dataset.parquet")

CPU times: user 796 µs, sys: 768 µs, total: 1.56 ms
Wall time: 289 ms


In [6]:
# Row count of the reloaded Parquet data, for comparison with the count before writing.
test.count()

49108003

The above code cell takes approximately 700-800 milliseconds to run (five trials all fell within this range).

## Benchmark RDD Approach

In [2]:
# RDD-based benchmark: read the CSV as raw text lines, draw a small random
# sample, then rebuild the timestamp and day-month columns via DataFrame joins.
# NOTE(review): a plain split(",") assumes no quoted field contains a comma — confirm.
all_data = sc.textFile('/../../project/ds5559/Alice_Ed_Michael_Sam_project/BigTrips.csv')
all_data = all_data.map(lambda x: x.split(","))

# Capture the header from the UNSAMPLED data and drop it before sampling.
# Taking the first row of the sample instead would almost never yield the
# header: a random data row would be removed and the real header could remain.
header = all_data.first()
data_rows = all_data.filter(lambda x: x != header)

# 25% sample followed by a 0.01% sample, collected to the driver and
# redistributed as a small RDD.
rdd = sc.parallelize(data_rows.sample(False,0.25).sample(False,0.0001).collect())

# Columns are auto-named _1, _2, ... in CSV order
# (_1 = Trip ID, _2 = start timestamp, _3 = end timestamp).
final_DF = rdd.toDF()

del all_data, data_rows
final_DF.cache()

# Parse the start timestamps in a separate (Trip ID, start) DataFrame.
start_times = rdd.map(lambda x: (x[0],x[1]))
start_times = start_times.toDF()
st = start_times.withColumn("Trip_Start_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

# Parse the end timestamps in a separate (Trip ID, end) DataFrame.
end_times = rdd.map(lambda x: (x[0],x[2]))
end_times = end_times.toDF()
et = end_times.withColumn("Trip_End_Timestamp",F.to_timestamp(F.col("_2"), "MM/dd/yyyy hh:mm:ss a"))

# Join the parsed timestamps back onto the full rows by Trip ID (_1).
final_DF = final_DF.join(st.select("_1","Trip_Start_Timestamp"),on="_1").join(et.select("_1","Trip_End_Timestamp"),on="_1")
final_DF = final_DF.withColumn("Day_Month_str", F.concat(F.dayofmonth(F.col("Trip_Start_Timestamp")),F.lit("-"),F.month(F.col("Trip_Start_Timestamp"))).cast("string"))
final_DF.count()

The above code cell takes approximately 6 seconds to run (five trials fell within ~0.5 seconds of this timemark).

## Dataset Subselection

In [None]:
#https://stackoverflow.com/questions/40421845/pyspark-dataframe-filter-or-include-based-on-list
# Fixed-date holidays as "<day>-<month>" keys:
# Christmas, New Year's Day, Valentine's Day.
holiday_day_months = ["25-12", "1-1", "14-2"]

# Preview trips that started on one of those days (any year).
final_DF.filter(F.col("Day_Month_str").isin(holiday_day_months)).show(5)

Alternatively, filter on the full date (including the year) for holidays that do not fall on the same calendar date every year.

In [None]:
# Trip start date as a string, for matching against date literals.
start_date = F.to_date(F.col("Trip_Start_Timestamp"))
final_DF = final_DF.withColumn("Date_str", start_date.cast("string"))

In [None]:
# Filter by full date: Christmas 2019, Easter 2020.
specific_dates = ["2019-12-25", "2020-04-12"]
final_DF.filter(F.col("Date_str").isin(specific_dates)).show(5)