In [9]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, window, when, count
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, IntegerType

In [1]:
# Location of the taxi CSV files consumed by the streaming reader.
# Defaults to the original lab directory; set the TAXI_DATA_PATH environment
# variable to point the notebook at a different location without editing it.
inputPath = os.environ.get(
    "TAXI_DATA_PATH",
    "file:///root/DataScience/Spark/Lab3/taxi-data",
)

In [3]:
# Build (or reuse) the SparkSession for this lab, running on a local master.
# NOTE(review): "spark.some.config.option" looks like a documentation
# placeholder rather than a real setting — confirm whether it is needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Task 1 - Lab 03")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Runtime streaming settings applied on the live session:
#   * force deletion of temporary checkpoint locations
#   * disable the stateful-operator correctness check
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
spark.conf.set("spark.sql.streaming.statefulOperator.checkCorrectness.enabled", "false")

24/05/20 20:00:15 WARN Utils: Your hostname, DESKTOP-MINMIN resolves to a loopback address: 127.0.1.1; using 172.24.159.154 instead (on interface eth0)
24/05/20 20:00:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/20 20:00:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/20 20:00:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Column layout of a yellow taxi record: a leading "type" tag followed by the
# trip fields. Every field is nullable (the StructField default).
schema_yellow = StructType([
    StructField("type", StringType()),
    StructField("VendorID", StringType()),
    StructField("tpep_pickup_datetime", TimestampType()),
    StructField("tpep_dropoff_datetime", TimestampType()),
    StructField("passenger_count", StringType()),
    StructField("trip_distance", StringType()),
    StructField("pickup_longitude", DoubleType()),
    StructField("pickup_latitude", DoubleType()),
    StructField("RatecodeID", StringType()),
    StructField("store_and_fwd_flag", StringType()),
    StructField("dropoff_longitude", DoubleType()),
    StructField("dropoff_latitude", DoubleType()),
    StructField("payment_type", StringType()),
    StructField("fare_amount", StringType()),
    StructField("extra", StringType()),
    StructField("mta_tax", StringType()),
    StructField("tip_amount", StringType()),
    StructField("tolls_amount", StringType()),
    StructField("improvement_surcharge", StringType()),
    StructField("total_amount", StringType()),
])

# Column layout of a green taxi record: a leading "type" tag followed by the
# trip fields. Every field is nullable (the StructField default).
schema_green = StructType([
    StructField("type", StringType()),
    StructField("VendorID", StringType()),
    StructField("lpep_pickup_datetime", TimestampType()),
    StructField("lpep_dropoff_datetime", TimestampType()),
    StructField("store_and_fwd_flag", StringType()),
    StructField("RatecodeID", StringType()),
    StructField("pickup_longitude", DoubleType()),
    StructField("pickup_latitude", DoubleType()),
    StructField("dropoff_longitude", DoubleType()),
    StructField("dropoff_latitude", DoubleType()),
    StructField("passenger_count", StringType()),
    StructField("trip_distance", StringType()),
    StructField("fare_amount", StringType()),
    StructField("extra", StringType()),
    StructField("mta_tax", StringType()),
    StructField("tip_amount", StringType()),
    StructField("tolls_amount", StringType()),
    StructField("ehail_fee", StringType()),
    StructField("improvement_surcharge", StringType()),
    StructField("total_amount", StringType()),
    StructField("payment_type", StringType()),
    StructField("trip_type", StringType()),
])

# Raw CSV layout used by the streaming reader: 22 nullable string columns
# named _c0 through _c21.
default_schema = StructType(
    [StructField(f"_c{idx}", StringType(), True) for idx in range(22)]
)

In [5]:
# Streaming CSV source over inputPath. The files are read without a header
# row and every column arrives as a string, as laid out by default_schema.
default_df = spark.readStream.load(
    inputPath,
    format="csv",
    schema=default_schema,
    header="false",
)

In [6]:
# Select the rows tagged "yellow" in the first column, drop _c20/_c21 so the
# 20 remaining columns line up with schema_yellow, then rename them.
yellow_trips = (
    default_df
    .where(col("_c0") == "yellow")
    .drop("_c20", "_c21")
    .toDF(*schema_yellow.names)
)

# The raw columns are all strings; cast the pickup/dropoff timestamps and the
# coordinate columns to their proper types. Remaining columns stay as strings.
yellow_df = (
    yellow_trips
    .withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime").cast("timestamp"))
    .withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime").cast("timestamp"))
    .withColumn("pickup_longitude", col("pickup_longitude").cast("double"))
    .withColumn("pickup_latitude", col("pickup_latitude").cast("double"))
    .withColumn("dropoff_longitude", col("dropoff_longitude").cast("double"))
    .withColumn("dropoff_latitude", col("dropoff_latitude").cast("double"))
)

yellow_df.printSchema()

root
 |-- type: string (nullable = true)
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)



In [7]:
# Select the rows tagged "green" in the first column and rename all 22 raw
# columns to the schema_green field names.
green_trips = (
    default_df
    .where(col("_c0") == "green")
    .toDF(*schema_green.names)
)

# The raw columns are all strings; cast the pickup/dropoff timestamps and the
# coordinate columns to their proper types. Remaining columns stay as strings.
green_df = (
    green_trips
    .withColumn("lpep_pickup_datetime", col("lpep_pickup_datetime").cast("timestamp"))
    .withColumn("lpep_dropoff_datetime", col("lpep_dropoff_datetime").cast("timestamp"))
    .withColumn("pickup_longitude", col("pickup_longitude").cast("double"))
    .withColumn("pickup_latitude", col("pickup_latitude").cast("double"))
    .withColumn("dropoff_longitude", col("dropoff_longitude").cast("double"))
    .withColumn("dropoff_latitude", col("dropoff_latitude").cast("double"))
)

green_df.printSchema()

root
 |-- type: string (nullable = true)
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)



In [10]:
# Streaming aggregation over the green trips, grouped by VendorID, counting
# the rows whose VendorID equals 1.
# NOTE(review): when() has no otherwise(), so rows of any other vendor yield
# null and are not counted — those groups report 0. Confirm that a per-vendor
# ride count was not the intended metric.
streamingCountDF = (
    green_df
    .groupBy("VendorID")
    .agg(
        count(when(col("VendorID") == 1, lit(True))).alias("VendorID_count")
    )
)

# Sanity check that the aggregated DataFrame is still a streaming DataFrame
print("is ok", streamingCountDF.isStreaming)

is ok True
