In [1]:
# Runs before the `pyspark` import in the next cell.
# NOTE(review): presumably this locates the local Spark installation so that
# `import pyspark` succeeds — confirm it is still needed in your environment.
import findspark
findspark.init()

In [2]:
import pyspark  
from pyspark.sql import SparkSession
import os

In [3]:
# Build a session against the `local[*]` master, or reuse one that is already running.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('HelloWorld')
    .getOrCreate()
)


In [4]:
# Read the Parquet file into a Spark DataFrame.
# NOTE: the relative path is resolved against the kernel's working directory;
# if that directory is wrong, Spark raises "Path does not exist".

df = spark.read.parquet("../resources/datasets/fhvhv_tripdata_2022-01.parquet")

# A Spark DataFrame can also be created from a pandas DataFrame:
# spark.createDataFrame(df_pandas)

# Ways to inspect the result (left commented out):
# df.display()  # NOTE(review): looks like a Databricks notebook helper — confirm it exists here
# df.head(10)


AnalysisException: Path does not exist: file:/d:/Educational Others/2023 Data Engineering Zoomcamp/resources/datasets/fhvhv_tripdata_2022-01.parquet

In [None]:
# Supply an explicit schema instead of relying on the one Spark picks up itself.
# To look at the schema of an existing DataFrame, evaluate `df.schema` as the
# LAST expression of a cell (or call `df.printSchema()`); a bare `df.schema`
# in the middle of a cell displays nothing, so it is not executed here.
from pyspark.sql import types

# One StructField per column: (name, Spark type, nullable).
schema = types.StructType([
    types.StructField('hvfhs_license_num', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('originating_base_num', types.StringType(), True),
    types.StructField('request_datetime', types.TimestampType(), True),
    types.StructField('on_scene_datetime', types.TimestampType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.LongType(), True),
    types.StructField('DOLocationID', types.LongType(), True),
    types.StructField('trip_miles', types.DoubleType(), True),
    types.StructField('trip_time', types.LongType(), True),
    types.StructField('base_passenger_fare', types.DoubleType(), True),
    types.StructField('tolls', types.DoubleType(), True),
    types.StructField('bcf', types.DoubleType(), True),
    types.StructField('sales_tax', types.DoubleType(), True),
    types.StructField('congestion_surcharge', types.DoubleType(), True),
    types.StructField('airport_fee', types.DoubleType(), True),
    types.StructField('tips', types.DoubleType(), True),
    types.StructField('driver_pay', types.DoubleType(), True),
    types.StructField('shared_request_flag', types.StringType(), True),
    types.StructField('shared_match_flag', types.StringType(), True),
    types.StructField('access_a_ride_flag', types.StringType(), True),
    types.StructField('wav_request_flag', types.StringType(), True),
    types.StructField('wav_match_flag', types.StringType(), True),
])

# The schema is handed to the reader before the load call.
df = spark.read.schema(schema).parquet("../resources/datasets/fhvhv_tripdata_2022-01.parquet")
print("Row count: ", df.count())



Row count:  14751591


## Repartitions
When reading a single large file, Spark will read the file in a single partition. This can be a problem if the file is too large to fit in memory. In this case, we can repartition the data into smaller partitions. This can be done by using the `repartition` method.

In [None]:
# Redistribute the rows across 4 partitions before writing.
df = df.repartition(4)
# Alternative: collapse everything into a single partition instead.
# df = df.coalesce(1)

# The target path is written as a directory (Spark creates a temporary
# directory inside it while the job runs); an existing output is overwritten
# and a header row is included.
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("../resources/datasets/fhvhv_tripdata_2022-01.csv")
)


In [None]:
# Shut down the Spark session.
# NOTE(review): cells further down still call `spark.read...`; they need a live
# session, so re-create it (SparkSession.builder...getOrCreate()) before running
# them, or run this cell last.
spark.stop()

## Reading multiple files
Spark can read multiple files at once. This can be done by passing the reader a directory or a glob pattern (for example `*.csv`) as the path, so that every matching file is loaded. For example, to read all the CSV part files in a directory, we can use the following code:

In [None]:
# `spark.stop()` was called in the cell above, so make sure a live session
# exists before reading; getOrCreate() simply returns the active session if
# one is still running.
spark = SparkSession.builder.master("local[*]").appName('HelloWorld').getOrCreate()

# The path is the directory written earlier; the trailing slash makes it clear
# that a directory (all of its CSV part files) is being read, not a single file.
# No schema is supplied, so every column comes back as a string
# (see the printed schema).
df = spark.read.csv('../resources/datasets/fhvhv_tripdata_2022-01.csv/', header=True)

# Get number of rows
print("Row count: ", df.count())
df.printSchema()

Row count:  14751591
root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: string (nullable = true)
 |-- on_scene_datetime: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- trip_miles: string (nullable = true)
 |-- trip_time: string (nullable = true)
 |-- base_passenger_fare: string (nullable = true)
 |-- tolls: string (nullable = true)
 |-- bcf: string (nullable = true)
 |-- sales_tax: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)
 |-- tips: string (nullable = true)
 |-- driver_pay: string (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_fl

## Selecting columns
Similar to SQL, we can select columns using the `select` method. This method takes a list of column names as arguments.

**Transformations** are things that are not executed right away (lazy)
* `Selecting` columns
* `Filtering` rows
* `Grouping` rows
* `Joins` between tables

**Actions** are things that are executed right away
* `show`
* `take`
* `head`
* `write`

In [None]:
# Keep three columns, restrict to rows whose license number is 'HV0003',
# and print the first 5 matches.
(
    df
    .select('hvfhs_license_num', 'dispatching_base_num', 'tips')
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show(5)
)

+-----------------+--------------------+----+
|hvfhs_license_num|dispatching_base_num|tips|
+-----------------+--------------------+----+
|           HV0003|              B03404| 0.0|
|           HV0003|              B03404| 0.0|
|           HV0003|              B03404| 0.0|
|           HV0003|              B03404| 0.0|
|           HV0003|              B03404| 0.0|
+-----------------+--------------------+----+
only showing top 5 rows



It is also possible to use SQL queries to select columns. This can be done by registering the DataFrame as a temporary table and then using the `sql` method to execute the query.

## PySpark Functions
PySpark has many built-in functions for manipulating data. They live in the `pyspark.sql.functions` module. The following code shows how to import the module and use some of them.
Documentation: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html

Notable functions in the video:
`pyspark.sql.functions.udf`

In [None]:
from pyspark.sql import functions as F

# Add date-only versions of the pickup/dropoff datetime columns, then preview
# them next to the pickup/dropoff location IDs (first 5 rows).
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show(5)
)

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2022-01-10|  2022-01-10|         188|         231|
| 2022-01-22|  2022-01-22|          87|         137|
| 2022-01-20|  2022-01-20|         231|          97|
| 2022-01-25|  2022-01-25|         116|         159|
| 2022-01-04|  2022-01-04|          68|         148|
+-----------+------------+------------+------------+
only showing top 5 rows

