In [1]:
import pandas as pd
import pyspark

# Record the library versions this notebook was run with
print(pyspark.__version__)
print(pd.__version__)

3.4.3
2.2.3


In [2]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook (an already-running one is reused).
# master "local[*]": run on this machine ('local') using every available core ('[*]').
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

spark

your 131072x1 screen size is bogus. expect trouble
25/03/26 23:58:36 WARN Utils: Your hostname, Trydex resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/03/26 23:58:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 23:58:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
"""
Run this block if you need to download the datasets and unzip them, Ignore if already exists
"""
# Both shell commands are left commented out; uncomment them to run.
# Download the gzipped CSV into ../datasets
# !wget -P ../datasets https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-01.csv.gz
# Decompress it, leaving ../datasets/fhvhv_tripdata_2021-01.csv
# !gunzip ../datasets/fhvhv_tripdata_2021-01.csv.gz

'\nRun this block if you need to download the datasets and unzip them, Ignore if already exists\n'

In [4]:
# Load the CSV into a Spark DataFrame, taking column names from the header row.
# No schema is supplied here, so every column comes back as a string.
df = (
    spark.read
    .option("header", "true")
    .csv('../datasets/fhvhv_tripdata_2021-01.csv')
)
df.schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', StringType(), True), StructField('DOLocationID', StringType(), True), StructField('SR_Flag', StringType(), True)])

In [5]:
"""
Extract 100 lines of data from csv to avoid using the entire data and save it as 'head.csv'
"""
# The shell command is left commented out; uncomment it to regenerate the sample.
# 101 lines = 1 header line + the 100 data rows
# !head -n 101 '../datasets/fhvhv_tripdata_2021-01.csv' > '../datasets/head.csv'

"\nExtract 100 lines of data from csv to avoid using the entire data and save it as 'head.csv'\n"

In [6]:
# Read the small sample csv ('head.csv') as a pandas Dataframe
# pandas infers numeric dtypes from the values, unlike the schema-less Spark read
df_pandas = pd.read_csv('../datasets/head.csv')
df_pandas.dtypes

hvfhs_license_num        object
dispatching_base_num     object
pickup_datetime          object
dropoff_datetime         object
PULocationID              int64
DOLocationID              int64
SR_Flag                 float64
dtype: object

In [7]:
# Convert pandas Dataframe into Spark Dataframe and inspect the schema Spark
# derives from the pandas dtypes (the converted DataFrame itself is not kept)
spark.createDataFrame(df_pandas).schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('SR_Flag', DoubleType(), True)])

In [8]:
from pyspark.sql import types

# Explicit schema for the trip csv: one nullable field per column, in file order.
# The two datetimes become timestamps and the location ids become integers;
# the remaining columns stay strings.
schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    )
])

In [9]:
# Re-read the csv, this time applying the explicit schema so the columns
# get their intended types instead of all being strings.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('../datasets/fhvhv_tripdata_2021-01.csv')
)
df.schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('SR_Flag', StringType(), True)])

In [10]:
# Check number of partitions created for the csv file
# (the count is read from the DataFrame's underlying RDD)
print(df.rdd.getNumPartitions())

20


In [11]:
# Set the number of partitions for the spark Dataframe
# repartition() returns a new DataFrame, so the result is assigned back to `df`
df = df.repartition(30)
df

DataFrame[hvfhs_license_num: string, dispatching_base_num: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: int, DOLocationID: int, SR_Flag: string]

In [12]:
# Confirm the partition count of the spark dataframe after repartitioning
print(df.rdd.getNumPartitions())

[Stage 1:==>                                                      (1 + 19) / 20]

30




In [13]:
# Save the DataFrame as parquet, but only when the output is not there yet.
# Spark's default save mode raises if the target path already exists, so the
# guard keeps this cell safe to re-run while still producing the files that
# the parquet read further down depends on when starting from a clean checkout.
import os

parquet_dir = '../artifacts/fhvhv_2021_01'
if not os.path.isdir(parquet_dir):
    df.write.parquet(parquet_dir)

In [36]:
# Read the saved parquet file
# No schema is passed here; the column types come from the parquet files
df = spark.read.parquet('../artifacts/fhvhv_2021_01/')
# show() prints only the first 20 rows by default
df.show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2021-01-01 19:42:07|2021-01-01 19:44:59|          74|          74|   null|
|           HV0003|              B02875|2021-01-01 08:21:01|2021-01-01 08:32:29|          18|         185|   null|
|           HV0003|              B02887|2021-01-01 21:38:30|2021-01-01 21:56:11|          69|         185|   null|
|           HV0005|              B02510|2021-01-02 14:24:10|2021-01-02 14:30:53|         225|         177|   null|
|           HV0005|              B02510|2021-01-01 07:50:30|2021-01-01 08:19:52|         191|         244|   null|
|           HV0003|              B02872|2021-01-01 21:07:14|2021-01-01 21:15:05|

In [37]:
# Print each column's name, type and nullability as a tree
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



### **Actions vs Transformations**

**Actions - (Eager)**
- Spark executes these immediately when they are called
- .show(), .take(), .head(), .tail(), .write(), .read(), etc.

**Transformations - (Lazy)**
- These are not executed right away; each one is merely an instruction that Spark records and runs once an action is called
- .filter(), .select(), etc.

In [33]:
# Keep only the two trip timestamps and the two location id columns
df_sel = df.select(
    'pickup_datetime',
    'dropoff_datetime',
    'PULocationID',
    'DOLocationID',
)
df_sel.show()

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-01 19:42:07|2021-01-01 19:44:59|          74|          74|
|2021-01-01 08:21:01|2021-01-01 08:32:29|          18|         185|
|2021-01-01 21:38:30|2021-01-01 21:56:11|          69|         185|
|2021-01-02 14:24:10|2021-01-02 14:30:53|         225|         177|
|2021-01-01 07:50:30|2021-01-01 08:19:52|         191|         244|
|2021-01-01 21:07:14|2021-01-01 21:15:05|          95|         130|
|2021-01-01 17:52:35|2021-01-01 18:05:11|         141|          48|
|2021-01-01 18:51:49|2021-01-01 19:16:43|         129|         265|
|2021-01-02 07:56:58|2021-01-02 08:11:46|         215|          93|
|2021-01-01 12:34:37|2021-01-01 12:45:51|          97|         256|
|2021-01-01 16:52:10|2021-01-01 16:59:45|         107|         229|
|2021-01-01 22:09:21|2021-01-01 22:38:04|       

In [34]:
# Perform filter
# Build the condition from df_sel's own column rather than the parent `df`:
# `df` is rebound several times in this notebook, and a column taken from a
# different DataFrame than the one being filtered may fail to resolve.
df_filter = df_sel.filter(df_sel.PULocationID == 74)
df_filter.show()

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-01 19:42:07|2021-01-01 19:44:59|          74|          74|
|2021-01-01 18:52:48|2021-01-01 18:57:22|          74|          75|
|2021-01-01 15:10:03|2021-01-01 15:12:57|          74|          75|
|2021-01-02 06:24:16|2021-01-02 06:35:24|          74|          59|
|2021-01-01 19:17:03|2021-01-01 19:39:43|          74|         185|
|2021-01-01 03:31:49|2021-01-01 03:43:12|          74|         169|
|2021-01-01 07:36:11|2021-01-01 07:48:29|          74|         141|
|2021-01-01 18:44:44|2021-01-01 18:59:15|          74|         167|
|2021-01-01 01:09:09|2021-01-01 01:14:02|          74|          75|
|2021-01-02 12:31:31|2021-01-02 12:45:13|          74|         151|
|2021-01-01 16:08:23|2021-01-01 16:12:06|          74|          74|
|2021-01-01 14:32:29|2021-01-01 14:54:20|       

In [47]:
from pyspark.sql import functions as F

# Derive date-only columns from the two timestamps and display them next to
# the originals; the result is only shown, not assigned to a variable.
(
    df
    .withColumn('pickup_date', F.to_date(df.pickup_datetime))
    .withColumn('dropoff_date', F.to_date(df.dropoff_datetime))
    .select(
        'pickup_datetime',
        'dropoff_datetime',
        'pickup_date',
        'dropoff_date',
        'PULocationID',
        'DOLocationID',
    )
    .show()
)

+-------------------+-------------------+-----------+------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------------------+-------------------+-----------+------------+------------+------------+
|2021-01-01 19:42:07|2021-01-01 19:44:59| 2021-01-01|  2021-01-01|          74|          74|
|2021-01-01 08:21:01|2021-01-01 08:32:29| 2021-01-01|  2021-01-01|          18|         185|
|2021-01-01 21:38:30|2021-01-01 21:56:11| 2021-01-01|  2021-01-01|          69|         185|
|2021-01-02 14:24:10|2021-01-02 14:30:53| 2021-01-02|  2021-01-02|         225|         177|
|2021-01-01 07:50:30|2021-01-01 08:19:52| 2021-01-01|  2021-01-01|         191|         244|
|2021-01-01 21:07:14|2021-01-01 21:15:05| 2021-01-01|  2021-01-01|          95|         130|
|2021-01-01 17:52:35|2021-01-01 18:05:11| 2021-01-01|  2021-01-01|         141|          48|
|2021-01-01 18:51:49|2021-01-01 19:16:43| 2021-01-01|  2021-01-01|    

In [28]:
# User defined functions
def crazy_stuff(base_num):
    """Encode a dispatching base number as a short prefixed hex id.

    The first character of ``base_num`` is dropped and the remainder is parsed
    as a decimal integer. The result is that integer in lowercase hex,
    zero-padded to at least three digits, behind a prefix chosen by
    divisibility: ``s/`` for multiples of 7, otherwise ``a/`` for multiples
    of 3, otherwise ``e/``.

    Args:
        base_num: Base number string such as ``'B02884'``; may be ``None``
            or empty.

    Returns:
        The encoded id (e.g. ``'s/b44'``), or ``None`` when ``base_num`` is
        ``None`` or an empty string.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    # dispatching_base_num is a nullable column; pass missing values through
    # as null instead of raising a TypeError inside the UDF.
    if not base_num:
        return None

    num = int(base_num[1:])
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'
    return f'{prefix}/{num:03x}'
crazy_stuff('B02884')

's/b44'

In [50]:
# Register python function as Spark's user defined function
# returnType declares the Spark type of the value the UDF produces (a string here)
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

In [None]:
# Add a 'base_id' column by running the UDF over dispatching_base_num, then
# display it beside the source column; the result is only shown, not stored.
(
    df
    .withColumn('base_id', crazy_stuff_udf(df.dispatching_base_num))
    .select(
        'dispatching_base_num',
        'base_id',
    )
    .show()
)

+--------------------+-------+
|dispatching_base_num|base_id|
+--------------------+-------+
|              B02510|  e/9ce|
|              B02875|  e/b3b|
|              B02887|  e/b47|
|              B02510|  e/9ce|
|              B02510|  e/9ce|
|              B02872|  e/b38|
|              B02882|  e/b42|
|              B02510|  e/9ce|
|              B02866|  e/b32|
|              B02510|  e/9ce|
|              B02887|  e/b47|
|              B02512|  e/9d0|
|              B02764|  e/acc|
|              B02764|  e/acc|
|              B02882|  e/b42|
|              B02764|  e/acc|
|              B02765|  s/acd|
|              B02617|  e/a39|
|              B02875|  e/b3b|
|              B02884|  s/b44|
+--------------------+-------+
only showing top 20 rows

