In [6]:
import pyspark
from pyspark.sql import SparkSession

In [7]:
# Point Spark at the project's virtual-environment interpreter. The environment variables
# must be set before the SparkSession is created so that the driver and the workers
# launch the same Python.
import os
import sys

# Machine-specific venv location; it is used only when it exists on this machine.
LOCAL_VENV_PYTHON = r"C:\Sandeep SSD\Programming SSD\Data Engineering Zoomcamp\data-engineering-zoomcamp\dataenginzoomvenv\Scripts\python.exe"

# Fall back to the interpreter running this kernel so the notebook also works on
# machines where the hard-coded path is absent.
venv_python = LOCAL_VENV_PYTHON if os.path.isfile(LOCAL_VENV_PYTHON) else sys.executable

# Ensure the workers and the driver use exactly this Python executable:
os.environ['PYSPARK_PYTHON'] = venv_python
os.environ['PYSPARK_DRIVER_PYTHON'] = venv_python


In [8]:
# Create (or reuse) a local SparkSession, passing the interpreter path for both
# the Python workers and the driver.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .config("spark.pyspark.python", venv_python)
    .config("spark.pyspark.driver.python", venv_python)
    .getOrCreate()
)

In [9]:
# Sanity check: build a one-row DataFrame and print it to confirm the session works
df_s = spark.createDataFrame([dict(x=1)])
df_s.show()


+---+
|  x|
+---+
|  1|
+---+



In [10]:
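# wget is not available in VS Code notebooks on Windows, so use curl to download the file instead.
# -L follows the GitHub redirect and -O saves the download under its remote file name.
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-01.csv.gz
# !curl -L -O https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-01.csv.gz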


In [11]:
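# To unzip the file (-d decompresses, -c writes to stdout, so redirect the output into a .csv file)
# !gzip -dc fhvhv_tripdata_2021-01.csv.gz > fhvhv_tripdata_2021-01.csv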

In [12]:
# `wc -l` works only in Git Bash or Linux shells, not in VS Code notebooks on Windows:
# !wc -l fhvhv_tripdata_2021-01.csv
# so count the lines with a small Python helper instead.
from pathlib import Path


def count_lines(file_path):
    """Return the number of lines in the text file at ``file_path``.

    The file is iterated line by line rather than read in one call.
    It is decoded as UTF-8 with ``errors="ignore"``, so undecodable bytes are
    dropped instead of raising.
    """
    with Path(file_path).open("r", encoding="utf8", errors="ignore") as f:
        return sum(1 for _ in f)


path = Path("../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01.csv")
nlines = count_lines(path)  # total includes the header line

print(f"Total lines: {nlines}")
# Subtract the header, but never report a negative count for an empty file
print(f"Data rows:   {max(nlines - 1, 0)}")

Total lines: 11908469
Data rows:   11908468


In [13]:
# Load the CSV with Spark; header=true makes the first line supply the column names
df = (
    spark.read
    .option("header", "true")
    .csv('../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01.csv')
)

In [14]:
# Display the CSV we just read; show() is an action, so this is where Spark actually reads the data
df.show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0003|              B02682|2021-01-01 00:33:44|2021-01-01 00:49:07|         230|         166|   null|
|           HV0003|              B02682|2021-01-01 00:55:19|2021-01-01 01:18:21|         152|         167|   null|
|           HV0003|              B02764|2021-01-01 00:23:56|2021-01-01 00:38:05|         233|         142|   null|
|           HV0003|              B02764|2021-01-01 00:42:51|2021-01-01 00:45:50|         142|         143|   null|
|           HV0003|              B02764|2021-01-01 00:48:14|2021-01-01 01:08:42|         143|          78|   null|
|           HV0005|              B02510|2021-01-01 00:06:59|2021-01-01 00:43:01|

In [15]:
# head(5) returns the first five rows as Row objects, which is useful for inspecting the
# value types; without a schema, every column here is a string
df.head(5)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', pickup_datetime='2021-01-01 00:33:44', dropoff_datetime='2021-01-01 00:49:07', PULocationID='230', DOLocationID='166', SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', pickup_datetime='2021-01-01 00:55:19', dropoff_datetime='2021-01-01 01:18:21', PULocationID='152', DOLocationID='167', SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_datetime='2021-01-01 00:23:56', dropoff_datetime='2021-01-01 00:38:05', PULocationID='233', DOLocationID='142', SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_datetime='2021-01-01 00:42:51', dropoff_datetime='2021-01-01 00:45:50', PULocationID='142', DOLocationID='143', SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_datetime='2021-01-01 00:48:14', dropoff_datetime='2021-01-01 01:08:42', PULocationID='143', DOLocationID='78', SR_Flag=None)]

In [16]:
# The schema confirms it: every field is StringType
df.schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', StringType(), True), StructField('DOLocationID', StringType(), True), StructField('SR_Flag', StringType(), True)])

In [17]:
# Bash equivalent (not available in a Windows notebook):
# !head -n 1001 fhvhv_tripdata_2021-01.csv > head.csv
# Copy the first 1001 lines (header + 1000 data rows) into a small sample file.

from itertools import islice
from pathlib import Path

src  = Path("../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01.csv")
dest = Path("../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01_head_1001_rows.csv")

with src.open("r", encoding="utf8", errors="ignore") as fin:
    with dest.open("w", encoding="utf8", errors="ignore") as fout:
        # islice stops after 1001 lines, so the rest of the source file is never read
        fout.writelines(islice(fin, 1001))


In [18]:
# Verify the sample file: count its lines, header included
from pathlib import Path

path = Path("../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01_head_1001_rows.csv")

nlines = 0
with path.open("r", encoding="utf8", errors="ignore") as f:
    for _line in f:
        nlines += 1

print(f"Total lines: {nlines}")
print(f"Data rows:   {nlines - 1}")  # the header line is not a data row

Total lines: 1001
Data rows:   1000


In [19]:
import pandas as pd

In [20]:
# Load the 1001-line sample file with pandas so its inferred dtypes can guide the Spark schema
df_pandas = pd.read_csv('../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01_head_1001_rows.csv')

In [21]:
# pandas infers a dtype for each column of the small sample (see the output below),
# which gives a starting point for the types the Spark schema should use.
# It does not detect timestamps on its own, though: both datetime columns come back as object (string).
df_pandas.dtypes

hvfhs_license_num        object
dispatching_base_num     object
pickup_datetime          object
dropoff_datetime         object
PULocationID              int64
DOLocationID              int64
SR_Flag                 float64
dtype: object

In [22]:
# DataFrame.iteritems() was removed in pandas 2.0, which broke createDataFrame() on the
# previously installed Spark version. Possible workarounds: pin pandas 1.x, or convert
# the pandas DataFrame to a list of plain Python dicts before passing it to Spark.
# Upgrading Spark to 3.4.2 fixed it here; no other changes or downgrades were needed.
spark.createDataFrame(df_pandas)

DataFrame[hvfhs_license_num: string, dispatching_base_num: string, pickup_datetime: string, dropoff_datetime: string, PULocationID: bigint, DOLocationID: bigint, SR_Flag: double]

In [23]:
# Display the converted DataFrame; the empty SR_Flag values show up as NaN in this output
spark.createDataFrame(df_pandas).show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0003|              B02682|2021-01-01 00:33:44|2021-01-01 00:49:07|         230|         166|    NaN|
|           HV0003|              B02682|2021-01-01 00:55:19|2021-01-01 01:18:21|         152|         167|    NaN|
|           HV0003|              B02764|2021-01-01 00:23:56|2021-01-01 00:38:05|         233|         142|    NaN|
|           HV0003|              B02764|2021-01-01 00:42:51|2021-01-01 00:45:50|         142|         143|    NaN|
|           HV0003|              B02764|2021-01-01 00:48:14|2021-01-01 01:08:42|         143|          78|    NaN|
|           HV0005|              B02510|2021-01-01 00:06:59|2021-01-01 00:43:01|

In [24]:
# Schema Spark derived from the pandas dtypes: the location IDs are LongType and SR_Flag is DoubleType
spark.createDataFrame(df_pandas).schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('SR_Flag', DoubleType(), True)])

#### From the output above, the location IDs are interpreted as long types, which is a memory-inefficient format for these values, so let's store them as integers instead...
##### Integer - 4 bytes vs Long - 8 bytes

In [25]:
from pyspark.sql import types

In [26]:
# Explicit schema based on the inferred one above. Each pair gives a column name and its
# Spark type; every column is declared nullable (True means the value may be null).
schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    ]
])

In [32]:
# Read the CSV again, this time supplying the explicit schema
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/fhvhv_tripdata_2021-01.csv')
)

In [37]:
# Row count of the full DataFrame; it matches the number of data rows counted from the raw file earlier
df.count()

11908468

In [33]:
# Display the DataFrame to check that reading with the explicit schema works
df.show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0003|              B02682|2021-01-01 00:33:44|2021-01-01 00:49:07|         230|         166|   null|
|           HV0003|              B02682|2021-01-01 00:55:19|2021-01-01 01:18:21|         152|         167|   null|
|           HV0003|              B02764|2021-01-01 00:23:56|2021-01-01 00:38:05|         233|         142|   null|
|           HV0003|              B02764|2021-01-01 00:42:51|2021-01-01 00:45:50|         142|         143|   null|
|           HV0003|              B02764|2021-01-01 00:48:14|2021-01-01 01:08:42|         143|          78|   null|
|           HV0005|              B02510|2021-01-01 00:06:59|2021-01-01 00:43:01|

In [34]:
# Inspect the first rows to confirm the types were parsed properly: the datetime columns
# should now be datetime objects and the location IDs integers rather than strings
df.head(10)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', pickup_datetime=datetime.datetime(2021, 1, 1, 0, 33, 44), dropoff_datetime=datetime.datetime(2021, 1, 1, 0, 49, 7), PULocationID=230, DOLocationID=166, SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', pickup_datetime=datetime.datetime(2021, 1, 1, 0, 55, 19), dropoff_datetime=datetime.datetime(2021, 1, 1, 1, 18, 21), PULocationID=152, DOLocationID=167, SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_datetime=datetime.datetime(2021, 1, 1, 0, 23, 56), dropoff_datetime=datetime.datetime(2021, 1, 1, 0, 38, 5), PULocationID=233, DOLocationID=142, SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_datetime=datetime.datetime(2021, 1, 1, 0, 42, 51), dropoff_datetime=datetime.datetime(2021, 1, 1, 0, 45, 50), PULocationID=142, DOLocationID=143, SR_Flag=None),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', pickup_dat

In [35]:
# Split the data into 24 partitions so the work can be spread across Spark's executors.
# Note that this rebinds df to the repartitioned DataFrame.
df = df.repartition(24)

In [None]:
# Write the repartitioned data as Parquet. mode='overwrite' replaces any previous output, so
# the cell can be re-run (e.g. Restart & Run All) without failing because the path already exists.
# NOTE(review): re-running this after df has been re-read from this same Parquet path may fail,
# since Spark would be overwriting the path it is reading from — confirm before running out of order.
df.write.parquet('../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/parquet', mode='overwrite')

In [39]:
# Read the Parquet output back; no schema is passed here (the column types shown below come from the files)
df = spark.read.parquet('../../Data/data/2021/csv_unzipped/fhvhv_tripdata_2021-01.csv/parquet')

In [None]:
# Evaluating the DataFrame on its own displays its column names and types
df

DataFrame[hvfhs_license_num: string, dispatching_base_num: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: int, DOLocationID: int, SR_Flag: string]

In [42]:
# printSchema() prints the same schema as an easier-to-read tree
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



SELECT * FROM df WHERE hvfhs_license_num = 'HV0003'

In [41]:
# Import functions from spark 
from pyspark.sql import functions as F 

In [None]:
# For example, F.to_date keeps only the date part of a timestamp and drops the hours, minutes, etc.
# It requires a column argument, so calling it with none raises a TypeError; show its documentation instead.
help(F.to_date)

In [47]:
# withColumn returns a DataFrame with an extra column. If the new name matches an
# existing column, that column is replaced, so choose names carefully.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2021-01-01|  2021-01-01|          21|          55|
| 2021-01-02|  2021-01-02|          42|         159|
| 2021-01-01|  2021-01-01|         213|          32|
| 2021-01-01|  2021-01-01|         220|         235|
| 2021-01-02|  2021-01-02|         190|          61|
| 2021-01-02|  2021-01-02|          71|          61|
| 2021-01-02|  2021-01-02|         198|         198|
| 2021-01-01|  2021-01-01|         107|          48|
| 2021-01-02|  2021-01-02|          61|         177|
| 2021-01-01|  2021-01-01|         132|          48|
| 2021-01-01|  2021-01-01|          56|         196|
| 2021-01-02|  2021-01-02|         190|          87|
| 2021-01-02|  2021-01-02|         248|         254|
| 2021-01-02|  2021-01-02|          26|          26|
| 2021-01-01|  2021-01-01|         108|           7|
| 2021-01-01|  2021-01-01|          47|       

In [49]:
# The DataFrame is not mutated: the withColumn/select chain returned a new DataFrame, so df still has its original columns
df.show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2021-01-01 02:28:38|2021-01-01 02:44:23|          21|          55|   null|
|           HV0005|              B02510|2021-01-02 13:14:14|2021-01-02 13:37:31|          42|         159|   null|
|           HV0003|              B02875|2021-01-01 03:35:52|2021-01-01 03:46:17|         213|          32|   null|
|           HV0003|              B02835|2021-01-01 21:29:06|2021-01-01 21:49:01|         220|         235|   null|
|           HV0003|              B02872|2021-01-02 17:26:59|2021-01-02 17:40:43|         190|          61|   null|
|           HV0003|              B02866|2021-01-02 17:54:07|2021-01-02 18:07:12|

In [50]:
# Just-for-fun transformation used to demonstrate UDFs: the result starts with s, a or e
# depending on the dispatching base number.
def crazy_stuff(base_num):
    """Turn a dispatching base number into a made-up, prefixed hex id.

    The first character of ``base_num`` is dropped and the remainder is parsed as a
    decimal integer. The result is ``'<prefix>/<hex>'``, where the integer is written in
    lowercase hex, zero-padded to at least three digits, and the prefix is ``s`` when
    the integer is divisible by 7, ``a`` when it is divisible by 3 (but not by 7), and
    ``e`` otherwise.

    Args:
        base_num: A string such as ``'B02884'``, or ``None``.

    Returns:
        The generated id string, or ``None`` when ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first character is not a decimal integer.
    """
    # The source column is nullable and a Python UDF receives nulls as None;
    # pass them through instead of failing the whole job with a TypeError.
    if base_num is None:
        return None

    num = int(base_num[1:])
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'
    return f'{prefix}/{num:03x}'

In [51]:
# Try the plain Python function on a single base number before wrapping it in a UDF
crazy_stuff('B02884')

's/b44'

In [53]:
# Spark supports UDFs (user-defined functions): pass a Python function and its return type
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

In [54]:
# Same date columns as before, plus a base_id column computed by the UDF
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .withColumn('base_id', crazy_stuff_udf(df['dispatching_base_num']))
    .select('base_id', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

+-------+-----------+------------+------------+------------+
|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------+-----------+------------+------------+------------+
|  e/9ce| 2021-01-01|  2021-01-01|          21|          55|
|  e/9ce| 2021-01-02|  2021-01-02|          42|         159|
|  e/b3b| 2021-01-01|  2021-01-01|         213|          32|
|  s/b13| 2021-01-01|  2021-01-01|         220|         235|
|  e/b38| 2021-01-02|  2021-01-02|         190|          61|
|  e/b32| 2021-01-02|  2021-01-02|          71|          61|
|  e/b35| 2021-01-02|  2021-01-02|         198|         198|
|  e/b38| 2021-01-01|  2021-01-01|         107|          48|
|  e/a39| 2021-01-02|  2021-01-02|          61|         177|
|  e/9ce| 2021-01-01|  2021-01-01|         132|          48|
|  e/b38| 2021-01-01|  2021-01-01|          56|         196|
|  s/b44| 2021-01-02|  2021-01-02|         190|          87|
|  e/b30| 2021-01-02|  2021-01-02|         248|         254|
|  e/9ce| 2021-01-02|  2

In [None]:
# Transformations such as filter() and select() are lazy: they only describe the result.
# Actions such as show() are eager; that is when all the execution happens.
# The filter is applied first because it uses hvfhs_license_num, which the select drops.
df.filter(df.hvfhs_license_num == 'HV0003') \
  .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')


[Row(pickup_datetime=datetime.datetime(2021, 1, 1, 0, 23, 13), dropoff_datetime=datetime.datetime(2021, 1, 1, 0, 30, 35), PULocationID=147, DOLocationID=159),
 Row(pickup_datetime=datetime.datetime(2021, 1, 6, 11, 43, 12), dropoff_datetime=datetime.datetime(2021, 1, 6, 11, 55, 7), PULocationID=79, DOLocationID=164),
 Row(pickup_datetime=datetime.datetime(2021, 1, 4, 15, 35, 32), dropoff_datetime=datetime.datetime(2021, 1, 4, 15, 52, 2), PULocationID=174, DOLocationID=18),
 Row(pickup_datetime=datetime.datetime(2021, 1, 4, 13, 42, 15), dropoff_datetime=datetime.datetime(2021, 1, 4, 14, 4, 57), PULocationID=201, DOLocationID=180),
 Row(pickup_datetime=datetime.datetime(2021, 1, 3, 18, 42, 3), dropoff_datetime=datetime.datetime(2021, 1, 3, 19, 12, 22), PULocationID=132, DOLocationID=72)]

In [50]:
# Preview the first 10 lines of the sample file with the shell's `head` tool.
# NOTE(review): `head` needs a bash-like shell, and head.csv is not created by any visible cell
# (the sample written earlier has a different name) — confirm the file exists before re-running.
!head -n 10 head.csv

hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag

HV0003,B02682,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,

HV0003,B02682,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,

HV0003,B02764,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,

HV0003,B02764,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,

HV0003,B02764,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,

HV0005,B02510,2021-01-01 00:06:59,2021-01-01 00:43:01,88,42,

HV0005,B02510,2021-01-01 00:50:00,2021-01-01 01:04:57,42,151,

HV0003,B02764,2021-01-01 00:14:30,2021-01-01 00:50:27,71,226,

HV0003,B02875,2021-01-01 00:22:54,2021-01-01 00:30:20,112,255,

