### Purpose of this notebook is to turn our raw csv files with all the data to parquet files with schema's, this way all the parquet files are uniform and they can be ingested without problem

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Add the vevn to spark's settings, so inject the venv’s Python into both driver & worker configs before recreating the session, to find the right python interpreter
import os
venv_python = r"C:\Sandeep SSD\Programming SSD\Data Engineering Zoomcamp\data-engineering-zoomcamp\dataenginzoomvenv\Scripts\python.exe"

# 1) Ensure the worker uses exactly this Python executable:
os.environ['PYSPARK_PYTHON'] = venv_python
os.environ['PYSPARK_DRIVER_PYTHON'] = venv_python


In [3]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .config("spark.pyspark.python", venv_python) \
    .config("spark.pyspark.driver.python", venv_python) \
    .getOrCreate()

In [13]:
# Check which port the Spark UI is running on
print(spark.sparkContext.uiWebUrl)

http://192.168.0.181:4041


In [4]:
import pandas as pd

In [5]:
from pyspark.sql import types

# Schemas for yellow and green with correct types

In [6]:
green_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("lpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("lpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("ehail_fee", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("trip_type", types.IntegerType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

yellow_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("tpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("tpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

# Convert green csvs

## Green 2020

In [8]:
year = 2020
input_base = '../../Data/data/csv/green'
output_base = '../../Data/data/csv/green/spark_parquet'


for month in range(1, 13):
    mm = f'{month:02d}'
    print(f'processing data for {year}/{mm}')

    # Since my files are located here under the green folder, like this one
    # ..\..\\Data\data\csv\green\green_tripdata_2019-12.csv.gz"
    # I will format it that way
    input_path = f'{input_base}/green_tripdata_{year}-{mm}.csv*'
    
    # My parquet files will go there
    # Still uses year/month folders, so parquet will be well organized
    output_path = f'{output_base}/{year}/{mm}'

    df_green = spark.read \
        .option("header", "true") \
        .schema(green_schema) \
        .csv(input_path)

    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2020/01
processing data for 2020/02
processing data for 2020/03
processing data for 2020/04
processing data for 2020/05
processing data for 2020/06
processing data for 2020/07
processing data for 2020/08
processing data for 2020/09
processing data for 2020/10
processing data for 2020/11
processing data for 2020/12


## Green 2021

In [9]:
# Does the same as the previous but just the year is different, could have made a method out of it but yea, better for seeing clear output
year = 2021
input_base = '../../Data/data/csv/green'
output_base = '../../Data/data/csv/green/spark_parquet'


for month in range(1, 13):
    mm = f'{month:02d}'
    print(f'processing data for {year}/{mm}')

    # Since my files are located here under the green folder, like this one
    # ..\..\\Data\data\csv\green\green_tripdata_2019-12.csv.gz"
    # I will format it that way
    input_path = f'{input_base}/green_tripdata_{year}-{mm}.csv*'
    
    # My parquet files will go there
    # Still uses year/month folders, so parquet will be well organized
    output_path = f'{output_base}/{year}/{mm}'

    df_green = spark.read \
        .option("header", "true") \
        .schema(green_schema) \
        .csv(input_path)

    df_green \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2021/01
processing data for 2021/02
processing data for 2021/03
processing data for 2021/04
processing data for 2021/05
processing data for 2021/06
processing data for 2021/07
processing data for 2021/08


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/c:/Sandeep SSD/Programming SSD/Data Engineering Zoomcamp/data-engineering-zoomcamp/Data/data/csv/green/green_tripdata_2021-08.csv*.

# Yellow

## Yellow 2020

In [10]:
year = 2020 
input_base = '../../Data/data/csv/yellow'
output_base = '../../Data/data/csv/yellow/spark_parquet'


for month in range(1, 13):
    mm = f'{month:02d}'
    print(f'processing data for {year}/{mm}')

    # Since my files are located here under the yellow folder, like this one
    # ..\..\\Data\data\csv\yellow\yellow_tripdata_2019-12.csv.gz"
    # I will format it that way
    input_path = f'{input_base}/yellow_tripdata_{year}-{mm}.csv*'
    
    # My parquet files will go there
    # Still uses year/month folders, so parquet will be well organized
    output_path = f'{output_base}/{year}/{mm}'
    
    df_yellow = spark.read \
        .option("header", "true") \
        .schema(yellow_schema) \
        .csv(input_path)

    df_yellow \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2020/01
processing data for 2020/02
processing data for 2020/03
processing data for 2020/04
processing data for 2020/05
processing data for 2020/06
processing data for 2020/07
processing data for 2020/08
processing data for 2020/09
processing data for 2020/10
processing data for 2020/11
processing data for 2020/12


## Yellow 2021

In [11]:
year = 2021
input_base = '../../Data/data/csv/yellow'
output_base = '../../Data/data/csv/yellow/spark_parquet'


for month in range(1, 13):
    mm = f'{month:02d}'
    print(f'processing data for {year}/{mm}')

    # Since my files are located here under the yellow folder, like this one
    # ..\..\\Data\data\csv\yellow\yellow_tripdata_2019-12.csv.gz"
    # I will format it that way
    input_path = f'{input_base}/yellow_tripdata_{year}-{mm}.csv*'
    
    # My parquet files will go there
    # Still uses year/month folders, so parquet will be well organized
    output_path = f'{output_base}/{year}/{mm}'
    
    df_yellow = spark.read \
        .option("header", "true") \
        .schema(yellow_schema) \
        .csv(input_path)

    df_yellow \
        .repartition(4) \
        .write.parquet(output_path)

processing data for 2021/01
processing data for 2021/02
processing data for 2021/03
processing data for 2021/04
processing data for 2021/05
processing data for 2021/06
processing data for 2021/07
processing data for 2021/08


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/c:/Sandeep SSD/Programming SSD/Data Engineering Zoomcamp/data-engineering-zoomcamp/Data/data/csv/yellow/yellow_tripdata_2021-08.csv*.