In [0]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import col,lit,substring
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import StringType,IntegerType,DoubleType,TimestampType,TimeType
from snowflake.snowpark.types import StructType, StructField
import os

In [0]:
# --- Secrets -----------------------------------------------------------------
# All sensitive values are fetched through dbutils.secrets so that nothing
# confidential is hardcoded in the notebook.
_SNOWPARK_SECRET_SCOPE = "snowparkdetails"
User = dbutils.secrets.get(_SNOWPARK_SECRET_SCOPE, "username")
Password = dbutils.secrets.get(_SNOWPARK_SECRET_SCOPE, "password")
Account = dbutils.secrets.get(_SNOWPARK_SECRET_SCOPE, "account")
# NOTE(review): TenandId is not referenced by any visible cell and the name
# looks like a typo of "TenantId"; kept unchanged in case other code reads it.
TenandId = dbutils.secrets.get("gen2-storage", "tenant-id")

# --- Snowflake session context -----------------------------------------------
SchemaName = "taxi"
DatabaseName = "NYCTAXI"
Warehouse = "cluster1"
# NOTE(review): ACCOUNTADMIN is a highly privileged role; confirm that a
# narrower role would not be sufficient for this load.
DBrole = "ACCOUNTADMIN"

CONNECTION_PARAMETERS = dict(
    account=Account,
    user=User,
    password=Password,
    schema=SchemaName,
    database=DatabaseName,
    warehouse=Warehouse,
    role=DBrole,
)

session = Session.builder.configs(CONNECTION_PARAMETERS).create()

# Echo the active warehouse / database / schema to confirm the session context.
_currentContextRows = session.sql('select current_warehouse(), current_database(), current_schema()').collect()
print(_currentContextRows)

In [0]:
# Canonical, ordered column list used to homogenize the green taxi schema
# across years. The order of this list is the order in which columns are
# selected before writing to the target table.
canonicalTripSchemaColList = [
    # trip identity
    "taxi_type",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    # locations
    "pickup_location_id",
    "dropoff_location_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    # trip measures
    "passenger_count",
    "trip_distance",
    # charges
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    # derived partition-style columns
    "trip_year",
    "trip_month",
]

In [0]:
# List everything in the stage and print the green-taxi CSV files.
# The filter uses 'type=green' because the load loop in this notebook reads
# files under ".../type=green/green_tripdata_*.csv"; this listing is the
# sanity check that those source files are actually staged.
df_stages=session.sql(" list @azure_csv_stage").collect()
for val in df_stages:
  stagedFileName = val.name
  if 'transactional-data' in stagedFileName and 'type=green' in stagedFileName and '.csv' in stagedFileName:
    print(stagedFileName)
    

#### Define schema for source data

In [0]:
# Source CSV schema for green taxi trip files from 2019 onwards
# (getTaxiSchema returns this schema as its fallback).
# Each entry is (column name, Snowpark type class); every field is nullable.
_greenTripFields = [
    ("vendor_id", IntegerType),
    ("pickup_datetime", TimestampType),
    ("dropoff_datetime", TimestampType),
    ("store_and_fwd_flag", StringType),
    ("rate_code_id", IntegerType),
    ("pickup_location_id", IntegerType),
    ("dropoff_location_id", IntegerType),
    ("passenger_count", IntegerType),
    ("trip_distance", DoubleType),
    ("fare_amount", DoubleType),
    ("extra", DoubleType),
    ("mta_tax", DoubleType),
    ("tip_amount", DoubleType),
    ("tolls_amount", DoubleType),
    ("ehail_fee", DoubleType),
    ("improvement_surcharge", DoubleType),
    ("total_amount", DoubleType),
    ("payment_type", IntegerType),
    ("trip_type", IntegerType),
    ("congestion_surcharge", DoubleType),
]

greenTripSchema = StructType(
    [StructField(fieldName, fieldType(), True) for fieldName, fieldType in _greenTripFields]
)


#### Helper functions: schema lookup and schema homogenization

In [0]:
#1) Function to determine the source schema for a given year and month
#Sample call: print(getTaxiSchema(2019,1))

def getTaxiSchema(tripYear, tripMonth):
  """Return the source StructType that applies to the given trip period.

  Args:
    tripYear:  Calendar year of the source file (e.g. 2019).
    tripMonth: Calendar month of the source file (1-12).

  Returns:
    The schema object for the matching period. Any period not matched by one
    of the explicit rules below (this includes 2019 and later, but also
    2017 H2, 2018, and 2013 up to July) falls back to greenTripSchema.

  NOTE(review): the legacy schemas referenced here (greenTripSchemaPre2015,
  greenTripSchema2015H1, greenTripSchema2015H22016H1, greenTripSchema2016H2,
  greenTripSchema2017H1) are not defined in the visible cells; asking for one
  of those periods raises NameError unless they are defined elsewhere.
  """
  # Aug 2013 - Dec 2014
  if tripYear == 2014 or (tripYear == 2013 and tripMonth > 7):
    return greenTripSchemaPre2015

  # Jan 2015 - Jun 2015
  if tripYear == 2015 and tripMonth < 7:
    return greenTripSchema2015H1

  # Jul 2015 - Jun 2016
  if (tripYear == 2015 and tripMonth > 6) or (tripYear == 2016 and tripMonth < 7):
    return greenTripSchema2015H22016H1

  # Jul 2016 - Dec 2016
  if tripYear == 2016 and tripMonth > 6:
    return greenTripSchema2016H2

  # Jan 2017 - Jun 2017
  if tripYear == 2017 and tripMonth < 7:
    return greenTripSchema2017H1

  # Everything else; intended for year >= 2019.
  return greenTripSchema


In [0]:
#2) Function to add columns to a dataframe as required to homogenize schema
#Sample call: getSchemaHomogenizedDataframe(DF,2019,6)

def getSchemaHomogenizedDataframe(sourceDF,tripYear,tripMonth):
  """Add the columns a source dataframe lacks relative to the canonical schema.

  Args:
    sourceDF:  Dataframe read from a source CSV; must expose with_column().
    tripYear:  Year of the source file. Only years >= 2019 are transformed.
    tripMonth: Month of the source file. Currently unused; kept so the
               signature stays stable for callers.

  Returns:
    For tripYear >= 2019, a dataframe with these columns added:
    pickup/dropoff longitude and latitude (empty-string literals), trip_year,
    trip_month, taxi_type ("green"), junk1 and junk2. For any other year the
    input dataframe is returned unchanged.
  """
  if(tripYear >=2019):
    # NOTE(review): substring() is applied directly to the pickup_datetime
    # column; this presumably relies on the timestamp rendering as
    # 'YYYY-MM-DD ...' when converted to text - confirm against the session's
    # timestamp output format.
    sourceDF = (sourceDF.with_column("pickup_longitude", lit(""))
              .with_column("pickup_latitude", lit(""))
              .with_column("dropoff_longitude", lit(""))
              .with_column("dropoff_latitude", lit(""))
              # Snowflake SUBSTR positions are 1-based: characters 1-4 are the
              # year, characters 6-7 are the month.
              .with_column("trip_year",substring(col("pickup_datetime"),1, 4))
              .with_column("trip_month",substring(col("pickup_datetime"),6,2))
              .with_column("taxi_type",lit("green"))
              # junk1/junk2 are not in canonicalTripSchemaColList; kept only so
              # the returned column set is unchanged for existing callers.
              .with_column("junk1",lit(""))
              .with_column("junk2",lit("")))

  return sourceDF


#### Read CSV, homogenize schema across years, save to a Snowflake table

In [0]:
# Root stage path for the source CSV files; the load loop appends
# "year=.../month=.../type=green/<file>.csv" to this prefix.
srcDataDirRoot="@azure_csv_stage/transactional-data/"

In [0]:


# Load each monthly green-taxi CSV (Jan 2019 through Feb 2022), bring it to the
# canonical column layout and append it to green_taxi_trips_raw.
# NOTE(review): the write uses append mode, so re-running this cell adds the
# same rows to the table again.
for tripYear in (2019, 2020, 2021, 2022):
  # range() excludes its stop value: 13 covers months 1-12, 3 covers months 1-2
  # (only January and February are loaded for 2022).
  monthStop = 3 if tripYear == 2022 else 13

  for tripMonth in range(1, monthStop):
    srcDataFile = "{}year={}/month={:02d}/type=green/green_tripdata_{}-{:02d}.csv".format(
        srcDataDirRoot, tripYear, tripMonth, tripYear, tripMonth)
    print("Year={}; Month={}".format(tripYear, tripMonth))
    print(srcDataFile)

    # Read the CSV with the schema that applies to this period.
    taxiSchema = getTaxiSchema(tripYear, tripMonth)
    csvReadOptions = {"skip_header": 1, "field_delimiter":"," ,"FIELD_OPTIONALLY_ENCLOSED_BY": '"' }
    taxiDF = (
        session.read
        .schema(taxiSchema)
        .options(csvReadOptions)
        .csv(srcDataFile)
    )

    # Add the missing columns, then project to the canonical column order.
    taxiFormattedDF = getSchemaHomogenizedDataframe(taxiDF, tripYear, tripMonth)
    taxiCanonicalDF = taxiFormattedDF.select(*canonicalTripSchemaColList)

    taxiCanonicalDF.write.mode("append").saveAsTable("green_taxi_trips_raw")


In [0]:
# Sanity check: total number of rows currently in green_taxi_trips_raw.
session.sql("select count(*) from green_taxi_trips_raw").collect()

In [0]:
# Preview ten rows of green_taxi_trips_raw as a pandas DataFrame.
session.sql("select top 10 * from green_taxi_trips_raw").toPandas()

Unnamed: 0,TAXI_TYPE,VENDOR_ID,PICKUP_DATETIME,DROPOFF_DATETIME,STORE_AND_FWD_FLAG,RATE_CODE_ID,PICKUP_LOCATION_ID,DROPOFF_LOCATION_ID,PICKUP_LONGITUDE,PICKUP_LATITUDE,DROPOFF_LONGITUDE,DROPOFF_LATITUDE,PASSENGER_COUNT,TRIP_DISTANCE,FARE_AMOUNT,EXTRA,MTA_TAX,TIP_AMOUNT,TOLLS_AMOUNT,EHAIL_FEE,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,PAYMENT_TYPE,TRIP_TYPE,TRIP_YEAR,TRIP_MONTH
0,green,2,2020-07-01 00:05:18,2020-07-01 00:22:07,N,1,134,35,,,,,2,6.38,20.5,0.5,0.5,0.0,0.0,,0.3,21.8,2,1,2020,7
1,green,2,2020-07-01 00:47:06,2020-07-01 00:52:13,N,1,41,42,,,,,1,1.06,6.0,0.5,0.5,1.46,0.0,,0.3,8.76,1,1,2020,7
2,green,2,2020-07-01 00:24:59,2020-07-01 00:35:18,N,1,42,159,,,,,1,2.1,9.0,0.5,0.5,0.0,0.0,,0.3,10.3,2,1,2020,7
3,green,2,2020-07-01 00:55:12,2020-07-01 00:58:45,N,1,116,116,,,,,1,0.7,5.0,0.5,0.5,0.0,0.0,,0.3,6.3,2,1,2020,7
4,green,2,2020-07-01 00:12:36,2020-07-01 00:20:14,N,1,43,141,,,,,1,1.84,8.0,0.5,0.5,0.0,0.0,,0.3,12.05,2,1,2020,7
5,green,2,2020-07-01 00:30:55,2020-07-01 00:37:05,N,5,74,262,,,,,1,2.04,27.0,0.0,0.0,0.0,0.0,,0.3,30.05,2,1,2020,7
6,green,2,2020-07-01 00:13:00,2020-07-01 00:19:09,N,1,159,119,,,,,1,1.35,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2,1,2020,7
7,green,2,2020-07-01 00:39:09,2020-07-01 00:40:55,N,1,75,75,,,,,1,0.35,-3.5,-0.5,-0.5,0.0,0.0,,-0.3,-4.8,4,1,2020,7
8,green,2,2020-07-01 00:39:09,2020-07-01 00:40:55,N,1,75,75,,,,,1,0.35,3.5,0.5,0.5,0.0,0.0,,0.3,4.8,2,1,2020,7
9,green,2,2020-07-01 00:45:59,2020-07-01 01:01:02,N,1,75,87,,,,,1,8.17,24.0,0.5,0.5,4.21,0.0,,0.3,32.26,1,1,2020,7
