# What's in this exercise?
We run the common functions notebook so we can reuse capability defined there, and then...<BR>
1) Load the yellow taxi data from the staging directory and save it as a raw Snowflake table<BR>

In [0]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import col,lit,substring
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import StringType,IntegerType,DoubleType,TimestampType,TimeType
from snowflake.snowpark.types import StructType, StructField
import os

In [0]:
# Snowflake objects and role this notebook connects with.
SchemaName = "taxi"
DatabaseName = "NYCTAXI"
Warehouse = "cluster1"
DBrole = "ACCOUNTADMIN"

# Credentials are fetched through dbutils.secrets so they never appear in the notebook text.
User = dbutils.secrets.get("snowparkdetails", "username")
Password = dbutils.secrets.get("snowparkdetails", "password")
Account = dbutils.secrets.get("snowparkdetails", "account")
# NOTE(review): TenandId is not referenced anywhere else in the visible notebook - confirm it is still needed.
TenandId = dbutils.secrets.get("gen2-storage", "tenant-id")

# Everything Session.builder needs, gathered in one mapping.
CONNECTION_PARAMETERS = dict(
    account=Account,
    user=User,
    password=Password,
    schema=SchemaName,
    database=DatabaseName,
    warehouse=Warehouse,
    role=DBrole,
)

# Open the Snowpark session, then echo the active warehouse/database/schema as a quick sanity check.
session = (
    Session.builder
    .configs(CONNECTION_PARAMETERS)
    .create()
)
print(
    session.sql('select current_warehouse(), current_database(), current_schema()').collect()
)

In [0]:
# List everything on the stage and print only the CSV files under transactional-data.
df_stages = session.sql(" list @azure_csv_stage").collect()
for val in df_stages:
  stagedName = val.name
  # Skip entries that are not transactional-data CSV files.
  if 'transactional-data' not in stagedName or '.csv' not in stagedName:
    continue
  print(stagedName)
    

#Canonical ordered column list for yellow taxi across years to homogenize schema
#(the order of this list is the order in which columns are selected before the data is saved)
canonicalTripSchemaColList = [
    "taxi_type", "vendor_id",
    "pickup_datetime", "dropoff_datetime",
    "store_and_fwd_flag", "rate_code_id",
    "pickup_location_id", "dropoff_location_id",
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
    "passenger_count", "trip_distance",
    "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount",
    "improvement_surcharge", "total_amount",
    "payment_type",
    "trip_year", "trip_month",
]

#### Define schema for source data

In [0]:
# Target column order for the homogenized yellow taxi data.
# NOTE(review): this repeats, value for value, the definition made in the preceding code cell.
canonicalTripSchemaColList = [
    "taxi_type",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    "pickup_location_id",
    "dropoff_location_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_year",
    "trip_month",
]

In [0]:
#2019 and above
#Schema handed to the CSV reader for yellow taxi files; every field is declared nullable.
yellowTripSchema = StructType([
    StructField(fieldName, fieldType(), True)
    for fieldName, fieldType in (
        ("vendor_id", StringType),
        ("pickup_datetime", TimestampType),
        ("dropoff_datetime", TimestampType),
        ("passenger_count", IntegerType),
        ("trip_distance", DoubleType),
        ("rate_code_id", IntegerType),
        ("store_and_fwd_flag", StringType),
        ("pickup_location_id", IntegerType),
        ("dropoff_location_id", IntegerType),
        ("payment_type", StringType),
        ("fare_amount", DoubleType),
        ("extra", DoubleType),
        ("mta_tax", DoubleType),
        ("tip_amount", DoubleType),
        ("tolls_amount", DoubleType),
        ("improvement_surcharge", DoubleType),
        ("total_amount", DoubleType),
        ("congestion_surcharge", DoubleType),
    )
])


#### Some functions

In [0]:
#1) Function to determine schema for a given year and month

def getTaxiSchema(tripYear, tripMonth):
  """Return the source schema that applies to the given trip year and month.

  Input:  tripYear and tripMonth as numbers
  Output: the schema object chosen for that period
  Sample call: print(getTaxiSchema(2019, 1))

  Selection rules, checked in this order:
    year after 2008 and before 2015   -> yellowTripSchemaPre2015
    2016 with month after 6           -> yellowTripSchema2016H2
    2015, or 2016 with month before 7 -> yellowTripSchema20152016H1
    2017 with month before 7          -> yellowTripSchema2017H1
    anything else                     -> yellowTripSchema

  NOTE(review): only yellowTripSchema is defined in the visible part of this
  notebook; the period-specific schemas are presumably defined elsewhere -
  confirm before calling this for years before 2019.
  """
  if 2008 < tripYear < 2015:
    return yellowTripSchemaPre2015

  if tripYear == 2016 and tripMonth > 6:
    return yellowTripSchema2016H2

  if tripYear == 2015 or (tripYear == 2016 and tripMonth < 7):
    return yellowTripSchema20152016H1

  if tripYear == 2017 and tripMonth < 7:
    return yellowTripSchema2017H1

  # No period-specific rule matched: use the default schema.
  return yellowTripSchema

In [0]:
#2) Function to add columns to dataframe as required to homogenize schema

def getSchemaHomogenizedDataframe(sourceDF,tripYear, tripMonth):
  """Add and retype columns so a source dataframe carries the canonical column set.

  Input:  sourceDF (dataframe read from a source file), tripYear, tripMonth
          (tripMonth is accepted but not used by the current logic)
  Output: for tripYear >= 2019, a dataframe with the extra columns described
          below; for any other year, sourceDF exactly as it was passed in
  Sample call: getSchemaHomogenizedDataframe(taxiDF, 2019, 6)
  """
  # Only 2019-and-later data is transformed; everything else is handed back untouched.
  if not (tripYear >= 2019):
    return sourceDF

  homogenizedDF = sourceDF

  # Coordinate columns are not part of the 2019+ source schema, so they are added as empty strings.
  for blankColumn in ("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"):
    homogenizedDF = homogenizedDF.with_column(blankColumn, lit(""))

  # Year and month are cut out of pickup_datetime with substring(); taxi_type is a constant.
  pickupYear = substring(col("pickup_datetime"),0, 4)
  pickupMonth = substring(col("pickup_datetime"),6,2)
  homogenizedDF = (homogenizedDF.with_column("trip_year", pickupYear)
                   .with_column("trip_month", pickupMonth)
                   .with_column("taxi_type", lit("yellow")))

  # Placeholder columns; they are not in canonicalTripSchemaColList.
  for junkColumn in ("junk1", "junk2"):
    homogenizedDF = homogenizedDF.with_column(junkColumn, lit(""))

  # Re-create vendor_id and payment_type as string columns via a temporary column.
  for castColumn in ("vendor_id", "payment_type"):
    tempColumn = "temp_" + castColumn
    homogenizedDF = (homogenizedDF.with_column(tempColumn, col(castColumn).cast("string"))
                     .drop(castColumn)
                     .with_column_renamed(tempColumn, castColumn))

  return homogenizedDF


#### Read CSV, homogenize schema across years, save as Snowflake table

In [0]:
# Root location of the transactional CSV files on the azure_csv_stage stage; per-file paths are built by appending to it.
srcDataDirRoot="@azure_csv_stage/transactional-data/"

In [0]:
#Process data, save as Snowflake table
#For each year/month: read that month's CSV from the stage, homogenize its columns,
#and append the result to yellow_taxi_trips_raw.
#NOTE(review): the write mode is "append", so re-running this cell adds the same rows again.

for tripYear in (2019, 2020, 2021, 2022):
  #range() excludes its end value: 2022 covers months 1-2, every other year months 1-12
  endMonth = 3 if tripYear == 2022 else 13

  for tripMonth in range(1, endMonth):
    srcDataFile = "{}year={}/month={:02d}/type=yellow/yellow_tripdata_{}-{:02d}.csv".format(
        srcDataDirRoot, tripYear, tripMonth, tripYear, tripMonth)
    print("Year={}; Month={}".format(tripYear, tripMonth))
    print(srcDataFile)

    #Source schema
    taxiSchema = getTaxiSchema(tripYear, tripMonth)

    #Read the CSV with that schema: skip the header row, comma-delimited, optional double-quote enclosure
    csvReader = session.read.schema(taxiSchema).options(
        {"skip_header": 1, "field_delimiter":"," ,"FIELD_OPTIONALLY_ENCLOSED_BY": '"' })
    taxiDF = csvReader.csv(srcDataFile)

    #Add the missing columns, then put the columns into canonical order
    taxiFormattedDF = getSchemaHomogenizedDataframe(taxiDF, tripYear, tripMonth)
    taxiCanonicalDF = taxiFormattedDF.select(*canonicalTripSchemaColList)

    taxiCanonicalDF.write.mode("append").saveAsTable("yellow_taxi_trips_raw")


In [0]:
# Row count of yellow_taxi_trips_raw, as a check on the load performed above.
session.sql("select count(*) from yellow_taxi_trips_raw").collect()


In [0]:

# Repeats the row-count query from the previous cell.
session.sql("select count(*) from yellow_taxi_trips_raw").collect()

from
124330472


In [0]:
# Preview 10 rows of the raw table as a pandas DataFrame (no ORDER BY, so which rows come back is not fixed).
session.sql("""
select * from taxi.yellow_taxi_trips_raw limit 10;
""").toPandas()

Unnamed: 0,TAXI_TYPE,VENDOR_ID,PICKUP_DATETIME,DROPOFF_DATETIME,STORE_AND_FWD_FLAG,RATE_CODE_ID,PICKUP_LOCATION_ID,DROPOFF_LOCATION_ID,PICKUP_LONGITUDE,PICKUP_LATITUDE,DROPOFF_LONGITUDE,DROPOFF_LATITUDE,PASSENGER_COUNT,TRIP_DISTANCE,FARE_AMOUNT,EXTRA,MTA_TAX,TIP_AMOUNT,TOLLS_AMOUNT,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,PAYMENT_TYPE,TRIP_YEAR,TRIP_MONTH
0,yellow,1,2019-01-04 18:06:06,2019-01-04 18:21:06,N,1,138,233,,,,,2,8.2,23.5,1.0,0.5,6.2,5.76,0.3,37.26,1,2019,1
1,yellow,1,2019-01-04 18:27:08,2019-01-04 18:40:05,N,1,162,79,,,,,1,2.0,10.0,1.0,0.5,2.35,0.0,0.3,14.15,1,2019,1
2,yellow,1,2019-01-04 18:41:08,2019-01-04 18:53:39,N,1,79,231,,,,,1,1.8,10.0,1.0,0.5,2.35,0.0,0.3,14.15,1,2019,1
3,yellow,1,2019-01-04 18:57:47,2019-01-04 19:08:19,N,1,231,113,,,,,1,1.9,9.0,1.0,0.5,1.7,0.0,0.3,12.5,1,2019,1
4,yellow,1,2019-01-04 18:12:20,2019-01-04 18:20:42,N,1,100,164,,,,,1,1.1,7.0,1.0,0.5,1.75,0.0,0.3,10.55,1,2019,1
5,yellow,1,2019-01-04 18:24:23,2019-01-04 18:37:15,N,1,234,148,,,,,1,1.9,10.0,1.0,0.5,2.35,0.0,0.3,14.15,1,2019,1
6,yellow,1,2019-01-04 18:52:56,2019-01-04 19:09:50,N,1,87,79,,,,,1,4.1,15.5,1.0,0.5,3.46,0.0,0.3,20.76,1,2019,1
7,yellow,1,2019-01-04 18:14:47,2019-01-04 18:19:44,N,1,143,142,,,,,1,0.6,5.5,1.0,0.5,1.5,0.0,0.3,8.8,1,2019,1
8,yellow,1,2019-01-04 18:29:09,2019-01-04 18:35:17,N,1,163,230,,,,,1,0.8,6.0,1.0,0.5,0.0,0.0,0.3,7.8,2,2019,1
9,yellow,1,2019-01-04 18:39:16,2019-01-04 18:49:55,N,1,100,230,,,,,1,1.1,8.0,1.0,0.5,1.95,0.0,0.3,11.75,1,2019,1
