# What's in this exercise?

1) Read the raw trip data, augment it with derived attributes and reference (lookup) data, and persist the result as a Snowflake table.

In [0]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import col,lit,substring,call_builtin
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import StringType,IntegerType,DoubleType,TimestampType,TimeType
from snowflake.snowpark.types import StructType, StructField
import os

In [0]:
# --- Secrets: pulled from Databricks secret scopes so no credential is hard-coded ---
User = dbutils.secrets.get("snowparkdetails", "username")
Password = dbutils.secrets.get("snowparkdetails", "password")
Account = dbutils.secrets.get("snowparkdetails", "account")
# NOTE(review): TenandId (sic) is not referenced by any cell visible in this
# notebook section - confirm whether it is still needed.
TenandId = dbutils.secrets.get("gen2-storage", "tenant-id")

# --- Snowflake session context ---
SchemaName, DatabaseName = "taxi", "NYCTAXI"
# NOTE(review): the session runs under ACCOUNTADMIN - confirm a narrower role
# is not sufficient for this job.
Warehouse, DBrole = "cluster1", "ACCOUNTADMIN"

# Keyword form builds the same mapping (same keys, same insertion order).
CONNECTION_PARAMETERS = dict(
    account=Account,
    user=User,
    password=Password,
    schema=SchemaName,
    database=DatabaseName,
    warehouse=Warehouse,
    role=DBrole,
)

session = Session.builder.configs(CONNECTION_PARAMETERS).create()

# Sanity check: print the warehouse / database / schema the session landed in.
current_context = session.sql('select current_warehouse(), current_database(), current_schema()').collect()
print(current_context)

### Read raw, augment, persist as Snowflake Tables

In [0]:
# Build the curated trip DataFrame from a single SQL statement:
#   * source rows: taxi.yellow_taxi_trips_raw (aliased t), de-duplicated with SELECT DISTINCT;
#   * reference data: left outer joins to the vendor, trip-month, payment-type and
#     rate-code lookups, plus taxi_zone_lookup joined twice (tzpu for the pickup
#     location, tzdo for the dropoff location);
#   * derived attributes: year/month/day/hour/minute/second/date parts of both
#     pickup_datetime and dropoff_datetime.
# t.payment_type is cast to int only for the join against payment_type_lookup, and
# the payment-type description is cast to varchar(30) in the projection.
curatedDF = session.sql("""
  select distinct t.taxi_type,
      t.vendor_id as vendor_id,
      t.pickup_datetime,
      t.dropoff_datetime,
      t.store_and_fwd_flag,
      t.rate_code_id,
      t.pickup_location_id,
      t.dropoff_location_id,
      t.pickup_longitude,
      t.pickup_latitude,
      t.dropoff_longitude,
      t.dropoff_latitude,
      t.passenger_count,
      t.trip_distance,
      t.fare_amount,
      t.extra,
      t.mta_tax,
      t.tip_amount,
      t.tolls_amount,
      t.improvement_surcharge,
      t.total_amount,
      t.payment_type,
      t.trip_year,
      t.trip_month,
      v.abbreviation as vendor_abbreviation,
      v.description  as vendor_description,
      tm.month_name_short,
      tm.month_name_full,
      cast(pt.description as varchar(30)) as payment_type_description,
      rc.description as rate_code_description,
      tzpu.borough as pickup_borough,
      tzpu.zone as pickup_zone,
      tzpu.service_zone as pickup_service_zone,
      tzdo.borough as dropoff_borough,
      tzdo.zone as dropoff_zone,
      tzdo.service_zone as dropoff_service_zone,
      year(t.pickup_datetime) as pickup_year,
      month(t.pickup_datetime) as pickup_month,
      day(t.pickup_datetime) as pickup_day,
      hour(t.pickup_datetime) as pickup_hour,
      minute(t.pickup_datetime) as pickup_minute,
      second(t.pickup_datetime) as pickup_second,
      date(t.pickup_datetime) as pickup_date,
      year(t.dropoff_datetime) as dropoff_year,
      month(t.dropoff_datetime) as dropoff_month,
      day(t.dropoff_datetime) as dropoff_day,
      hour(t.dropoff_datetime) as dropoff_hour,
      minute(t.dropoff_datetime) as dropoff_minute,
      second(t.dropoff_datetime) as dropoff_second,
      date(t.dropoff_datetime) as dropoff_date
  from 
    taxi.yellow_taxi_trips_raw t
    left outer join taxi.vendor_lookup v 
      on (t.vendor_id = v.vendor_id )
    left outer join taxi.trip_month_lookup tm 
      on (t.trip_month = tm.trip_month)
    left outer join taxi.payment_type_lookup pt 
      on (cast(t.payment_type  as int) = pt.payment_type )
    left outer join taxi.rate_code_lookup rc 
      on (t.rate_code_id = rc.rate_code_id)
    left outer join taxi.taxi_zone_lookup tzpu 
      on (t.pickup_location_id = tzpu.location_id)
    left outer join taxi.taxi_zone_lookup tzdo 
      on (t.dropoff_location_id = tzdo.location_id)
  """)

# Conform vendor_id and payment_type to string columns. Each column is cast into
# a temporary "temp_<name>" column, the original is dropped, and the temporary
# column is renamed back - vendor_id first, then payment_type.
curatedDFConformed = curatedDF
for column_name in ("vendor_id", "payment_type"):
    temp_name = "temp_" + column_name
    curatedDFConformed = (
        curatedDFConformed
        .withColumn(temp_name, col(column_name).cast("string"))
        .drop(column_name)
        .withColumnRenamed(temp_name, column_name)
    )


In [0]:
# Preview: pull the first 10 conformed rows into pandas; the bare last
# expression is what the cell renders.
preview_rows = curatedDFConformed.limit(10)
preview_rows.toPandas()

# Disabled check, kept for reference (approximate distinct payment types):
# curatedDF.select(call_builtin("approx_count_distinct", col("payment_type"))).show()

Unnamed: 0,TAXI_TYPE,PICKUP_DATETIME,DROPOFF_DATETIME,STORE_AND_FWD_FLAG,RATE_CODE_ID,PICKUP_LOCATION_ID,DROPOFF_LOCATION_ID,PICKUP_LONGITUDE,PICKUP_LATITUDE,DROPOFF_LONGITUDE,DROPOFF_LATITUDE,PASSENGER_COUNT,TRIP_DISTANCE,FARE_AMOUNT,EXTRA,MTA_TAX,TIP_AMOUNT,TOLLS_AMOUNT,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,TRIP_YEAR,TRIP_MONTH,VENDOR_ABBREVIATION,VENDOR_DESCRIPTION,MONTH_NAME_SHORT,MONTH_NAME_FULL,PAYMENT_TYPE_DESCRIPTION,RATE_CODE_DESCRIPTION,PICKUP_BOROUGH,PICKUP_ZONE,PICKUP_SERVICE_ZONE,DROPOFF_BOROUGH,DROPOFF_ZONE,DROPOFF_SERVICE_ZONE,PICKUP_YEAR,PICKUP_MONTH,PICKUP_DAY,PICKUP_HOUR,PICKUP_MINUTE,PICKUP_SECOND,PICKUP_DATE,DROPOFF_YEAR,DROPOFF_MONTH,DROPOFF_DAY,DROPOFF_HOUR,DROPOFF_MINUTE,DROPOFF_SECOND,DROPOFF_DATE,VENDOR_ID,PAYMENT_TYPE
0,yellow,2019-03-01 00:24:41,2019-03-01 00:25:31,N,1,145,145,,,,,1,0.0,2.5,0.5,0.5,0.0,0.0,0.3,3.8,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Cash,Standard rate,Queens,Long Island City/Hunters Point,Boro Zone,Queens,Long Island City/Hunters Point,Boro Zone,2019,3,1,0,24,41,2019-03-01,2019,3,1,0,25,31,2019-03-01,1,2
1,yellow,2019-03-01 00:23:06,2019-03-01 00:35:30,N,1,170,50,,,,,1,1.92,10.5,0.5,0.5,3.58,0.0,0.3,17.88,2019,3,VeriFone Inc.,VeriFone Inc.,Mar,,Credit card,Standard rate,Manhattan,Murray Hill,Yellow Zone,Manhattan,Clinton West,Yellow Zone,2019,3,1,0,23,6,2019-03-01,2019,3,1,0,35,30,2019-03-01,2,1
2,yellow,2019-03-01 00:52:52,2019-03-01 01:11:06,N,1,79,41,,,,,1,6.61,19.5,0.5,0.5,2.0,0.0,0.3,25.3,2019,3,VeriFone Inc.,VeriFone Inc.,Mar,,Credit card,Standard rate,Manhattan,East Village,Yellow Zone,Manhattan,Central Harlem,Boro Zone,2019,3,1,0,52,52,2019-03-01,2019,3,1,1,11,6,2019-03-01,2,1
3,yellow,2019-03-01 00:24:54,2019-03-01 00:38:41,N,1,144,40,,,,,1,3.6,14.0,3.0,0.5,3.55,0.0,0.3,21.35,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Credit card,Standard rate,Manhattan,Little Italy/NoLiTa,Yellow Zone,Brooklyn,Carroll Gardens,Boro Zone,2019,3,1,0,24,54,2019-03-01,2019,3,1,0,38,41,2019-03-01,1,1
4,yellow,2019-03-01 00:17:26,2019-03-01 00:23:58,N,1,162,164,,,,,1,0.89,6.0,0.5,0.5,0.0,0.0,0.3,9.8,2019,3,VeriFone Inc.,VeriFone Inc.,Mar,,Cash,Standard rate,Manhattan,Midtown East,Yellow Zone,Manhattan,Midtown South,Yellow Zone,2019,3,1,0,17,26,2019-03-01,2019,3,1,0,23,58,2019-03-01,2,2
5,yellow,2019-03-01 00:34:22,2019-03-01 00:38:35,N,1,158,186,,,,,1,1.2,5.5,3.0,0.5,2.3,0.0,0.3,11.6,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Credit card,Standard rate,Manhattan,Meatpacking/West Village West,Yellow Zone,Manhattan,Penn Station/Madison Sq West,Yellow Zone,2019,3,1,0,34,22,2019-03-01,2019,3,1,0,38,35,2019-03-01,1,1
6,yellow,2019-03-01 00:02:30,2019-03-01 00:05:19,N,1,229,162,,,,,1,0.32,4.0,0.5,0.5,0.0,0.0,0.3,7.8,2019,3,VeriFone Inc.,VeriFone Inc.,Mar,,Cash,Standard rate,Manhattan,Sutton Place/Turtle Bay North,Yellow Zone,Manhattan,Midtown East,Yellow Zone,2019,3,1,0,2,30,2019-03-01,2019,3,1,0,5,19,2019-03-01,2,2
7,yellow,2019-03-01 00:51:18,2019-03-01 01:01:19,N,1,41,42,,,,,1,1.8,9.5,0.5,0.5,0.0,0.0,0.3,10.8,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Cash,Standard rate,Manhattan,Central Harlem,Boro Zone,Manhattan,Central Harlem North,Boro Zone,2019,3,1,0,51,18,2019-03-01,2019,3,1,1,1,19,2019-03-01,1,2
8,yellow,2019-03-01 00:28:42,2019-03-01 00:38:48,N,1,161,262,,,,,1,2.3,10.0,3.0,0.5,2.75,0.0,0.3,16.55,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Credit card,Standard rate,Manhattan,Midtown Center,Yellow Zone,Manhattan,Yorkville East,Yellow Zone,2019,3,1,0,28,42,2019-03-01,2019,3,1,0,38,48,2019-03-01,1,1
9,yellow,2019-03-01 00:41:42,2019-03-01 00:48:25,N,1,239,143,,,,,0,1.2,7.0,3.0,0.5,0.0,0.0,0.3,10.8,2019,3,Creative Mobile Technologies LLC,Creative Mobile Technologies LLC,Mar,,Cash,Standard rate,Manhattan,Upper West Side South,Yellow Zone,Manhattan,Lincoln Square West,Yellow Zone,2019,3,1,0,41,42,2019-03-01,2019,3,1,0,48,25,2019-03-01,1,2


In [0]:
# Persist the conformed DataFrame as the Snowflake table
# yellow_taxi_trips_curated, using "overwrite" save mode.
# The table name is unqualified here, whereas a later query in this notebook
# references it as taxi.yellow_taxi_trips_curated.
curated_writer = curatedDFConformed.write.mode("overwrite")
curated_writer.saveAsTable("yellow_taxi_trips_curated")

In [0]:
# Total number of rows in the curated table.
row_count_sql = "select count(*) from yellow_taxi_trips_curated"
session.sql(row_count_sql).collect()

In [0]:
# Trip counts per (trip_year, trip_month) for 2019-2022, newest month first.
# Adjacent literals concatenate into the same single-line statement.
trips_per_month_sql = (
    "select trip_year,trip_month, count(*) as trip_count "
    "from taxi.yellow_taxi_trips_curated "
    "where trip_year in (2019,2020,2021,2022) "
    "group by trip_year,trip_month "
    "order by trip_year desc,trip_month desc"
)
session.sql(trips_per_month_sql).toPandas()

Unnamed: 0,TRIP_YEAR,TRIP_MONTH,TRIP_COUNT
0,2022,5,8
1,2022,4,11
2,2022,3,23
3,2022,2,2979382
4,2022,1,2463927
5,2021,12,3214294
6,2021,11,920796
7,2021,10,3463485
8,2021,9,2963778
9,2021,8,2788300
