# What's in this exercise?

1) Read the raw trip data, augment it with derived attributes and with reference (lookup) data, and persist the result<BR>

In [0]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import col,lit,substring,call_builtin
from snowflake.snowpark import functions as F
from snowflake.snowpark.types import StringType,IntegerType,DoubleType,TimestampType,TimeType
from snowflake.snowpark.types import StructType, StructField
import os

In [0]:
# Snowflake login details are read from the "snowparkdetails" secret scope.
# `dbutils` is not imported in this notebook; presumably the notebook runtime
# injects it -- TODO confirm.
User = dbutils.secrets.get("snowparkdetails", "username")
Password = dbutils.secrets.get("snowparkdetails", "password")
Account = dbutils.secrets.get("snowparkdetails", "account")
# NOTE(review): this value is not used in any cell visible here, and the name
# looks like a typo of "TenantId" -- verify against the rest of the notebook.
TenandId = dbutils.secrets.get("gen2-storage", "tenant-id")

# Snowflake objects this session is bound to.
SchemaName, DatabaseName = "taxi", "NYCTAXI"
Warehouse, DBrole = "cluster1", "ACCOUNTADMIN"
# NOTE(review): ACCOUNTADMIN looks broader than a read/augment/persist job
# needs -- consider a narrower role.

# Keyword form of the same mapping handed to the Snowpark session builder.
CONNECTION_PARAMETERS = dict(
    account=Account,
    user=User,
    password=Password,
    schema=SchemaName,
    database=DatabaseName,
    warehouse=Warehouse,
    role=DBrole,
)

session = (
    Session.builder
    .configs(CONNECTION_PARAMETERS)
    .create()
)
# Echo the warehouse / database / schema the new session actually landed in.
print(
    session.sql(
        'select current_warehouse(), current_database(), current_schema()'
    ).collect()
)

###  Read raw, augment, persist as Snowflake Table

In [0]:
# Build the curated green-taxi trip DataFrame from a single SQL statement:
#   * trip columns are taken as-is from taxi.green_taxi_trips_raw (alias t);
#   * descriptive columns are attached from the vendor, trip-type, trip-month,
#     payment-type and rate-code lookup tables;
#   * taxi.taxi_zone_lookup is joined twice -- as tzpu on the pickup location
#     id and as tzdo on the dropoff location id -- to get borough / zone /
#     service zone for both ends of the trip;
#   * year/month/day/hour/minute/second parts are derived from both
#     pickup_datetime and dropoff_datetime.
# Every lookup is a left (outer) join, so a raw trip with no matching lookup
# row is still returned, with NULLs in the lookup-derived columns.
curatedDF = session.sql("""
  select 
      t.taxi_type,
      t.vendor_id,
      t.pickup_datetime,
      t.dropoff_datetime,
      t.store_and_fwd_flag,
      t.rate_code_id,
      t.pickup_location_id,
      t.dropoff_location_id,
      t.pickup_longitude,
      t.pickup_latitude,
      t.dropoff_longitude,
      t.dropoff_latitude,
      t.passenger_count,
      t.trip_distance,
      t.fare_amount,
      t.extra,
      t.mta_tax,
      t.tip_amount,
      t.tolls_amount,
      t.ehail_fee,
      t.improvement_surcharge,
      t.total_amount,
      t.payment_type,
      t.trip_type,
      t.trip_year,
      t.trip_month,
      v.abbreviation as vendor_abbreviation,
      v.description as vendor_description,
      tt.description as trip_type_description,
      tm.month_name_short,
      tm.month_name_full,
      pt.description as payment_type_description,
      rc.description as rate_code_description,
      tzpu.borough as pickup_borough,
      tzpu.zone as pickup_zone,
      tzpu.service_zone as pickup_service_zone,
      tzdo.borough as dropoff_borough,
      tzdo.zone as dropoff_zone,
      tzdo.service_zone as dropoff_service_zone,
      year(t.pickup_datetime) as pickup_year,
      month(t.pickup_datetime) as pickup_month,
      day(t.pickup_datetime) as pickup_day,
      hour(t.pickup_datetime) as pickup_hour,
      minute(t.pickup_datetime) as pickup_minute,
      second(t.pickup_datetime) as pickup_second,
      year(t.dropoff_datetime) as dropoff_year,
      month(t.dropoff_datetime) as dropoff_month,
      day(t.dropoff_datetime) as dropoff_day,
      hour(t.dropoff_datetime) as dropoff_hour,
      minute(t.dropoff_datetime) as dropoff_minute,
      second(t.dropoff_datetime) as dropoff_second
  from 
    taxi.green_taxi_trips_raw t
    left outer join taxi.vendor_lookup v 
      on (t.vendor_id = v.vendor_id)
    left outer join taxi.trip_type_lookup tt 
      on (t.trip_type = tt.trip_type)
    left outer join taxi.trip_month_lookup tm 
      on (t.trip_month = tm.trip_month)
    left outer join taxi.payment_type_lookup pt 
      on (t.payment_type = pt.payment_type)
    left outer join taxi.rate_code_lookup rc 
      on (t.rate_code_id = rc.rate_code_id)
    left outer join taxi.taxi_zone_lookup tzpu 
      on (t.pickup_location_id = tzpu.location_id)
    left join taxi.taxi_zone_lookup tzdo 
      on (t.dropoff_location_id = tzdo.location_id)
  """)


In [0]:
# Persist the curated result as the table green_taxi_trips_curated; the
# "overwrite" mode replaces whatever a previous run left there.
(
    curatedDF.write
    .mode("overwrite")
    .saveAsTable("green_taxi_trips_curated")
)

In [0]:
# Row count of the curated DataFrame, shown as the cell output.
# NOTE(review): presumably this re-runs the join query against the raw table
# rather than reading the table saved above -- confirm if the cost matters.
curatedDF.count()

###  Explore

In [0]:
# Row count read back from the persisted curated table.
session.sql(
    "select count(*) from green_taxi_trips_curated"
).collect()

In [0]:
# Trips per (trip_year, trip_month) in ascending order, returned as a pandas
# DataFrame. The adjacent literals concatenate into a single SQL string.
session.sql(
    "select trip_year,trip_month, count(*) as trip_count "
    "from taxi.green_taxi_trips_curated "
    "group by trip_year,trip_month "
    "order by trip_year, trip_month"
).toPandas()

Unnamed: 0,TRIP_YEAR,TRIP_MONTH,TRIP_COUNT
0,2008,10,1
1,2008,12,23
2,2009,1,140
3,2010,9,52
4,2018,3,1
5,2018,12,66
6,2019,1,630830
7,2019,2,575672
8,2019,3,601060
9,2019,4,514387
