# Create Ingest Logic

### Incremental and Bulk Extract, Load and Transform
We expect to receive new data every month, which we will load incrementally.  Here we will create some functions that wrap the ELT functions provided by the Data Engineer.

In [1]:
from dags.snowpark_connection import snowpark_connect
# Open a Snowpark session.  snowpark_connect() returns the session together
# with a state dictionary that later cells update and persist to disk.
session, state_dict = snowpark_connect()

In [2]:
from dags import elt as ELT

# NOTE(review): `snp` and `uuid` are not referenced anywhere in this notebook
# as shown -- confirm whether these imports are still needed.
import snowflake.snowpark as snp
import uuid 

# Call the project's reset helper before testing the ELT functions.
# prestaged=False matches the download-from-public-bucket path used below.
ELT.reset_database(session=session, state_dict=state_dict, prestaged=False)

# Record the download location and the stage/table names that the ELT
# functions are given in the following cells.
state_dict.update({'download_base_url': 'https://s3.amazonaws.com/tripdata/',
                       'load_table_name': 'RAW_',
                       'trips_table_name': 'TRIPS',
                       'load_stage_name': 'LOAD_STAGE'
                  })

# Persist the state so later cells can pass the file path to
# snowpark_connect('./include/state.json').
import json
with open('./include/state.json', 'w') as sdf:
    json.dump(state_dict, sdf)

First we will test the ELT functions.  We pick a few files that represent the various schemas and file-naming conventions.

In [None]:
# Sample archives chosen to cover the different file-name patterns and schemas.
files_to_download = [
    '201402-citibike-tripdata.zip',
    '202102-citibike-tripdata.csv.zip',
    '202003-citibike-tripdata.csv.zip',
]

# Keep the individual names available under their original identifiers.
file_name_end1, file_name_end2, file_name_end3 = files_to_download

In [None]:
%%time
# Run the extract step for the sample files.  The call returns the stage
# locations and the per-schema file lists that the load step consumes.
load_stage_names, files_to_load = ELT.extract_trips_to_stage(session=session, 
                                                            files_to_download=files_to_download, 
                                                            download_base_url=state_dict['download_base_url'], 
                                                            load_stage_name=state_dict['load_stage_name'])

This ELT logic downloads each file to the local system in order to unzip it and then upload it to a stage.  This can be really slow depending on network speed.  Later we will provide a __bulk-load option that uses data already in gzip format in order to speed up the hands-on-lab__.

In [None]:
%%time

# Append '.gz' to every file name returned by the extract step.
# NOTE(review): presumably the staged copies are gzip-compressed under these
# names -- confirm against dags/elt.py.
files_to_load['schema1']=[file+'.gz' for file in files_to_load['schema1']]
files_to_load['schema2']=[file+'.gz' for file in files_to_load['schema2']]

# Run the load step; the returned table names are passed to the transform step.
stage_table_names = ELT.load_trips_to_raw(session=session, 
                                          files_to_load=files_to_load, 
                                          load_stage_names=load_stage_names, 
                                          load_table_name=state_dict['load_table_name'])

In [None]:
%%time
# Run the transform step over the tables produced by the load step, targeting
# the trips table name stored in state_dict.
trips_table_name = ELT.transform_trips(session=session, 
                                       stage_table_names=stage_table_names, 
                                       trips_table_name=state_dict['trips_table_name'])

Since there are two separate schemas we will create two separate ingest paths.  For that we will want to separate the files into two groups like the following.

In [3]:
from datetime import datetime

files_to_ingest = ['202004-citibike-tripdata.csv.zip', '202102-citibike-tripdata.csv.zip']

# Files whose leading YYYYMM stamp is on or after this month go to schema2.
schema2_start_date = datetime.strptime('202102', "%Y%m")

# Bucket every file by schema, swapping the '.zip' suffix for '.gz'.
files_to_load = {'schema1': [], 'schema2': []}
for file_name in files_to_ingest:
    file_start_date = datetime.strptime(file_name.split("-")[0], "%Y%m")
    schema_key = 'schema2' if file_start_date >= schema2_start_date else 'schema1'
    files_to_load[schema_key].append(file_name.replace('.zip', '.gz'))

# Expose the per-schema lists under their individual names as well.
schema1_download_files = files_to_load['schema1']
schema2_download_files = files_to_load['schema2']

files_to_load

{'schema1': ['202004-citibike-tripdata.csv.gz'],
 'schema2': ['202102-citibike-tripdata.csv.gz']}

Here we create the incremental ELT function as well as a bulk load function.  The bulk ingest function wraps the incremental ingest with a full set of data to bootstrap the project.

In [4]:
%%writefile dags/ingest.py
def incremental_elt(session, 
                    state_dict:dict, 
                    files_to_ingest:list, 
                    download_base_url,
                    use_prestaged=False) -> str:
    """Run extract (optional), load and transform for a list of trip files.

    Args:
        session: Session object handed through to the ``dags.elt`` functions.
        state_dict: Must contain ``load_stage_name``, ``load_table_name`` and
            ``trips_table_name``.
        files_to_ingest: Archive file names.  On the pre-staged path each name
            must start with a ``YYYYMM`` stamp followed by ``-``.
        download_base_url: Base URL passed to the extract step.  It is not
            read when ``use_prestaged`` is true.
        use_prestaged: When true, skip the extract step and derive the stage
            locations and file names locally instead.

    Returns:
        The value returned by ``ELT.transform_trips``.
    """
    import dags.elt as ELT
    from datetime import datetime

    stage_root = state_dict['load_stage_name']
    raw_table_prefix = state_dict['load_table_name']
    target_table = state_dict['trips_table_name']

    schema_keys = ('schema1', 'schema2')

    if not use_prestaged:
        print("Extracting files from public location.")
        load_stage_names, files_to_load = ELT.extract_trips_to_stage(
            session=session,
            files_to_download=files_to_ingest,
            download_base_url=download_base_url,
            load_stage_name=stage_root,
        )

        # The load step is given the extracted names with '.gz' appended.
        for key in schema_keys:
            files_to_load[key] = [staged + '.gz' for staged in files_to_load[key]]
    else:
        print("Skipping extract.  Using provided bucket for pre-staged files.")

        # Names stamped before this month go to schema1, the rest to schema2.
        schema2_cutoff = datetime.strptime('202102', "%Y%m")

        files_to_load = {key: [] for key in schema_keys}
        for archive_name in files_to_ingest:
            archive_month = datetime.strptime(archive_name.split("-")[0], "%Y%m")
            bucket = 'schema2' if archive_month >= schema2_cutoff else 'schema1'
            files_to_load[bucket].append(archive_name.replace('.zip', '.gz'))

        # Each schema has its own sub-path under the load stage.
        load_stage_names = {key: stage_root + '/' + key + '/' for key in schema_keys}

    print("Loading files to raw.")
    stage_table_names = ELT.load_trips_to_raw(
        session=session,
        files_to_load=files_to_load,
        load_stage_names=load_stage_names,
        load_table_name=raw_table_prefix,
    )

    print("Transforming records to trips table.")
    return ELT.transform_trips(
        session=session,
        stage_table_names=stage_table_names,
        trips_table_name=target_table,
    )

def bulk_elt(session, 
             state_dict:dict,
             download_base_url, 
             use_prestaged=False) -> str:
    """Ingest the full historical file list with one ``incremental_elt`` call.

    One archive name is generated per month for two naming conventions:
    ``YYYYMM-citibike-tripdata.zip`` for 201306 through 201612 and
    ``YYYYMM-citibike-tripdata.csv.zip`` for 201701 through 201912.

    Args:
        session: Session object forwarded to ``incremental_elt``.
        state_dict: State dictionary forwarded to ``incremental_elt``.
        download_base_url: Forwarded to ``incremental_elt``.
        use_prestaged: Forwarded to ``incremental_elt``.

    Returns:
        The value returned by ``incremental_elt``.
    """
    from dags.ingest import incremental_elt

    import pandas as pd
    from datetime import datetime

    def month_stamps(first_month, last_month):
        # Inclusive list of 'YYYYMM' strings, one per calendar month.
        span = pd.period_range(start=datetime.strptime(first_month, "%Y%m"),
                               end=datetime.strptime(last_month, "%Y%m"),
                               freq='M')
        return span.strftime("%Y%m").to_list()

    # (first month, last month, file-name suffix) for each naming convention.
    naming_eras = (
        ("201306", "201612", '-citibike-tripdata.zip'),
        ("201701", "201912", '-citibike-tripdata.csv.zip'),
    )

    files_to_extract = []
    for first_month, last_month, suffix in naming_eras:
        files_to_extract.extend(stamp + suffix
                                for stamp in month_stamps(first_month, last_month))

    return incremental_elt(session=session,
                           state_dict=state_dict,
                           files_to_ingest=files_to_extract,
                           use_prestaged=use_prestaged,
                           download_base_url=download_base_url)


Overwriting dags/ingest.py


The incremental ELT function allows us to specify one or more files to extract, load and transform.  Let's try it with a couple of examples.  Start with a single file.

In [None]:
%%time
from dags.ingest import incremental_elt
from dags.elt import reset_database
from dags.snowpark_connection import snowpark_connect

# Connect using the state file written by an earlier cell.
session, state_dict = snowpark_connect('./include/state.json')

# Switch to the warehouse named under compute_parameters -> fe_warehouse.
session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# Call the reset helper from dags/elt.py before ingesting.
reset_database(session=session, state_dict=state_dict, prestaged=False)

# Ingest one monthly file through the full extract/load/transform path.
incremental_elt(session=session, 
                state_dict=state_dict, 
                files_to_ingest=['202001-citibike-tripdata.csv.zip'], 
                download_base_url=state_dict['download_base_url'],
                use_prestaged=False)
session.close()

We may need to ingest a list of multiple files.

In [None]:
%%time
from dags.ingest import incremental_elt
from dags.elt import reset_database
from dags.snowpark_connection import snowpark_connect

# Connect using the state file written by an earlier cell.
session, state_dict = snowpark_connect('./include/state.json')

# Switch to the warehouse named under compute_parameters -> fe_warehouse.
session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# Call the reset helper from dags/elt.py before ingesting.
reset_database(session=session, state_dict=state_dict, prestaged=False)

# Ingest two monthly files in one call; their date stamps fall on either side
# of the 202102 schema cutoff used in this notebook.
incremental_elt(session=session, 
                state_dict=state_dict, 
                files_to_ingest=['202002-citibike-tripdata.csv.zip', '202102-citibike-tripdata.csv.zip'], 
                download_base_url=state_dict['download_base_url'],
                use_prestaged=False)

session.close()

These load functions will default to loading from the public citibike data set.  However, we may want to be able to specify files already pre-downloaded into a different S3 bucket.  The functions assume the files are in gzip format in that bucket.

In [None]:
%%time
from dags.ingest import incremental_elt
from dags.elt import reset_database
from dags.snowpark_connection import snowpark_connect

# Connect using the state file written by an earlier cell.
session, state_dict = snowpark_connect('./include/state.json')

# Switch to the warehouse named under compute_parameters -> fe_warehouse.
session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# Call the reset helper in pre-staged mode this time.
reset_database(session=session, state_dict=state_dict, prestaged=True)

# Ingest from the pre-staged location, skipping the extract step.
# NOTE: download_base_url is read from connection_parameters here, and
# incremental_elt does not use it when use_prestaged=True.
incremental_elt(session=session, 
                state_dict=state_dict, 
                files_to_ingest=['202001-citibike-tripdata.csv.zip', '202102-citibike-tripdata.csv.zip'],
                download_base_url=state_dict['connection_parameters']['download_base_url'],
                use_prestaged=True)
session.close()

We could also bulk load the entire historical dataset using the following.  This takes at least 30 minutes, depending on network speed to your local system. See below for an alternative.

In [None]:
# %%time
# from dags.ingest import bulk_elt
# from dags.elt import reset_database
# from dags.snowpark_connection import snowpark_connect

# session, state_dict = snowpark_connect('./include/state.json')

# session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# reset_database(session=session, state_dict=state_dict, prestaged=False)

# bulk_elt(session=session, 
#          state_dict=state_dict, 
#          use_prestaged=False, 
#          download_base_url='https://s3.amazonaws.com/tripdata/')
# session.close()

For the hands-on-lab we will bulk load from a different S3 bucket where the files are already in gzip format (see below).  

For this project we are going back in time and pretending it is January 2020 (so that we can experience the effect of data drift during COVID lockdown).  So this bulk load ingests from an existing bucket with data from June 2013 through December 2019 (the file list built by `bulk_elt` ends at `201912`).

In [5]:
%%time
from dags.ingest import bulk_elt
from dags.elt import reset_database
from dags.snowpark_connection import snowpark_connect

# Connect with the default arguments (no state file path is passed here).
session, state_dict = snowpark_connect()

# Record the stage/table names used by the ELT functions.
# NOTE(review): unlike the earlier state_dict.update cell, no top-level
# 'download_base_url' key is set here, yet the incremental cells above read
# state_dict['download_base_url'] after reloading this file -- confirm that
# snowpark_connect() supplies it, otherwise re-running those cells after
# this one would raise KeyError.
state_dict.update({'load_table_name': 'RAW_',
                   'trips_table_name': 'TRIPS',
                   'load_stage_name': 'LOAD_STAGE'
                  })
# Overwrite the persisted state file with the updated dictionary.
import json
with open('./include/state.json', 'w') as sdf:
    json.dump(state_dict, sdf)

# Call the reset helper in pre-staged mode.
reset_database(session=session, state_dict=state_dict, prestaged=True)

# Switch to the warehouse named under compute_parameters -> fe_warehouse.
session.use_warehouse(state_dict['compute_parameters']['fe_warehouse'])

# Bulk load the historical files from the pre-staged location; the call
# returns the trips table name, which is shown as the cell output.
bulk_elt(session=session, 
         state_dict=state_dict, 
         download_base_url=state_dict['connection_parameters']['download_base_url'],
         use_prestaged=True)

Skipping extract.  Using provided bucket for pre-staged files.
Loading files to raw.
Transforming records to trips table.
CPU times: user 433 ms, sys: 134 ms, total: 567 ms
Wall time: 45 s


'TRIPS'

In [None]:
# Count the rows in the trips table named in state_dict.
session.table(state_dict['trips_table_name']).count()

In [None]:
# Close the session opened by the bulk-load cell.
session.close()

Without the need to download locally we ingested ~90 million records in well under a minute (about 45 seconds of wall time in the run shown above).