### Importing libraries

In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2
import snowflake.connector as sfc

### Postgres setup

In [2]:
# Location of the table on the Postgres side.
pg_schema, pg_table = 'com_zx_courier', 'stg_users_cognito'

# Column identifiers carry their own double quotes because they are pasted
# verbatim into the SQL text and contain upper-case letters.
pg_date_col, pg_id_col = '"UserCreateDate"', '"Username"'

### Snowflake setup

In [3]:
# Location of the table on the Snowflake side.
sf_schema, sf_table = 'BR_ZE', 'STR_USERS_COGNITO'

# Unquoted, upper-case column identifiers pasted verbatim into the SQL text.
sf_date_col, sf_id_col = 'USERCREATEDATE', 'USERNAME'

### Postgres connection and query

In [4]:
def gdw_query(query):
    """Run ``query`` on the Postgres database and return the rows as a DataFrame.

    Connection settings are read from environment variables so credentials do
    not have to be stored in the notebook. Each variable falls back to the
    value that used to be hard-coded here, so an unconfigured environment
    behaves exactly as before:

    * ``GDW_PG_USER``      (default ``'postgres'``)
    * ``GDW_PG_PASSWORD``  (default ``''``)
    * ``GDW_PG_HOST``      (default ``'localhost'``)
    * ``GDW_PG_PORT``      (default ``'5432'``)
    * ``GDW_PG_DATABASE``  (default ``'gdwprod'``)

    Parameters
    ----------
    query : str
        SQL text, passed unchanged to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The query result.

    Notes
    -----
    A new connection is opened for every call and is closed before the
    function returns, including when reading the query raises.
    """
    conn = psycopg2.connect(
        user=os.environ.get('GDW_PG_USER', 'postgres'),
        password=os.environ.get('GDW_PG_PASSWORD', ''),
        host=os.environ.get('GDW_PG_HOST', 'localhost'),
        port=os.environ.get('GDW_PG_PORT', '5432'),
        database=os.environ.get('GDW_PG_DATABASE', 'gdwprod'),
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even if the query failed.
        conn.close()

In [21]:
# Identifiers substituted into every Postgres query template below.
pg_query_args = dict(schema=pg_schema,
                     table=pg_table,
                     date_col=pg_date_col,
                     id_col=pg_id_col)

# Query 1 - whole table: earliest/latest date value, distinct ids, row count.
pg_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries
                from {schema}.{table};'''.format(**pg_query_args)

# Query 2 - same metrics, restricted to rows whose month is before March 2020.
pg_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(**pg_query_args)


# Query 3 - distinct ids and row count per calendar month, oldest month first.
pg_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col})) 
                order by date_trunc('month', date({date_col}));'''.format(**pg_query_args)

### Snowflake connection and query

In [6]:
def snow_query(query):
    """Run ``query`` on Snowflake and return the rows as a DataFrame.

    Connection settings are read from environment variables so credentials do
    not have to be stored in the notebook. Each variable falls back to the
    value that used to be hard-coded here, so an unconfigured environment
    behaves exactly as before:

    * ``SNOWFLAKE_USER``       (default ``''``)
    * ``SNOWFLAKE_PASSWORD``   (default ``''``)
    * ``SNOWFLAKE_ACCOUNT``    (default ``'zxventures.us-east-1'``)
    * ``SNOWFLAKE_WAREHOUSE``  (default ``'WH_INTERACTIVE'``)
    * ``SNOWFLAKE_DATABASE``   (default ``'OMNICHANNEL'``)

    Parameters
    ----------
    query : str
        SQL text, passed unchanged to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The query result.

    Notes
    -----
    A new connection is opened for every call and is closed before the
    function returns, including when reading the query raises.
    """
    conn = sfc.connect(
        user=os.environ.get('SNOWFLAKE_USER', ''),
        password=os.environ.get('SNOWFLAKE_PASSWORD', ''),
        account=os.environ.get('SNOWFLAKE_ACCOUNT', 'zxventures.us-east-1'),
        warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'WH_INTERACTIVE'),
        database=os.environ.get('SNOWFLAKE_DATABASE', 'OMNICHANNEL'),
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even if the query failed.
        conn.close()

In [22]:
# Identifiers substituted into every Snowflake query template below.
sf_query_args = dict(schema=sf_schema,
                     table=sf_table,
                     date_col=sf_date_col,
                     id_col=sf_id_col)

# Query 1 - whole table: earliest/latest date value, distinct ids, row count.
sf_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries
                from {schema}.{table};'''.format(**sf_query_args)

# Query 2 - same metrics, restricted to rows whose month is before March 2020.
sf_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(**sf_query_args)


# Query 3 - distinct ids and row count per calendar month, oldest month first.
sf_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col}))
                order by date_trunc('month', date({date_col}));'''.format(**sf_query_args)

## Various overall metrics

In [23]:
# Run the whole-table metrics query (query 1) against each database.
pg1 = gdw_query(pg_query1)
sf1 = snow_query(sf_query1)
# Show the Postgres result; the Snowflake result is displayed in the next cell.
pg1

Unnamed: 0,first_entry,last_entry,num_id,num_entries
0,2017-06-20 18:26:45.284,2020-06-09 02:37:31.503,1045004,1045004


In [24]:
# Snowflake result of the whole-table metrics query (query 1).
sf1

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,NUM_ID,NUM_ENTRIES
0,2017-06-20 18:26:45.284,2020-05-11 02:06:17.684,819873,819873


In [25]:
# One row per metric (labelled with the Postgres column names), one column per
# database, plus a flag telling whether the two values are equal.
str_order_metrics = (
    pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg1.columns)
    # The single result row of each query is transposed so it lines up with the metric rows.
    .assign(Postgres=pg1.T.values, Snowflake=sf1.T.values)
    .assign(is_identical=lambda metrics: metrics['Postgres'] == metrics['Snowflake'])
)

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2017-06-20 18:26:45.284000,2017-06-20 18:26:45.284000,True
last_entry,2020-06-09 02:37:31.503000,2020-05-11 02:06:17.684000,False
num_id,1045004,819873,False
num_entries,1045004,819873,False


## Various overall metrics - before March 2020

In [26]:
# Run the "before March 2020" metrics query (query 2) against each database.
pg2 = gdw_query(pg_query2)
sf2 = snow_query(sf_query2)
# Show the Postgres result; the Snowflake result is displayed in the next cell.
pg2

Unnamed: 0,first_entry,last_entry,num_id,num_entries
0,2017-06-20 18:26:45.284,2020-02-29 23:59:53.696,510500,510500


In [27]:
# Snowflake result of the "before March 2020" metrics query (query 2).
sf2

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,NUM_ID,NUM_ENTRIES
0,2017-06-20 18:26:45.284,2020-02-29 23:59:53.696,510500,510500


In [28]:
# Side-by-side comparison of the "before March 2020" metrics (query 2).
# Row labels are taken from pg2, the frame actually being compared, so this
# cell does not depend on the result of the earlier whole-table query.
str_order_metrics = pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg2.columns)
# Each query returns a single row; transpose it so it lines up with the metric rows.
str_order_metrics['Postgres'] = pg2.T.values
str_order_metrics['Snowflake'] = sf2.T.values
str_order_metrics['is_identical'] = str_order_metrics['Postgres'] == str_order_metrics['Snowflake']

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2017-06-20 18:26:45.284000,2017-06-20 18:26:45.284000,True
last_entry,2020-02-29 23:59:53.696000,2020-02-29 23:59:53.696000,True
num_id,510500,510500,True
num_entries,510500,510500,True


## Month-wise number of entries

In [29]:
# Month-wise counts from Postgres, indexed by the month so they can later be
# aligned with the Snowflake counts.
pg3 = gdw_query(pg_query3)
pg3.index = pd.to_datetime(pg3['which_month'])
pg3.head()

Unnamed: 0_level_0,which_month,num_id,num_entries
which_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-01 00:00:00+00:00,2017-06-01 00:00:00+00:00,143,143
2017-07-01 00:00:00+00:00,2017-07-01 00:00:00+00:00,4670,4670
2017-08-01 00:00:00+00:00,2017-08-01 00:00:00+00:00,5346,5346
2017-09-01 00:00:00+00:00,2017-09-01 00:00:00+00:00,7200,7200
2017-10-01 00:00:00+00:00,2017-10-01 00:00:00+00:00,4608,4608


In [30]:
# Month-wise counts from Snowflake, indexed by the month. utc=True makes the
# index timezone-aware (UTC).
sf3 = snow_query(sf_query3)
sf3.index = pd.to_datetime(sf3.WHICH_MONTH, utc=True)
sf3.head()

Unnamed: 0_level_0,WHICH_MONTH,NUM_ID,NUM_ENTRIES
WHICH_MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-01 00:00:00+00:00,2017-06-01,143,143
2017-07-01 00:00:00+00:00,2017-07-01,4670,4670
2017-08-01 00:00:00+00:00,2017-08-01,5346,5346
2017-09-01 00:00:00+00:00,2017-09-01,7200,7200
2017-10-01 00:00:00+00:00,2017-10-01,4608,4608


In [31]:
# Month-by-month comparison of the row counts in both databases.
# Building the frame from the two Series aligns them on the union of their
# month indexes, so a month that exists in only one database still gets a row
# (with NaN on the side where it is missing) instead of being dropped.
# rename_axis returns a new frame, so the index of pg3 keeps its own name.
str_order_monthwise = pd.DataFrame({
    'Postgres': pg3['num_entries'],
    'Snowflake': sf3['NUM_ENTRIES'],
}).rename_axis('Month')
# NaN never compares equal, so months missing on either side are flagged False.
str_order_monthwise['is_identical'] = str_order_monthwise['Postgres'] == str_order_monthwise['Snowflake']

str_order_monthwise.head()

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-01 00:00:00+00:00,143,143.0,True
2017-07-01 00:00:00+00:00,4670,4670.0,True
2017-08-01 00:00:00+00:00,5346,5346.0,True
2017-09-01 00:00:00+00:00,7200,7200.0,True
2017-10-01 00:00:00+00:00,4608,4608.0,True


In [32]:
# Keep only the months whose counts differ between the two databases
# (is_identical is the boolean result of the == comparison).
str_order_monthwise.loc[~str_order_monthwise['is_identical']]

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-01 00:00:00+00:00,226343,61620.0,False
2020-06-01 00:00:00+00:00,60408,,False
