### Importing libraries

In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2
import snowflake.connector as sfc

### Postgres setup

In [2]:
# Postgres side of the comparison: where the table lives, and which columns
# the summary queries aggregate on.
pg_schema, pg_table = 'com_zx_courier', 'stg_user'
pg_date_col, pg_id_col = 'created_date', 'id'

### Snowflake setup

In [3]:
# Snowflake side of the comparison: where the table lives, and which columns
# the summary queries aggregate on.
sf_schema, sf_table = 'BR_ZE', 'STR_USER'
sf_date_col, sf_id_col = 'created_date', 'id'

### Postgres connection and query

In [4]:
def gdw_query(query):
    """Run a SQL query against the Postgres warehouse and return a DataFrame.

    Connection settings are read from the environment variables ``PGUSER``,
    ``PGPASSWORD``, ``PGHOST``, ``PGPORT`` and ``PGDATABASE``. Any variable
    that is not set falls back to the value this notebook previously
    hard-coded, so an unconfigured environment behaves exactly as before
    while credentials no longer have to be typed into the notebook.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``query``.
    """
    conn = psycopg2.connect(
        user=os.environ.get('PGUSER', 'postgres'),
        password=os.environ.get('PGPASSWORD', ''),
        host=os.environ.get('PGHOST', 'localhost'),
        port=os.environ.get('PGPORT', '5432'),
        database=os.environ.get('PGDATABASE', 'gdwprod'),
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Release the connection whether the query succeeded or raised.
        conn.close()

In [6]:
# Identifiers substituted into every Postgres query template below.
# A trailing backslash inside a template joins that line with the next one.
pg_format_args = dict(
    schema=pg_schema,
    table=pg_table,
    date_col=pg_date_col,
    id_col=pg_id_col,
)

# Whole-table summary: first/last date, first/last id, distinct ids, row count.
pg_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    min({id_col}) first_id,\
                    max({id_col}) last_id,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries
                from {schema}.{table};'''.format(**pg_format_args)

# Same summary, restricted to rows whose month falls before March 2020.
pg_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    min({id_col}) first_id,\
                    max({id_col}) last_id,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(**pg_format_args)

# Distinct ids and row count per calendar month, ordered by month.
pg_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col})) 
                order by date_trunc('month', date({date_col}));'''.format(**pg_format_args)

### Snowflake connection and query

In [7]:
def snow_query(query):
    """Run a SQL query against Snowflake and return a DataFrame.

    Connection settings are read from the environment variables
    ``SNOWFLAKE_USER``, ``SNOWFLAKE_PASSWORD``, ``SNOWFLAKE_ACCOUNT``,
    ``SNOWFLAKE_WAREHOUSE`` and ``SNOWFLAKE_DATABASE``. Any variable that is
    not set falls back to the value this notebook previously hard-coded
    (user and password were empty strings), so credentials can be supplied
    without editing the notebook.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``query``.
    """
    conn = sfc.connect(
        user=os.environ.get('SNOWFLAKE_USER', ''),
        password=os.environ.get('SNOWFLAKE_PASSWORD', ''),
        account=os.environ.get('SNOWFLAKE_ACCOUNT', 'zxventures.us-east-1'),
        warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'WH_INTERACTIVE'),
        database=os.environ.get('SNOWFLAKE_DATABASE', 'OMNICHANNEL'),
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Release the connection whether the query succeeded or raised.
        conn.close()

In [8]:
# Identifiers substituted into every Snowflake query template below.
# A trailing backslash inside a template joins that line with the next one.
sf_format_args = dict(
    schema=sf_schema,
    table=sf_table,
    date_col=sf_date_col,
    id_col=sf_id_col,
)

# Whole-table summary: first/last date, first/last id, distinct ids, row count.
sf_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    min({id_col}) first_id,\
                    max({id_col}) last_id,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries
                from {schema}.{table};'''.format(**sf_format_args)

# Same summary, restricted to rows whose month falls before March 2020.
sf_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    min({id_col}) first_id,\
                    max({id_col}) last_id,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(**sf_format_args)

# Distinct ids and row count per calendar month, ordered by month.
sf_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(distinct {id_col}) num_id,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col}))
                order by date_trunc('month', date({date_col}));'''.format(**sf_format_args)

## Various overall metrics

In [9]:
# Fetch the whole-table summary from each warehouse; each call opens and
# closes its own connection.
pg1 = gdw_query(pg_query1)
sf1 = snow_query(sf_query1)
# Display the Postgres summary.
pg1

Unnamed: 0,first_entry,last_entry,first_id,last_id,num_id,num_entries
0,2019-04-02 18:31:05.786241,2020-06-12 05:13:43.456006,1664,2549328,2290619,2290619


In [10]:
# Display the Snowflake whole-table summary fetched with sf_query1.
sf1

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,FIRST_ID,LAST_ID,NUM_ID,NUM_ENTRIES
0,2020-01-01 00:00:05.791858,2020-06-12 00:05:19.297824,740978,2543151,1799307,1799307


In [11]:
# Lay the two one-row summaries side by side: one row per metric, one column
# per warehouse.
str_order_metrics = pd.DataFrame(
    index=pg1.columns,
    columns=['Postgres', 'Snowflake', 'is_identical'],
)
# Values are assigned positionally, so both summaries must list the metrics
# in the same order.
str_order_metrics['Postgres'] = pg1.T.values
str_order_metrics['Snowflake'] = sf1.T.values
# Flag, per metric, whether both warehouses report the same value.
str_order_metrics['is_identical'] = str_order_metrics['Postgres'].eq(str_order_metrics['Snowflake'])

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2019-04-02 18:31:05.786241,2020-01-01 00:00:05.791858,False
last_entry,2020-06-12 05:13:43.456006,2020-06-12 00:05:19.297824,False
first_id,1664,740978,False
last_id,2549328,2543151,False
num_id,2290619,1799307,False
num_entries,2290619,1799307,False


## Various overall metrics - before March 2020

In [12]:
# Fetch the before-March-2020 summary from each warehouse; each call opens
# and closes its own connection.
pg2 = gdw_query(pg_query2)
sf2 = snow_query(sf_query2)
# Display the Postgres summary.
pg2

Unnamed: 0,first_entry,last_entry,first_id,last_id,num_id,num_entries
0,2019-04-02 18:31:05.786241,2020-02-29 23:59:54.153645,1664,990737,734354,734354


In [13]:
# Display the Snowflake before-March-2020 summary fetched with sf_query2.
sf2

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,FIRST_ID,LAST_ID,NUM_ID,NUM_ENTRIES
0,2020-01-01 00:00:05.791858,2020-02-29 23:59:54.153645,740978,990737,248937,248937


In [14]:
# Side-by-side comparison of the before-March-2020 summaries: one row per
# metric, one column per warehouse.
# Index on pg2's own columns so this cell depends only on the frames it
# compares, not on pg1 from the earlier whole-table section.
str_order_metrics = pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg2.columns)
# Values are assigned positionally, so both summaries must list the metrics
# in the same order.
str_order_metrics['Postgres'] = pg2.T.values
str_order_metrics['Snowflake'] = sf2.T.values
# Flag, per metric, whether both warehouses report the same value.
str_order_metrics['is_identical'] = str_order_metrics['Postgres'] == str_order_metrics['Snowflake']

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2019-04-02 18:31:05.786241,2020-01-01 00:00:05.791858,False
last_entry,2020-02-29 23:59:54.153645,2020-02-29 23:59:54.153645,True
first_id,1664,740978,False
last_id,990737,990737,True
num_id,734354,248937,False
num_entries,734354,248937,False


## Month-wise number of entries

In [15]:
# Monthly counts from Postgres, indexed by the parsed month; the
# which_month column itself is kept in the frame.
pg3 = gdw_query(pg_query3)
pg3 = pg3.set_index(pd.to_datetime(pg3.which_month))
pg3.head()

Unnamed: 0_level_0,which_month,num_id,num_entries
which_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-01 00:00:00+00:00,2019-04-01 00:00:00+00:00,105,105
2019-05-01 00:00:00+00:00,2019-05-01 00:00:00+00:00,22823,22823
2019-06-01 00:00:00+00:00,2019-06-01 00:00:00+00:00,85484,85484
2019-07-01 00:00:00+00:00,2019-07-01 00:00:00+00:00,101845,101845
2019-08-01 00:00:00+00:00,2019-08-01 00:00:00+00:00,109522,109522


In [50]:
# Monthly counts from Snowflake, indexed by the month parsed as UTC; the
# WHICH_MONTH column itself is kept in the frame.
sf3 = snow_query(sf_query3)
sf3 = sf3.set_index(pd.to_datetime(sf3['WHICH_MONTH'], utc=True))
sf3.head()

Unnamed: 0_level_0,WHICH_MONTH,NUM_ID,NUM_ENTRIES
WHICH_MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01 00:00:00+00:00,2020-01-01,112251,112251
2020-02-01 00:00:00+00:00,2020-02-01,136686,136686
2020-03-01 00:00:00+00:00,2020-03-01,240506,240506
2020-04-01 00:00:00+00:00,2020-04-01,478491,478491
2020-05-01 00:00:00+00:00,2020-05-01,611748,611748


In [52]:
# Compare monthly row counts between the two warehouses.
# Index on the union of both month indexes so that a month present in only
# one warehouse still appears (with NaN on the other side) instead of being
# silently left out of the comparison.
str_order_monthwise = pd.DataFrame(
    columns=['Postgres', 'Snowflake', 'is_identical'],
    index=pg3.index.union(sf3.index),
)
str_order_monthwise.index.name = 'Month'
# Series assignment aligns on the month index; a month missing from a source
# becomes NaN in that source's column.
str_order_monthwise['Postgres'] = pg3['num_entries']
str_order_monthwise['Snowflake'] = sf3['NUM_ENTRIES']
# NaN never compares equal, so months missing on either side are flagged False.
str_order_monthwise['is_identical'] = str_order_monthwise.Postgres == str_order_monthwise.Snowflake

str_order_monthwise.head()

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-01 00:00:00+00:00,105,,False
2019-05-01 00:00:00+00:00,22823,,False
2019-06-01 00:00:00+00:00,85484,,False
2019-07-01 00:00:00+00:00,101845,,False
2019-08-01 00:00:00+00:00,109522,,False


In [53]:
# Show only the months where the two warehouses disagree or one side is missing.
str_order_monthwise.loc[str_order_monthwise['is_identical'].ne(True)]

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-01 00:00:00+00:00,105,,False
2019-05-01 00:00:00+00:00,22823,,False
2019-06-01 00:00:00+00:00,85484,,False
2019-07-01 00:00:00+00:00,101845,,False
2019-08-01 00:00:00+00:00,109522,,False
2019-09-01 00:00:00+00:00,111763,,False
2019-10-01 00:00:00+00:00,67143,,False
2019-11-01 00:00:00+00:00,92539,,False
2019-12-01 00:00:00+00:00,51972,,False
2020-02-01 00:00:00+00:00,91158,136686.0,False
