### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import snowflake.connector as sfc

### Postgres setup

In [2]:
pg_schema = 'com_zx_courier'
pg_table = 'stg_users_facebook'

pg_date_col = 'create_date'

### Snowflake setup

In [3]:
sf_schema = 'BR_ZE'
sf_table = 'STR_USERS_FACEBOOK'

sf_date_col = 'create_date'

### Postgres connection and query

In [4]:
def gdw_query(query):
    conn = psycopg2.connect(
                    user='postgres',
                    password='',
                    host='localhost',
                    port='5432',
                    database='gdwprod')
    try:
        df = pd.read_sql_query(query, conn)
        return df
    finally:
        conn.close()

In [5]:
pg_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(*) num_entries
                from {schema}.{table};'''.format(
                                                schema=pg_schema, 
                                                table=pg_table,
                                                date_col=pg_date_col)

pg_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(
                                                                                            schema=pg_schema, 
                                                                                            table=pg_table,
                                                                                            date_col=pg_date_col)


pg_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col})) 
                order by date_trunc('month', date({date_col}));'''.format(
                                                                    schema=pg_schema, 
                                                                    table=pg_table,
                                                                    date_col=pg_date_col)

### Snowflake connection and query

In [6]:
def snow_query(query):
    conn = sfc.connect(
                    user='',
                    password='',
                    account='zxventures.us-east-1',
                    warehouse='WH_INTERACTIVE',
                    database='OMNICHANNEL',)
    try:
        df = pd.read_sql_query(query, conn)
        return df
    finally:
        conn.close()

In [7]:
sf_query1 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(*) num_entries
                from {schema}.{table};'''.format(
                                                schema=sf_schema, 
                                                table=sf_table,
                                                date_col=sf_date_col)

sf_query2 = '''select min({date_col}) first_entry,\
                    max({date_col}) last_entry,\
                    count(*) num_entries\
            from {schema}.{table}\
            where date_trunc('month', date({date_col})) < date_trunc('month', date('2020-03-01'));'''.format(
                                                                                            schema=sf_schema, 
                                                                                            table=sf_table,
                                                                                            date_col=sf_date_col)


sf_query3 = '''select date_trunc('month', date({date_col})) which_month,\
                    count(*) num_entries\
                from {schema}.{table} group by date_trunc('month', date({date_col}))
                order by date_trunc('month', date({date_col}));'''.format(
                                                                    schema=sf_schema, 
                                                                    table=sf_table,
                                                                    date_col=sf_date_col)

## Various overall metrics

In [8]:
pg1 = gdw_query(pg_query1)
sf1 = snow_query(sf_query1)
pg1

Unnamed: 0,first_entry,last_entry,num_entries
0,2015-12-06T18:16:57Z,2020-05-31T04:50:35,1593976


In [9]:
sf1

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,NUM_ENTRIES
0,2015-12-06T18:16:57Z,2020-06-11T04:47:30,1717409


In [10]:
str_order_metrics = pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg1.columns)
str_order_metrics['Postgres'] = pg1.T.values
str_order_metrics['Snowflake'] = sf1.T.values
str_order_metrics['is_identical'] = str_order_metrics['Postgres'] == str_order_metrics['Snowflake']

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2015-12-06T18:16:57Z,2015-12-06T18:16:57Z,True
last_entry,2020-05-31T04:50:35,2020-06-11T04:47:30,False
num_entries,1593976,1717409,False


## Various overall metrics - before March 2020

In [11]:
pg2 = gdw_query(pg_query2)
sf2 = snow_query(sf_query2)
pg2

Unnamed: 0,first_entry,last_entry,num_entries
0,2015-12-06T18:16:57Z,2020-02-29T03:06:29,738453


In [12]:
sf2

Unnamed: 0,FIRST_ENTRY,LAST_ENTRY,NUM_ENTRIES
0,2015-12-06T18:16:57Z,2020-02-29T03:06:29,738453


In [13]:
str_order_metrics = pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg1.columns)
str_order_metrics['Postgres'] = pg2.T.values
str_order_metrics['Snowflake'] = sf2.T.values
str_order_metrics['is_identical'] = str_order_metrics['Postgres'] == str_order_metrics['Snowflake']

str_order_metrics

Unnamed: 0,Postgres,Snowflake,is_identical
first_entry,2015-12-06T18:16:57Z,2015-12-06T18:16:57Z,True
last_entry,2020-02-29T03:06:29,2020-02-29T03:06:29,True
num_entries,738453,738453,True


## Month-wise number of orders

In [14]:
pg3 = gdw_query(pg_query3)
pg3.index = pd.to_datetime(pg3.which_month)
pg3.head()

Unnamed: 0_level_0,which_month,num_entries
which_month,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-01 00:00:00+00:00,2015-12-01 00:00:00+00:00,43
2016-01-01 00:00:00+00:00,2016-01-01 00:00:00+00:00,29
2016-02-01 00:00:00+00:00,2016-02-01 00:00:00+00:00,100
2016-03-01 00:00:00+00:00,2016-03-01 00:00:00+00:00,112
2016-04-01 00:00:00+00:00,2016-04-01 00:00:00+00:00,180


In [15]:
sf3 = snow_query(sf_query3)
sf3.index = pd.to_datetime(sf3['WHICH_MONTH'], utc=True)
sf3.head()

Unnamed: 0_level_0,WHICH_MONTH,NUM_ENTRIES
WHICH_MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-01 00:00:00+00:00,2015-12-01,43
2016-01-01 00:00:00+00:00,2016-01-01,29
2016-02-01 00:00:00+00:00,2016-02-01,100
2016-03-01 00:00:00+00:00,2016-03-01,112
2016-04-01 00:00:00+00:00,2016-04-01,180


In [16]:
str_order_monthwise= pd.DataFrame(columns=['Postgres', 'Snowflake', 'is_identical'], index=pg3.index)
str_order_monthwise.index.name = 'Month'
str_order_monthwise['Postgres'] = pg3['num_entries']
str_order_monthwise['Snowflake'] = sf3['NUM_ENTRIES']
str_order_monthwise['is_identical'] = str_order_monthwise.Postgres == str_order_monthwise.Snowflake

str_order_monthwise.head()

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-01 00:00:00+00:00,43,43,True
2016-01-01 00:00:00+00:00,29,29,True
2016-02-01 00:00:00+00:00,100,100,True
2016-03-01 00:00:00+00:00,112,112,True
2016-04-01 00:00:00+00:00,180,180,True


In [17]:
str_order_monthwise[str_order_monthwise['is_identical']!= True]

Unnamed: 0_level_0,Postgres,Snowflake,is_identical
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
