# Pull query results from Postgres and export them as CSV files

In [1]:
#Get the previous date
from datetime import date, timedelta, datetime
import os
import time

import pandas as pd
#Interacting with Postgres SQL
import psycopg2

#importing database
from database_config import postgres as cfg
from database_config import cust_list

# Establishing Connection to Postgres via psycopg2

In [2]:
# Open the psycopg2 connection to PostgreSQL (feeddate) that every query in this
# notebook reuses. Credentials come from database_config; the port is fixed to 5432.
conn = psycopg2.connect(
    database=cfg['database'],
    user=cfg['user'],
    password=cfg['password'],
    host=cfg['host'],
    port="5432",
)

In [3]:
# Today's local date as an ISO 'YYYY-MM-DD' string.
# NOTE: this is the current day; nothing is subtracted. Use
# date.today() - timedelta(days=1) if the previous day is wanted.
todays_date = date.today().isoformat()

In [4]:
# Customers to report on, taken from database_config. The loops below iterate its
# keys (formatted into the SQL as customer_id) and look up important_customers[key]
# for the customer name used in log output and in the report.
important_customers = cust_list

## Query

In [14]:
# SQL template for the per-customer n/a report.
# - Both `{}` placeholders are filled with the same customer id via str.format.
# - The inner query aggregates curated_product_fields per
#   (curation_task_id, customer_id, product_id, name): "Total_NAs" counts values equal
#   to 'n/a', curation_tasks_count counts the field rows in the group.
# - The outer query keeps curation tasks started in the last 14 days whose resolution
#   is NULL or 'misclassified', and truncates started_at to the day.
# NOTE: the backslash after `cpf.product_id,` is a Python line continuation inside the
# string literal; it only joins that line with the next one and is never sent to Postgres.
company_na_reports = (  
'''SELECT 
    ct."id" AS curation_task_id, 
    ct.customer_id, 
    ct.resolution, 
    --- truncated the timestamp from the date for aggregation.
    date_trunc('day',ct.started_at) as "started_at",
    cpf.product_id,
    cpf."name" AS attribute_name, 
    cpf."Total_NAs", 
    cpf.curation_tasks_count
FROM "curation_tasks" AS ct

INNER JOIN 
    (SELECT 
        cpf.curation_task_id,
        cpf.customer_id,
        cpf.product_id,
        cpf.name, 
        SUM(CASE WHEN cpf.value = 'n/a' THEN 1 ELSE 0 END) as "Total_NAs",
        COUNT(cpf.id) AS curation_tasks_count
    FROM "public"."curated_product_fields"  as cpf
    WHERE customer_id = {}
    GROUP BY
        cpf.curation_task_id,    
        cpf.customer_id,
        cpf.product_id,\
        cpf.name
    ) AS cpf
ON ct.id = cpf.curation_task_id

WHERE started_at >= CURRENT_TIMESTAMP - INTERVAL '14 days'
--- specific resolution that are not taken care of my rules or bulk
AND (ct.resolution IS NULL OR ct.resolution = 'misclassified') 
AND ct.customer_id = {}
ORDER BY started_at;
''')

In [15]:
# Run the n/a report once per customer and stack the per-customer results.
# `cust` is formatted into both customer_id placeholders of the query; it comes from
# the local config (cust_list), not from user input.
na_report_frames = []
for cust in important_customers:
    customer_name = important_customers[cust]
    #Returning the data in pandas to export it as a CSV.
    data = pd.read_sql(company_na_reports.format(cust,cust),conn)
    if data.shape[0] != 0:
        # Keep only the calendar date and tag every row with the customer name.
        data['started_at'] = data['started_at'].dt.date
        data['customer_name']= customer_name
        na_report_frames.append(data)
    else:
        print('No data available in your time frame for',customer_name)
    print(customer_name, 'query and data clean completed')
    # Pause between customers (presumably to limit load on the database).
    time.sleep(15)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call,
# so the frames are concatenated once. pd.concat raises on an empty list, hence the
# fallback to an empty frame when no customer returned rows.
if na_report_frames:
    master_data = pd.concat(na_report_frames,sort=False)
else:
    master_data = pd.DataFrame()

CVS query and data clean completed
Dicks Sporting Goods query and data clean completed
No data available in your time frame for Container Store
Container Store query and data clean completed
No data available in your time frame for Paiges
Paiges query and data clean completed
Burlington query and data clean completed


In [16]:
# Sanity check: (rows, columns) of the combined n/a report.
master_data.shape

(218812, 9)

In [17]:
# Preview the first rows of the combined n/a report.
master_data.head()

Unnamed: 0,curation_task_id,customer_id,resolution,started_at,product_id,attribute_name,Total_NAs,curation_tasks_count,customer_name
0,82296338,55,,2018-10-09,225605278,color,0,1,CVS
1,82296337,55,,2018-10-09,225603309,color,0,1,CVS
2,82296335,55,,2018-10-09,225604748,color,0,1,CVS
3,82296334,55,,2018-10-09,40684651,color,0,1,CVS
4,82296333,55,,2018-10-09,40689265,color,0,1,CVS


In [18]:
# Write the combined n/a report only when at least one row was returned.
if master_data.shape[0] != 0:
    # to_csv does not create missing directories, so make sure ./data exists first.
    os.makedirs('./data', exist_ok=True)
    master_data.to_csv('./data/NA_report.csv',index=False)

#### Export query to CSV. 
The CSV files are written to a `data` folder next to this notebook, so that folder must exist before a file is exported. The output location could be made configurable.

In [74]:
# SQL for the raw attribute-value extract: one row per curated_product_fields row
# joined to its curation task, for tasks started in the last 3 months, newest first.
# There is no customer filter and no placeholder in this query.
query = (
"""SELECT    
    ct.id as curation_tasks_id,
    ct.customer_id,
    ct.product_id,
    cpf.name, 
    cpf.value as attribute_values,
    ct.resolution,
    ct.type,  
    ct.started_at
FROM curation_tasks as ct

INNER JOIN curated_product_fields AS cpf
ON ct.id = cpf.curation_task_id

WHERE started_at >= CURRENT_TIMESTAMP - INTERVAL '3 month'
ORDER BY started_at DESC
""")

In [75]:
# Load the 3-month attribute-value extract into a DataFrame.
na_values = pd.read_sql(sql=query, con=conn)

In [76]:
# Sanity check: (rows, columns) of the attribute-value extract.
na_values.shape

(968081, 8)

In [77]:
# Flag attribute values that are exactly the string 'n/a' (boolean column).
na_values['is_na'] = na_values['attribute_values'].eq('n/a')

In [78]:
# Turn the is_na flag into 0/1 integers by multiplying by 1.
na_values['is_na'] = na_values['is_na'].mul(1)

In [79]:
# to_csv does not create missing directories, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
# NOTE(review): unlike the other exports in this notebook, the DataFrame index is
# written as the first (unnamed) column here; confirm consumers expect that.
na_values.to_csv("./data/na_values.csv")

# Querying n/a Product data

In [9]:
# SQL template for the product extract: products with active = 't' for one customer,
# joined through products_buckets to buckets, ordered by product id.
# The single `{}` placeholder is filled with the customer id via str.format.
query_product = (
'''SELECT
    p.customer_id,
    pb.bucket_id,
    b.name as bucket_name,
    p.id as product_id, 
    p.name as product_name,
    active, 
    p.external_id,
    p.image_url
FROM "products" AS p  

INNER JOIN products_buckets as pb
ON p.id = pb.product_id
---------------change to strategy buckets
INNER JOIN buckets as b
ON pb.bucket_id = b.id
Where active = 't'
AND p.customer_id  = {}
ORDER BY p.id
'''
)

In [10]:
# Run the product query once per customer and stack the per-customer results.
# `cust` is formatted into the customer_id placeholder of the query; it comes from
# the local config (cust_list), not from user input.
product_frames = []
for cust in important_customers:
    customer_name = important_customers[cust]
    #Returning the data in pandas to export it as a CSV.
    data_prod = pd.read_sql(query_product.format(cust),conn)
    if data_prod.shape[0] == 0:
        # The product query has no time filter, so the message no longer mentions one.
        print('No product data available for',customer_name)
    else:
        product_frames.append(data_prod)
    print(customer_name, 'query and data clean completed')
    # Pause between customers (presumably to limit load on the database).
    time.sleep(15)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call,
# so the frames are concatenated once. pd.concat raises on an empty list, hence the
# fallback to an empty frame when no customer returned rows.
if product_frames:
    master_data_prod = pd.concat(product_frames,sort=False)
else:
    master_data_prod = pd.DataFrame()

CVS query and data clean completed
Dicks Sporting Goods query and data clean completed
Container Store query and data clean completed
Paiges query and data clean completed
Burlington query and data clean completed


In [12]:
# Write the combined product extract only when at least one row was returned.
if master_data_prod.shape[0] != 0:
    # to_csv does not create missing directories, so make sure ./data exists first.
    os.makedirs('./data', exist_ok=True)
    master_data_prod.to_csv('./data/products.csv',index=False)

In [15]:
# SQL for the strategy-bucket extract: strategy_buckets joined to
# strategy_buckets_attributes, keeping only attribute rows whose status is 'OPT_IN'.
# There is no customer filter and no placeholder in this query.
strat_buckets = ("""SELECT
    sb.id AS strategy_bucket_id, 
    bucket_id, 
    sb.active, 
    sb.status as sbucket_status ,
    sba.status as IO_status,
    sba.attribute_id
FROM "public"."strategy_buckets" as sb

INNER JOIN strategy_buckets_attributes as sba
ON sb.id = sba.strategy_bucket_id
WHERE sba.status = 'OPT_IN'""")

In [16]:
# Load the opted-in strategy bucket attributes into a DataFrame.
sb = pd.read_sql(sql=strat_buckets, con=conn)

In [17]:
# to_csv does not create missing directories, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
sb.to_csv('./data/strat_buck.csv',index=False)