# Pull query results from Postgres and export them as CSV files

In [14]:
import pandas as pd

#Get the previous date
from datetime import date, timedelta, datetime

#Interacting with Postgres SQL
import psycopg2

import time

#importing database
from database_config import postgres 
import database_config

# Establishing Connection to Postgres via psycopg2

In [15]:
# Open the psycopg2 connection to PostgreSQL. The connection keyword arguments
# are unpacked from the `postgres` mapping imported from database_config.
# NOTE(review): the original comment tagged this connection "feeddate" --
# presumably the target database; confirm against database_config.
conn = psycopg2.connect(**postgres)

In [16]:
# Today's local date as an ISO 'YYYY-MM-DD' string; it stamps the run-log entry
# written at the end of the notebook.
todays_date = date.today().isoformat()

In [17]:
# Customer id -> customer display name for the customers covered by the reports.
# Insertion order is the order in which the query loops visit the customers.
cust_list = {
    78: 'Paiges',
    77: 'Container Store',
    71: 'Burlington',
    55: 'CVS',
}

In [18]:
# Alias, not a copy: important_customers and cust_list refer to the same dict,
# so mutating one mutates the other. The query loops iterate over this name.
important_customers = cust_list

## Query

In [19]:
# SQL template for the per-customer NA report. It contains two positional `{}`
# placeholders (the subquery filter and the outer filter); the query loop fills
# both with the same customer id via str.format.
# Shape of the query:
#   - subquery: groups curated_product_fields by task / customer / product /
#     attribute name, counting rows whose value is 'n/a' ("Total_NAs") and all
#     rows (curation_tasks_count);
#   - outer query: joins those aggregates to curation_tasks and keeps tasks
#     started within the last 14 days whose resolution is NULL or
#     'misclassified', ordered by start day.
# NOTE(review): the id is interpolated into the SQL text rather than bound as a
# query parameter; acceptable only while ids come from the hard-coded dict.
company_na_reports = (  
'''SELECT 
    ct."id" AS curation_task_id, 
    ct.customer_id, 
    ct.resolution, 
    --- truncated the timestamp from the date for aggregation.
    date_trunc('day',ct.started_at) as "started_at",
    cpf.product_id,
    cpf."name" AS attribute_name, 
    cpf."Total_NAs", 
    cpf.curation_tasks_count
FROM "curation_tasks" AS ct

INNER JOIN 
    (SELECT 
        cpf.curation_task_id,
        cpf.customer_id,
        cpf.product_id,
        cpf.name, 
        SUM(CASE WHEN cpf.value = 'n/a' THEN 1 ELSE 0 END) as "Total_NAs",
        COUNT(cpf.id) AS curation_tasks_count
    FROM "public"."curated_product_fields"  as cpf
    WHERE customer_id = {}
    GROUP BY
        cpf.curation_task_id,    
        cpf.customer_id,
        cpf.product_id,
        cpf.name
    ) AS cpf
ON ct.id = cpf.curation_task_id

WHERE started_at >= CURRENT_TIMESTAMP - INTERVAL '14 days'
--- specific resolution that are not taken care of my rules or bulk
AND (ct.resolution IS NULL OR ct.resolution = 'misclassified') 
AND ct.customer_id = {}
ORDER BY started_at;
''')

In [20]:
# Run the NA report once per customer and stack the results into master_data.
# The per-customer frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside the loop re-copied every accumulated row on each iteration.
na_frames = []
for cust in important_customers:
    # Both `{}` placeholders in the SQL template take the same customer id.
    data = pd.read_sql(company_na_reports.format(cust, cust), conn)
    if data.shape[0] != 0:
        # Drop the time-of-day part so the column holds plain calendar dates.
        data['started_at'] = data['started_at'].dt.date
        data['customer_name'] = important_customers[cust]
    else:
        print('No data available in your time frame for Cust_id ',cust)
    na_frames.append(data)
    print(cust, 'query and data clean completed')

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when there were no customers to query. Row indexes are kept as returned by
# each query, as they were with append.
master_data = pd.concat(na_frames, sort=False) if na_frames else pd.DataFrame()

No data available in your time frame for Cust_id  78
78 query and data clean completed
77 query and data clean completed


KeyboardInterrupt: 

In [7]:
# Display (row count, column count) of the combined NA report.
master_data.shape

(207980, 9)

In [8]:
# Preview the first rows of the combined NA report.
master_data.head()

Unnamed: 0,curation_task_id,customer_id,resolution,started_at,product_id,attribute_name,Total_NAs,curation_tasks_count,customer_name
0,98770317,55,,2018-10-23,225592236,gender,0,1,CVS
1,98770317,55,,2018-10-23,225592237,gender,0,1,CVS
2,98770317,55,,2018-10-23,225592238,gender,0,1,CVS
3,98770812,55,,2018-10-23,225583547,gender,0,1,CVS
4,98770812,55,,2018-10-23,225583548,gender,0,1,CVS


In [9]:
# Write the NA report to disk only when at least one row was collected; an
# empty result leaves any existing CSV untouched.
if len(master_data) > 0:
    master_data.to_csv('./data/NA_report.csv', index=False)

# Querying n/a Product data

In [10]:
# SQL template listing the active products of one customer; the single `{}`
# placeholder is filled with the customer id via str.format in the query loop.
# Join chain: products -> products_buckets -> buckets, then strategy_buckets ->
# strategy_buckets_attributes; only rows with p.active = 't' are kept and the
# result is ordered by product id.
# NOTE(review): strategy_buckets is joined with `pb.bucket_id = sb.id`, i.e. a
# bucket id is compared with strategy_buckets.id -- confirm this is the intended
# key rather than a bucket reference column on strategy_buckets.
query_product = (
'''SELECT
    p.customer_id,
    pb.bucket_id,
    b.name as bucket_name,
    p.id as product_id, 
    p.name as product_name,
    p.active, 
    p.external_id,
    p.image_url,
    sba.id AS strategy_bucket_attribute_id, 
    sba.strategy_bucket_id, 
    sba.attribute_id, 
    sba.family_friendly
FROM "products" AS p  
INNER JOIN (SELECT product_id, bucket_id FROM products_buckets) as pb
ON p.id = pb.product_id
INNER JOIN (SELECT id, name FROM buckets) as b
ON pb.bucket_id = b.id
INNER JOIN (SELECT id FROM strategy_buckets) as sb
ON pb.bucket_id = sb.id
INNER JOIN (SELECT id,attribute_id, strategy_bucket_id,family_friendly FROM strategy_buckets_attributes) as sba
ON sba.strategy_bucket_id = sb.id
Where p.active = 't'
AND p.customer_id  = {}
ORDER BY p.id
'''
)

In [11]:
# Run the product query once per customer and stack the results into
# master_data_prod. Frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside the loop re-copied every accumulated row on each iteration.
product_frames = []
for cust in important_customers:
    data_prod = pd.read_sql(query_product.format(cust), conn)
    if data_prod.shape[0] == 0:
        print('No data_prod available in your time frame for Cust_ID ',cust)
    product_frames.append(data_prod)
    print('Cust_id ',cust, 'Product query and data clean completed')
    # Pause between customers -- presumably to space out these large queries on
    # the database; TODO confirm the delay is still needed.
    time.sleep(10)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when there were no customers to query.
master_data_prod = (
    pd.concat(product_frames, sort=False) if product_frames else pd.DataFrame()
)

Cust_id  55 Product query and data clean completed
Cust_id  75 Product query and data clean completed
Cust_id  77 Product query and data clean completed
Cust_id  78 Product query and data clean completed
Cust_id  71 Product query and data clean completed


In [25]:
# Display the customer ids in the order the query loops visit them.
list(important_customers)

[78, 77, 71, 55]

In [12]:
# Write the product table to disk only when at least one row was collected; an
# empty result leaves any existing CSV untouched.
if len(master_data_prod) > 0:
    master_data_prod.to_csv('./data/products.csv', index=False)

In [13]:
# Display the row counts of the NA report and the product table side by side.
master_data.shape[0],master_data_prod.shape[0]

(207980, 2764546)

In [14]:
# Prepend a one-line status entry for today's run to the report log, newest
# entry first. The run counts as successful only when both the NA report and
# the product table contain at least one row.
log_path = 'G:/My Drive/Projects/NA_Reports/NA_Report_log.txt'

if (master_data.shape[0] > 0) and (master_data_prod.shape[0] > 0):
    status_line = todays_date+' data update has been confirmed.\n'
else:
    status_line = todays_date+' an error has occured and no data has been updated.\n'

# Read the existing log first. Opening with 'r+' raised FileNotFoundError on
# the very first run, so a missing file is treated as an empty log instead.
try:
    with open(log_path,'r',encoding='utf-8') as f:
        previous_contents = f.read()
except FileNotFoundError:
    previous_contents = ''

# Rewrite the file with the new entry on top, followed by the old contents.
with open(log_path,'w',encoding='utf-8') as f:
    f.write(status_line)
    f.write(previous_contents)