# Pull query results from Postgres and save them as CSV files

In [6]:
import pandas as pd

#Get the previous date
from datetime import date, timedelta, datetime

#Interacting with Postgres SQL
import psycopg2

import time

#importing database
from database_config import postgres as cfg
from database_config import cust_list

# Establishing Connection to Postgres via psycopg2

In [7]:
# Open the PostgreSQL connection; credentials and host come from the
# `postgres` mapping in database_config, the port is fixed at 5432.
conn = psycopg2.connect(
    database=cfg['database'],
    user=cfg['user'],
    password=cfg['password'],
    host=cfg['host'],
    port="5432",
)

# Run date as an ISO string (YYYY-MM-DD); it prefixes the entry written to the
# report log at the end of the notebook.
todays_date = date.today().isoformat()

In [8]:
# SQL text: one row per curated product field (attribute name + value) joined
# to its product's name and description, restricted to active products and
# capped at 1,000,000 rows by the LIMIT clause.
# NOTE(review): the cell that runs this query rebinds `prod_vals` to the
# resulting DataFrame, so that cell cannot be re-run without re-running this
# one first.
prod_vals = ("""SELECT 
    cpf.product_id, 
    cpf.name as attribute_name, 
    cpf.value,
    p.name, 
    p.description
FROM "public"."curated_product_fields" as cpf
INNER JOIN products as p
ON cpf.product_id = p.id
WHERE p.active = 't'
LIMIT 1000000;""")

In [9]:
# Run the product-values query and keep the result as a DataFrame.
# From here on `prod_vals` is the query result, no longer the SQL text.
prod_vals = pd.read_sql(sql=prod_vals, con=conn)

In [10]:
# Persist the product values. The DataFrame index is written as the first
# column here (default `index=True`).
# NOTE(review): the file name is spelled "vaules"; left unchanged because
# other tooling may read this exact path.
prod_vals.to_csv(path_or_buf='./data/product_vaules.csv')

In [3]:
# Load the customer lookup table, indexed by customer_id.
# This rebinds `cust_list`, replacing the object of the same name imported
# from database_config at the top of the notebook.
cust_list = pd.read_csv(
    './data/customer_name_list.csv',
    index_col='customer_id',
)

In [4]:
# Mapping of customer_id (the frame's index) -> customer_name; iterating it
# yields the customer ids that the report loops query for.
important_customers = cust_list['customer_name'].to_dict()

## Query

In [5]:
# SQL template for the weekly n/a report of a single customer.
#   * The inner query groups curated_product_fields by curation task,
#     customer, product and attribute name, counting how many values equal
#     'n/a' ("Total_NAs") and how many field rows exist in the group
#     (curation_tasks_count).
#   * The outer query joins those aggregates to curation_tasks and keeps tasks
#     started within the last week whose resolution is NULL or 'misclassified'.
#   * Both `{}` placeholders are filled with the same customer id via
#     str.format() by the loop that runs this query.
company_na_reports = (  
'''SELECT 
    ct."id" AS curation_task_id, 
    ct.customer_id, 
    ct.resolution, 
    ct.started_at,
    cpf.product_id,
    cpf."name" AS attribute_name, 
    cpf."Total_NAs", 
    cpf.curation_tasks_count
FROM "curation_tasks" AS ct

INNER JOIN 
    (SELECT 
        cpf.curation_task_id,
        cpf.customer_id,
        cpf.product_id,
        cpf.name, 
        SUM(CASE WHEN cpf.value = 'n/a' THEN 1 ELSE 0 END) as "Total_NAs",
        COUNT(cpf.id) AS curation_tasks_count
    FROM "public"."curated_product_fields"  as cpf
    WHERE customer_id = {}
    GROUP BY
        cpf.curation_task_id,    
        cpf.customer_id,
        cpf.product_id,
        cpf.name
    ) AS cpf
ON ct.id = cpf.curation_task_id

WHERE started_at >= CURRENT_TIMESTAMP - INTERVAL '1 week'
--- specific resolution that are not taken care of my rules or bulk
AND (ct.resolution IS NULL OR ct.resolution = 'misclassified') 
AND ct.customer_id = {}
ORDER BY started_at;
''')

In [6]:
# Build the combined weekly n/a report: run the report query once per customer
# and stack the results.
#
# `DataFrame.append` was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration, so the per-customer frames are gathered in a list
# and concatenated once after the loop. If the loop is interrupted, the frames
# fetched so far are still available in `customer_frames`.
customer_frames = []
for cust in important_customers:
    # Returning the data in pandas to export it as a CSV.
    # NOTE(review): the customer id is interpolated into the SQL text with
    # str.format(); it comes from the local customer CSV, not from user input.
    data = pd.read_sql(company_na_reports.format(cust, cust), conn)
    if data.shape[0] != 0:
        # Keep only the calendar date of the task start timestamp.
        data['started_at'] = data['started_at'].dt.date
        data['customer_name'] = important_customers[cust]
    else:
        print('No data available in your time frame for Cust_id ',cust)
    # Empty results are kept too, so the combined frame still carries the
    # query's columns when no customer has data.
    customer_frames.append(data)
    print(cust, 'query and data clean completed')
    # Pause between customers to space out the queries against the database.
    time.sleep(15)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no customers at all.
master_data = pd.concat(customer_frames, sort=False) if customer_frames else pd.DataFrame()

No data available in your time frame for Cust_id  4
4 query and data clean completed
No data available in your time frame for Cust_id  6
6 query and data clean completed
No data available in your time frame for Cust_id  7
7 query and data clean completed
No data available in your time frame for Cust_id  8
8 query and data clean completed
No data available in your time frame for Cust_id  9
9 query and data clean completed
No data available in your time frame for Cust_id  10
10 query and data clean completed


KeyboardInterrupt: 

In [None]:
# Display the (rows, columns) of the combined n/a report as a sanity check.
master_data.shape

In [None]:
# Preview the first rows of the combined n/a report.
master_data.head()

In [None]:
# Write the weekly n/a report only when at least one row was collected, so an
# empty run does not overwrite the previous export.
if len(master_data) > 0:
    master_data.to_csv(
        './data/Weekly_NA_report.csv',
        index=False,
    )

# Querying n/a Product data

In [None]:
# SQL template listing one customer's active products together with the
# bucket(s) each product belongs to (products -> products_buckets -> buckets),
# ordered by product id. The single `{}` placeholder is filled with the
# customer id via str.format() by the loop that runs this query.
query_product = (
'''SELECT
    p.customer_id,
    pb.bucket_id,
    b.name as bucket_name,
    p.id as product_id, 
    p.name as product_name,
    active, 
    p.external_id,
    p.image_url,
    p.family_id
FROM "products" AS p  

INNER JOIN products_buckets as pb
ON p.id = pb.product_id
INNER JOIN buckets as b
ON pb.bucket_id = b.id
Where active = 't'
AND p.customer_id  = {}
ORDER BY p.id
'''
)

In [None]:
# Build the combined product listing: run the product query once per customer
# and stack the results.
#
# `DataFrame.append` was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration, so the per-customer frames are gathered in a list
# and concatenated once after the loop. If the loop is interrupted, the frames
# fetched so far are still available in `product_frames`.
product_frames = []
for cust in important_customers:
    # Returning the data in pandas to export it as a CSV.
    # NOTE(review): the customer id is interpolated into the SQL text with
    # str.format(); it comes from the local customer CSV, not from user input.
    data_prod = pd.read_sql(query_product.format(cust), conn)
    if data_prod.shape[0] == 0:
        print('No data_prod available in your time frame for Cust_ID ',cust)
    # Empty results are kept too, so the combined frame still carries the
    # query's columns when no customer has products.
    product_frames.append(data_prod)
    print('Cust_id ',cust, 'Product query and data clean completed')
    # Pause between customers to space out the queries against the database.
    time.sleep(10)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no customers at all.
master_data_prod = pd.concat(product_frames, sort=False) if product_frames else pd.DataFrame()

In [None]:
# Write the product listing only when at least one row was collected, so an
# empty run does not overwrite the previous export.
if len(master_data_prod) > 0:
    master_data_prod.to_csv(
        './data/Weekly_products.csv',
        index=False,
    )

In [46]:
# Display the row counts of both combined frames (n/a report, product listing).
master_data.shape[0],master_data_prod.shape[0]

(337666, 1014679)

In [31]:
# Prepend a dated status line to the report log: a confirmation when both
# combined frames contain rows, an error note otherwise. The file is opened in
# 'r+' mode, so it must already exist; the old contents are read, the cursor
# is rewound, and the new line is written ahead of them.
both_have_rows = (master_data.shape[0] > 0) and (master_data_prod.shape[0] > 0)
if both_have_rows:
    status_line = todays_date+' data update has been confirmed.\n'
else:
    status_line = todays_date+' an error has occured and no data has been updated.\n'

with open('G:/My Drive/Projects/NA_Reports/NA_Report_log.txt','r+',encoding='utf-8') as f:
    previous_contents = f.read()
    f.seek(0, 0)
    f.write(status_line)
    f.write(previous_contents)