## Big Query MPE back office

### Environment settings

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import gspread
import duckdb
# Import authenticator and gspread to manage g-sheets
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery
import connectorx as cx

In [2]:
# Service-account key file used for both the Sheets and the BigQuery clients.
api = '../APIS/gepp-538-db.json'
# Google Cloud project the BigQuery client is created for.
project_id = 'gepp-538'
# OAuth scopes requested for the Sheets credentials (Sheets, Drive, Analytics read-only).
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/analytics.readonly',
]

# Google Sheets: build scoped credentials from the key file and authorise gspread.
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes)
gc = gspread.authorize(gs_credentials)

# BigQuery: build credentials from the same key file and create the client.
bq_credentials = service_account.Credentials.from_service_account_file(api)
client = bigquery.Client(
    credentials=bq_credentials,
    project=project_id,
)

### Duckdb

In [27]:
# Detailed users export: semicolon-separated CSV loaded into a Polars DataFrame.
mpe_users = pl.read_csv(
    '../usuarios.csv',
    separator=';',
    ignore_errors=True,
)

In [28]:
# Run a SQL query with DuckDB directly against a file on disk
# (DuckDB can read CSV, Parquet and JSON files by path).
# The query counts the non-null `Nombre` values per `Origen`, largest group first.
duckdb.sql(
    '''
    SELECT
        Origen, COUNT(Nombre) AS usuarios
    FROM '../usuarios.csv'
    GROUP BY Origen
    ORDER BY usuarios DESC
    '''
).pl() # convert the query result to a Polars DataFrame
# Other output formats include: .df() for pandas, .arrow() for Arrow, and .fetchnumpy() for NumPy arrays

Origen,usuarios
str,i64
"""WA""",765173
"""Ecommerce""",97913


**create files from duckdb queries**
* duckdb.sql('SELECT 42').write_parquet('out.parquet') # Write to a Parquet file
* duckdb.sql('SELECT 42').write_csv('out.csv')         # Write to a CSV file
* duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")      # Copy to a parquet file

### MPE

In [47]:
def mpe_usuarios(file):
    """Load the 'Cantidades' sheet of an MPE users workbook.

    Column names are lower-cased and their spaces replaced by underscores.
    The ``fecha`` column is then parsed from text into ``pl.Datetime`` without
    an explicit format; parsing is non-strict (``strict=False``).

    Parameters
    ----------
    file
        Path of the Excel workbook, forwarded to ``pl.read_excel``.

    Returns
    -------
    The resulting Polars DataFrame.
    """
    def to_snake(col_name):
        # 'Some Column' -> 'some_column'
        return col_name.lower().replace(' ', '_')

    cantidades = pl.read_excel(file, sheet_name='Cantidades')
    normalized = cantidades.select(pl.col('*').map_alias(to_snake))
    return normalized.with_columns(
        pl.col('fecha').str.strptime(pl.Datetime, strict=False)
    )

In [83]:
def mpe_pedidos(file):
    """Load the 'Pedidos General' sheet of an MPE orders workbook.

    Steps, in order:
      1. read the sheet with ``infer_schema_length=0`` passed to the CSV reader,
         so column types are set by the explicit casts below rather than inferred;
      2. lower-case the column names and replace spaces with underscores;
      3. rename the accented / long column names to short ASCII ones;
      4. cast the numeric columns and parse the date columns
         (format ``%d-%m-%Y``, non-strict);
      5. drop the ``nombre_completo`` column.

    Parameters
    ----------
    file
        Path of the Excel workbook, forwarded to ``pl.read_excel``.

    Returns
    -------
    The resulting Polars DataFrame.
    """
    def to_snake(col_name):
        return col_name.lower().replace(' ', '_')

    ascii_names = {
        'fecha_modificación': 'fecha_modificacion',
        'hora_modificación': 'hora_modificacion',
        'teléfono_de_contacto': 'tel_contacto',
        'alias_de_dirección': 'alias_dir',
        'dirección_completa': 'dir_completa',
    }
    # Target dtype per numeric column.
    numeric_dtypes = {
        'id_bodega': pl.Int64,
        'cantidad_de_productos': pl.Int64,
        'total_de_orden': pl.Float64,
        'promociones': pl.Int64,
        'ruta_de_entrega': pl.Float64,
        'nud': pl.Float64,
    }
    date_columns = ['fecha_de_pedido', 'fecha_modificacion', 'fecha_entrega']

    pedidos = pl.read_excel(
        file,
        sheet_name='Pedidos General',
        read_csv_options={'infer_schema_length': 0},
    )
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in numeric_dtypes.items()]
    date_exprs = [
        pl.col(name).str.strptime(pl.Datetime, format='%d-%m-%Y', strict=False)
        for name in date_columns
    ]
    return (
        pedidos
        .select(pl.col('*').map_alias(to_snake))
        .rename(ascii_names)
        .with_columns(*cast_exprs, *date_exprs)
        .drop('nombre_completo')
    )

In [11]:
def mpe_detalle(file):
    """Load the 'Pedidos Detalle' sheet of an MPE orders workbook.

    The sheet is read with ``infer_schema_length=0`` passed to the CSV reader,
    column names are lower-cased with spaces replaced by underscores, two
    columns are renamed to ASCII-only names, and the quantity / price /
    discount columns are cast to numeric dtypes.

    Parameters
    ----------
    file
        Path of the Excel workbook, forwarded to ``pl.read_excel``.

    Returns
    -------
    The resulting Polars DataFrame.
    """
    def to_snake(col_name):
        return col_name.lower().replace(' ', '_')

    ascii_names = {
        'clave_de_promoción': 'clave_de_promocion',
        'producto(s)': 'productos',
    }
    # Target dtype per numeric column.
    numeric_dtypes = {
        'cantidad': pl.Int64,
        'precio_por_unidad': pl.Float64,
        'descuento_por_unidad': pl.Float64,
        'precio_acumulado': pl.Float64,
        'descuento_acumulado': pl.Float64,
    }

    detalle = pl.read_excel(
        file,
        sheet_name='Pedidos Detalle',
        read_csv_options={'infer_schema_length': 0},
    )
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in numeric_dtypes.items()]
    return (
        detalle
        .select(pl.col('*').map_alias(to_snake))
        .rename(ascii_names)
        .with_columns(*cast_exprs)
    )

In [84]:
# Open the MPE warehouses spreadsheet by its key.
sheet_id = '1RdEyiPZULVGwy274uPsB6NKxg6MtWCw9rHwcuDyCwyw'
workbook = gc.open_by_key(sheet_id)
# Fetch every cell of the 'warehouses' tab (selected by name) as a list of rows.
bodegas_rows = workbook.worksheet('warehouses').get_all_values()
# The first row holds the headers; the remaining rows are the records.
bodegas_pd = pd.DataFrame(bodegas_rows[1:], columns=bodegas_rows[0])
# Move to Polars: normalise column names, cast the warehouse key to an integer,
# rename it to `id_bodega`, and drop the columns that are not needed.
mpe_bodegas = (
    pl.from_pandas(bodegas_pd)
    .select(pl.col('*').map_alias(lambda col_name: col_name.lower().replace(' ', '_')))
    .with_columns(pl.col('clave_bodega').cast(pl.Int64))
    .rename({'clave_bodega': 'id_bodega'})
    .drop(['clave_bodega_destino', 'tipo_preventa_tradicional', 'clave_bodega_origen'])
)

In [49]:
# Load the MPE users workbook.
# NOTE: this rebinds the name `mpe_usuarios` from the loader function to its
# result, so the cell cannot be re-run unless the cell defining the function
# is executed again first.
mpe_usuarios = mpe_usuarios('~/Downloads/mpe-usuarios.xlsx')

In [13]:
# Load the order-detail sheet of the MPE orders workbook.
# NOTE: this rebinds the name `mpe_detalle` from the loader function to its
# result, so the cell cannot be re-run unless the cell defining the function
# is executed again first.
mpe_detalle = mpe_detalle('~/Downloads/mpe-pedidos.xlsx')

In [85]:
# Load the general orders sheet and left-join the warehouse attributes on
# `id_bodega` (every order row is kept, matched or not).
# NOTE: this rebinds the name `mpe_pedidos` from the loader function to its
# result, so the cell cannot be re-run unless the cell defining the function
# is executed again first.
mpe_pedidos = mpe_pedidos('~/Downloads/mpe-pedidos.xlsx').join(mpe_bodegas, on='id_bodega', how='left')

### GETM

In [144]:
# Pedidos General
def getm_pedidos(file):
    """Load the 'Pedidos General' sheet of a GETM orders workbook.

    Steps, in order:
      1. read the sheet from ``file`` with ``infer_schema_length=0`` passed to
         the CSV reader, so column types are set by the explicit casts below
         rather than inferred;
      2. lower-case the column names and replace spaces with underscores;
      3. rename the accented column names to ASCII ones;
      4. cast the numeric columns and parse the date columns
         (format ``%d-%m-%Y``, non-strict);
      5. drop the ``nombre_del_cliente_registrado`` column.

    Parameters
    ----------
    file
        Path of the Excel workbook, forwarded to ``pl.read_excel``.

    Returns
    -------
    The resulting Polars DataFrame.
    """
    def to_snake(col_name):
        return col_name.lower().replace(' ', '_')

    ascii_names = {
        'fecha_de_creación': 'fecha_de_creacion',
        'hora_de_creación': 'hora_de_creacion',
        'fecha_de_modificación': 'fecha_de_modificacion',
        'hora_de_modificación': 'hora_de_modificacion',
    }
    # Target dtype per numeric column.
    numeric_dtypes = {
        'id_bodega': pl.Int64,
        'nud': pl.Int64,
        'rutappp': pl.Int64,
        'ruta_de_desarrollo': pl.Int64,
        'rutavpp': pl.Int64,
        'monto_total': pl.Float64,
        'total_de_productos': pl.Int64,
        'promociones': pl.Int64,
    }
    date_columns = [
        'fecha_de_creacion',
        'fecha_de_modificacion',
        'fecha_de_despacho',
        'fecha_de_entrega',
        'fecha_cancelado',
    ]

    # Read the workbook the caller asked for; the path used to be hard-coded
    # here, which silently ignored the `file` argument.
    pedidos = pl.read_excel(
        file,
        sheet_name='Pedidos General',
        read_csv_options={'infer_schema_length': 0},
    )
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in numeric_dtypes.items()]
    date_exprs = [
        pl.col(name).str.strptime(pl.Datetime, format='%d-%m-%Y', strict=False)
        for name in date_columns
    ]
    return (
        pedidos
        .select(pl.col('*').map_alias(to_snake))
        .rename(ascii_names)
        .with_columns(*cast_exprs, *date_exprs)
        .drop('nombre_del_cliente_registrado')
    )

In [168]:
# Pedidos Detalle
def getm_detalle(file):
    """Load the 'Pedidos Detalle' sheet of a GETM orders workbook.

    The sheet is read with ``infer_schema_length=0`` passed to the CSV reader,
    column names are lower-cased with spaces replaced by underscores, four
    columns are renamed to ASCII-only names, and the id / quantity / price /
    discount columns are cast to numeric dtypes.

    Parameters
    ----------
    file
        Path of the Excel workbook, forwarded to ``pl.read_excel``.

    Returns
    -------
    The resulting Polars DataFrame.
    """
    def to_snake(col_name):
        return col_name.lower().replace(' ', '_')

    ascii_names = {
        'clave_de_promoción': 'clave_de_promocion',
        'tipo_de_promoción': 'tipo_de_promocion',
        'descripción_de_promoción': 'descripcion_de_promocion',
        'producto(s)': 'productos',
    }
    # Target dtype per numeric column.
    numeric_dtypes = {
        'id_bodega': pl.Int64,
        'nud': pl.Int64,
        'cantidad': pl.Float64,
        'precio_por_unidad': pl.Float64,
        'descuento_por_unidad': pl.Float64,
        'precio_acumulado': pl.Float64,
        'descuento_acumulado': pl.Float64,
    }

    detalle = pl.read_excel(
        file,
        sheet_name='Pedidos Detalle',
        read_csv_options={'infer_schema_length': 0},
    )
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in numeric_dtypes.items()]
    return (
        detalle
        .select(pl.col('*').map_alias(to_snake))
        .rename(ascii_names)
        .with_columns(*cast_exprs)
    )

In [145]:
# Load the general orders sheet of the GETM orders workbook.
# NOTE: this rebinds the name `getm_pedidos` from the loader function to its
# result, so the cell cannot be re-run unless the cell defining the function
# is executed again first.
getm_pedidos = getm_pedidos('~/Downloads/getm-pedidos.xlsx')

In [169]:
# Load the order-detail sheet of the GETM orders workbook.
# NOTE: this rebinds the name `getm_detalle` from the loader function to its
# result, so the cell cannot be re-run unless the cell defining the function
# is executed again first.
getm_detalle = getm_detalle('~/Downloads/getm-pedidos.xlsx')

In [172]:
# Open the GETM push-notifications spreadsheet by its key.
sheet_id = '1STkVkyDdCjEtTAgMOWbptf8oRqUBMPRV1ufi-fuY8xk'
workbook = gc.open_by_key(sheet_id)
# Fetch every cell of the 'Push Notifications' tab (selected by name) as a list of rows.
getm_values_push = workbook.worksheet('Push Notifications').get_all_values()
# First row is the header, the rest are records; drop the columns that are not needed.
getm_push = pd.DataFrame(
    getm_values_push[1:],
    columns=getm_values_push[0],
).drop(
    columns=['Año','Mes','Proyecto','Publicación','Caracteres','Espacios','Tipo','ID'],
)

In [173]:
# GETM push notifications: tidy the raw sheet data in one chained pass.
getm_push = (
    getm_push
    # lower-case every column name
    .rename(columns=str.lower)
    .assign(
        # parse each date value individually, reading the day first
        fecha=lambda frame: frame['fecha'].apply(pd.to_datetime, dayfirst=True),
        # strip the thousands separator "," and convert to numbers
        clientes=lambda frame: frame['clientes'].str.replace(',', '', regex=True).apply(pd.to_numeric),
    )
    # shorter, ASCII-only column names
    .rename(columns={'categorias':'categoria','área':'area'})
    # discard rows that have no date
    .dropna(subset=['fecha'])
)

### Upload to Big Query

### MPE

In [72]:
# create dataset gbq api
#client.create_dataset('back_office')

Dataset(DatasetReference('gepp-538', 'back_office'))

In [50]:
# Convert to pandas (to_gbq is called on the pandas frame) and upload to the
# BigQuery table back_office.mpe_usuarios.
# NOTE: if_exists='append' adds to the existing table, so every full run of
# the notebook uploads these rows again.
mpe_usuarios = mpe_usuarios.to_pandas()
mpe_usuarios.to_gbq(
    destination_table='gepp-538.back_office.mpe_usuarios',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 9177.91it/s]


In [87]:
# Convert to pandas (to_gbq is called on the pandas frame) and upload to the
# BigQuery table back_office.mpe_pedidos.
# NOTE: if_exists='append' adds to the existing table, so every full run of
# the notebook uploads these rows again.
mpe_pedidos = mpe_pedidos.to_pandas()
mpe_pedidos.to_gbq(
    destination_table='gepp-538.back_office.mpe_pedidos',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2457.12it/s]


In [52]:
# Convert to pandas (to_gbq is called on the pandas frame) and upload to the
# BigQuery table back_office.mpe_detalle.
# NOTE: if_exists='append' adds to the existing table, so every full run of
# the notebook uploads these rows again.
mpe_detalle = mpe_detalle.to_pandas()
mpe_detalle.to_gbq(
    destination_table='gepp-538.back_office.mpe_detalle',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7108.99it/s]


### GETM

In [147]:
# Convert to pandas (to_gbq is called on the pandas frame) and upload to the
# BigQuery table back_office.getm_pedidos.
# NOTE: if_exists='append' adds to the existing table, so every full run of
# the notebook uploads these rows again.
getm_pedidos = getm_pedidos.to_pandas()
getm_pedidos.to_gbq(
    destination_table='gepp-538.back_office.getm_pedidos',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2716.52it/s]


In [171]:
# Convert to pandas (to_gbq is called on the pandas frame) and upload to the
# BigQuery table back_office.getm_detalle.
# NOTE: if_exists='append' adds to the existing table, so every full run of
# the notebook uploads these rows again.
getm_detalle = getm_detalle.to_pandas()
getm_detalle.to_gbq(
    destination_table='gepp-538.back_office.getm_detalle',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 8630.26it/s]


In [176]:
# Upload the push-notification frame to back_office.getm_notifications.
# if_exists='replace' overwrites the table on each run instead of appending to it.
getm_push.to_gbq(
    destination_table='gepp-538.back_office.getm_notifications',
    project_id='gepp-538',
    if_exists='replace',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7294.44it/s]


### Retrieve datasets from Big Query

In [None]:
# SQL text: select every column and row of `transformation.mpe_catalogo`
# in the gepp-538 project.
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo`
'''
# Run the query on BigQuery with the service-account credentials and load the
# result into a pandas DataFrame.
# NOTE(review): no project_id is passed here; presumably it is taken from the
# credentials — confirm.
mpe_catalogo = pd.read_gbq(query, credentials=bq_credentials)