## BigQuery upload with Polars

### Environment settings

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import gspread
import duckdb
# Import authenticator and gspread to manage g-sheets
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery
import connectorx as cx

In [2]:
# OAuth scopes requested for the Sheets, Drive and Analytics (read-only) APIs
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/analytics.readonly',
]

# Service-account key file used by both clients below
api = '../APIS/gepp-538-db.json'

# Google Sheets client: oauth2client credentials built with the scopes above
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes)
gc = gspread.authorize(gs_credentials)

# BigQuery client: google-auth credentials from the same key file, bound to the project
project_id = 'gepp-538'
bq_credentials = service_account.Credentials.from_service_account_file(api)
client = bigquery.Client(project=project_id, credentials=bq_credentials)

In [3]:
def _read_getm_csv(path):
    """Read one CSV export into a Polars DataFrame, passing ignore_errors=True."""
    return pl.read_csv(path, ignore_errors=True)


# Raw exports, one Polars frame per file
pedidos = _read_getm_csv('../Datasets/getm-pedidos.csv')
pedidos_detalle = _read_getm_csv('../Datasets/getm-pedidos_detalle.csv')
promociones = _read_getm_csv('../Datasets/getm-promociones.csv')
tenderos = _read_getm_csv('../Datasets/getm-tenderos.csv')
# Note: the "detalle" frame for tenderos is read from the *_registros file
tenderos_detalle = _read_getm_csv('../Datasets/getm-tenderos_registros.csv')

In [127]:
# Columns removed from the orders frame
pedidos_dropped_cols = [
    'NoPedido', 'UserID', 'Status', 'ShipOnDate', 'FechaDespacho', 'FechaRuta',
    'ShipAdressID', 'IsNotify', 'NotifyOnDate', 'NotifyOnFileName', 'TimeZoneOffSET',
    'Notificado', 'ProspectoEnviado', 'OrdenMonitoreada', 'Quantity', 'PriceInfo',
]
# String columns parsed into pl.Datetime (non-strict parse)
pedidos_date_cols = ['CreationDate', 'LastModifiedDate', 'FechaEntregado', 'FechaCancelado']

pedidos = (
    pedidos
    .drop(pedidos_dropped_cols)
    .with_columns(pl.col(pedidos_date_cols).str.strptime(pl.Datetime, strict=False))
)

In [128]:
# Columns removed from the order-lines frame
pedidos_detalle_dropped_cols = ['DiscountType', 'Modelo']
pedidos_detalle = pedidos_detalle.drop(pedidos_detalle_dropped_cols)

In [129]:
# Columns removed from the promotions frame
promociones_dropped_cols = [
    'ID_Region', 'ID_Territorio', 'FechaVigenciaInicial', 'FechaVigenciaFinal', 'Imagen',
    'PromoColorTexto', 'PromoColorFondo', 'VigenciaColorTexto', 'VigenciaColorFondo',
    'Encabezado', 'Descripcion', 'Contenido', 'BotonColorTexto', 'BotonColorFondo',
    'Precio', 'UsuarioAlta', 'UsuarioModificacion', 'ImagenPortada', 'MaximoImpactos',
    'Clave_Externa', 'ID_Bodega', 'ID_Layout',
]
# String columns parsed into pl.Datetime (non-strict parse)
promociones_date_cols = ['FechaAlta', 'FechaModificacion']

promociones = (
    promociones
    .drop(promociones_dropped_cols)
    .with_columns(pl.col(promociones_date_cols).str.strptime(pl.Datetime, strict=False))
)

In [130]:
# Keep a single row per ClavePromo; the transactions join further down uses
# ClavePromo as its key. keep='first' pins which duplicate survives (the first
# one in the current row order) instead of leaving it to the library default,
# and maintain_order=True keeps the remaining rows in their original order.
promociones = promociones.unique(subset=['ClavePromo'], keep='first', maintain_order=True)

In [131]:
# Columns kept from the shopkeepers frame, in output order
tenderos_kept_cols = [
    'ID_Tendero', 'ID_Region', 'ID_Bodega', 'ID_Territorio', 'NUD', 'NombreTienda',
    'Telefono', 'Correo', 'Categoria', 'Segmentacion', 'Activo', 'Registrado',
    'FechaAlta', 'Genero', 'NumeroPuertas', 'RutaEcommerce', 'FechaRegistro',
]

tenderos = (
    tenderos
    .select(pl.col(tenderos_kept_cols))
    # Parse the two date columns into pl.Datetime (non-strict parse)
    .with_columns(
        pl.col('FechaAlta', 'FechaRegistro').str.strptime(pl.Datetime, strict=False)
    )
)

In [132]:
# Columns left out of the shopkeeper-registrations frame; everything else is kept.
# NOTE(review): Telefono, Correo, Activo and FechaRegistro are also selected from
# `tenderos`, so presumably they are excluded here to avoid duplicates in the join.
tenderos_detalle_excluded_cols = [
    'Responsable', 'FechaNacimiento', 'TokenPushNotification', 'DeviceID',
    'Telefono', 'Correo', 'Activo', 'FechaRegistro',
]

tenderos_detalle = (
    tenderos_detalle
    .select(pl.exclude(tenderos_detalle_excluded_cols))
    # Parse the last-connection timestamp into pl.Datetime (non-strict parse)
    .with_columns(pl.col('FechaUltimaConexion').str.strptime(pl.Datetime, strict=False))
)

In [133]:
# Attach the registration details to each shopkeeper row (left join on ID_Tendero,
# so every row of `tenderos` is kept)
tenderos = tenderos.join(
    tenderos_detalle,
    how='left',
    on='ID_Tendero',
)

In [134]:
# Left join with the order lines on the left: each row of `pedidos_detalle` gets
# the columns of its order. The result is stored back under the name `pedidos`.
pedidos = pedidos_detalle.join(
    pedidos,
    how='left',
    on='ID_Pedido',
)

In [135]:
# Transactions table: the joined orders frame with the promotion columns added
# (left join on ClavePromo)
transacciones = pedidos.join(
    promociones,
    how='left',
    on='ClavePromo',
)

### DuckDB

In [27]:
# Detailed users export: semicolon-separated CSV read with ignore_errors=True
mpe_users = pl.read_csv(
    '../usuarios.csv',
    separator=';',
    ignore_errors=True,
)

In [28]:
# DuckDB runs SQL directly against a file path (csv, parquet, json) named in FROM.
# This query counts Nombre values per Origen, largest group first.
usuarios_por_origen_sql = '''
    SELECT
        Origen, COUNT(Nombre) AS usuarios
    FROM '../usuarios.csv'
    GROUP BY Origen
    ORDER BY usuarios DESC
    '''
# .pl() returns the result as a Polars DataFrame.
# Other result formats: .df() for pandas, .arrow() for Arrow, .fetchnumpy() for NumPy arrays.
duckdb.sql(usuarios_por_origen_sql).pl()

Origen,usuarios
str,i64
"""WA""",765173
"""Ecommerce""",97913


**Create files from DuckDB queries**
* duckdb.sql('SELECT 42').write_parquet('out.parquet') # Write to a Parquet file
* duckdb.sql('SELECT 42').write_csv('out.csv')         # Write to a CSV file
* duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")      # Copy to a parquet file

### Upload to BigQuery

**MPE**

In [33]:
# One-time setup: create the `reportes` dataset through the BigQuery client.
# The call is left commented out; the output kept below this cell shows the
# Dataset it returned when it was run.
# NOTE(review): passing exists_ok=True would presumably make the call safe to
# re-run once the dataset exists - confirm against the BigQuery client docs.
#client.create_dataset('reportes')

Dataset(DatasetReference('gepp-538', 'reportes'))

In [137]:
# Upload the shopkeepers table to BigQuery, replacing the table if it exists.
# The conversion is guarded so the cell can be re-run: after the first run
# `tenderos` is already a pandas DataFrame, and an unguarded .to_pandas() call
# would raise AttributeError on it.
if isinstance(tenderos, pl.DataFrame):
    tenderos = tenderos.to_pandas()
tenderos.to_gbq(
    'gepp-538.reportes.tenderos',
    project_id=project_id,  # defined in the credentials cell ('gepp-538')
    if_exists='replace',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 4029.11it/s]


In [139]:
# Upload the transactions table to BigQuery, replacing the table if it exists.
# The conversion is guarded so the cell can be re-run: after the first run
# `transacciones` is already a pandas DataFrame, and an unguarded .to_pandas()
# call would raise AttributeError on it.
if isinstance(transacciones, pl.DataFrame):
    transacciones = transacciones.to_pandas()
transacciones.to_gbq(
    'gepp-538.reportes.transacciones',
    project_id=project_id,  # defined in the credentials cell ('gepp-538')
    if_exists='replace',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2616.53it/s]


### Retrieve tables from BigQuery

In [None]:
# SQL text: every column of the mpe_catalogo table
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo`
'''
# Run the query through pandas and load the result as a pandas DataFrame.
# project_id is passed explicitly, as the upload cells do, instead of relying on
# pandas-gbq working out a project from the credentials or the environment.
mpe_catalogo = pd.read_gbq(query, project_id=project_id, credentials=bq_credentials)

In [11]:
# SQL text: every column of the mpe_catalogo table
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo` 
    '''
# Submit the query through the BigQuery client (API request) ...
query_job = client.query(query)
# ... wait for it to finish and get the result rows ...
rows = query_job.result()
# ... then convert the rows to Arrow and build a Polars DataFrame from that
pl_df = pl.from_arrow(
    rows.to_arrow()
)