## BigQuery upload with Polars

### Environment settings

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import gspread
import duckdb
# Import authenticator and gspread to manage g-sheets
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery
import connectorx as cx

In [2]:
# OAuth scopes requested for the Google Sheets / Drive service-account login
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/analytics.readonly',
]

# Service-account key file used by both clients below
api = '../APIS/gepp-538-db.json'

# Google Sheets: build oauth2client credentials and authorise gspread with them
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes)
gc = gspread.authorize(gs_credentials)

# BigQuery: build google-auth credentials and a client bound to the project
bq_credentials = service_account.Credentials.from_service_account_file(api)
project_id = 'gepp-538'
client = bigquery.Client(project=project_id, credentials=bq_credentials)

### DuckDB

In [27]:
# Detailed users extract: semicolon-delimited CSV loaded into a Polars DataFrame
mpe_users = pl.read_csv(
    '../usuarios.csv',
    separator=';',
    ignore_errors=True,
)

In [28]:
# Run a SQL query with DuckDB directly against a file on disk
# (DuckDB can read CSV, Parquet and JSON files this way)
duckdb.sql(
    '''
    SELECT
        Origen, COUNT(Nombre) AS usuarios
    FROM '../usuarios.csv'
    GROUP BY Origen
    ORDER BY usuarios DESC
    '''
).pl() # convert the query result to a Polars DataFrame
# Other output formats include: .df() for pandas, .arrow() for Arrow, and .fetchnumpy() for NumPy arrays

Origen,usuarios
str,i64
"""WA""",765173
"""Ecommerce""",97913


**Create files from DuckDB queries**
* `duckdb.sql('SELECT 42').write_parquet('out.parquet')` — write to a Parquet file
* `duckdb.sql('SELECT 42').write_csv('out.csv')` — write to a CSV file
* `duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")` — copy to a Parquet file

### MPE

In [3]:
def mpe_usuarios(url, sheet_name='Cantidades'):
    """Load the MPE user-count sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Cantidades'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores,
        ``usuarios_con_compra`` renamed to ``usuarios_compra`` and the text
        column ``fecha`` parsed to ``pl.Datetime``.
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Shorten the verbose header
    df = df.rename({'usuarios_con_compra': 'usuarios_compra'})
    # strict=False: values that cannot be parsed become null instead of raising
    df = df.with_columns(pl.col('fecha').str.strptime(pl.Datetime, strict=False))
    return df

In [4]:
def mpe_detalle_usuarios(url, sheet_name='Detalle Usuarios'):
    """Load the MPE user-detail sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Detalle Usuarios'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores, ``teléfono``
        renamed to ``telefono`` and cast to string, the ``apellidos`` column
        removed and the text column ``fecha_alta`` parsed to ``pl.Datetime``.
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Strip the accent from the phone header
    df = df.rename({'teléfono': 'telefono'})
    # Remove the surnames column (pl.exclude is a no-op if it is absent)
    df = df.select(pl.exclude('apellidos'))
    # Store phone numbers as text
    df = df.with_columns(pl.col('telefono').cast(pl.Utf8))
    # strict=False: values that cannot be parsed become null instead of raising
    df = df.with_columns(pl.col('fecha_alta').str.strptime(pl.Datetime, strict=False))
    return df

In [5]:
def mpe_pedidos_gral(url, sheet_name='Pedidos General'):
    """Load the MPE general-orders sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Pedidos General'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores and then
        shortened / de-accented, ``nombre_completo`` removed, the three
        ``fecha_*`` text columns parsed as ``%d-%m-%Y`` dates and
        ``telefono_contacto`` cast to string.
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Shorten headers and strip accents
    df = df.rename(
        {
            'origen_de_orden':'origen_orden',
            'cantidad_de_productos':'productos',
            'total_de_orden':'total_orden',
            'fecha_de_pedido':'fecha_pedido',
            'hora_de_pedido':'hora_pedido',
            'fecha_modificación':'fecha_modificacion',
            'hora_modificación':'hora_modificacion',
            'ruta_de_entrega':'ruta_entrega',
            'teléfono_de_contacto':'telefono_contacto',
            'alias_de_dirección':'alias_direccion',
            'dirección_completa':'direccion',
        }
    )
    # Remove the customer's full name (pl.exclude is a no-op if it is absent)
    df = df.select(pl.exclude('nombre_completo'))
    # Parse day-month-year text into dates; strict=False nulls unparseable values
    df = df.with_columns(
        pl.col(['fecha_pedido','fecha_modificacion','fecha_entrega'])
        .str.strptime(pl.Date, format="%d-%m-%Y", strict=False)
    )
    # Store phone numbers as text; strict=False nulls values that cannot be cast
    df = df.with_columns(
        pl.col('telefono_contacto').cast(pl.Utf8, strict=False)
    )
    return df

In [6]:
def mpe_pedidos_detalle(url, sheet_name='Pedidos Detalle'):
    """Load the MPE order-detail sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Pedidos Detalle'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores and then
        shortened / de-accented; no dtype conversion is applied.
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Shorten headers and strip accents / punctuation
    df = df.rename(
        {
            'clave_de_producto':'clave_producto',
            'clave_de_promoción':'clave_promocion',
            'producto(s)':'productos',
            'precio_por_unidad':'precio_unidad',
            'descuento_por_unidad':'descuento_unidad',
        }
    )
    return df

In [7]:
# MPE user counts (default sheet 'Cantidades')
df_mpe_usuarios = mpe_usuarios(url='../mpe-usuarios.xlsx')

In [8]:
# MPE user details (default sheet 'Detalle Usuarios')
df_mpe_detalle_usuarios = mpe_detalle_usuarios(url='../mpe-usuarios.xlsx')

In [67]:
# MPE general orders: the join and upload cells further down read
# `df_mpe_pedidos_gral`, so it must be built here or they fail with NameError
# on a fresh kernel
df_mpe_pedidos_gral = mpe_pedidos_gral('../mpe-pedidos.xlsx')
# General MPE export read straight from CSV
mpe_gral = pl.read_csv('../MPE_gral.csv', ignore_errors=True)

In [69]:
# Parse the 'Fecha Alta' text column into dates; strict=False yields null
# for values that cannot be parsed
mpe_gral = mpe_gral.with_columns(
    pl.col('Fecha Alta').str.strptime(pl.Date, strict=False)
)

In [72]:
# Normalise headers (lower case, spaces -> underscores) with a plain rename
# mapping rather than the deprecated ``Expr.map_alias``
mpe_gral = mpe_gral.rename(
    {name: name.lower().replace(' ', '_') for name in mpe_gral.columns}
)

In [77]:
# Give the normalised headers clean ASCII names.
# NOTE(review): the U+FFFD characters in the keys presumably mirror how the
# accented CSV headers were decoded on read — confirm against mpe_gral.columns.
mpe_gral = mpe_gral.rename({
    'fecha_modificaci�n':'fecha_modificacion',
    'hora_modificaci�n':'hora_modificacion',
    'tel�fono_de_contacto':'tel_contacto',
    'alias_de_direcci�n':'alias_dir',
    'direcci�n_completa':'dir_completa',
    # target used to be 'num_ mes' (stray space inside the column name)
    'num._mes':'num_mes',
    'a�o':'year',
    'mes|a�o':'month',
    'id|_cedis':'id_cedis',
    'regi�n':'region'
})

In [10]:
# MPE order details (default sheet 'Pedidos Detalle')
df_mpe_pedidos_detalle = mpe_pedidos_detalle(url='../mpe-pedidos.xlsx')

In [11]:
# SQL that pulls the whole MPE catalogue table
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo`
'''
# Run it through pandas' BigQuery reader, then hand the result to Polars
df_mpe_catalogo = pl.from_pandas(
    pd.read_gbq(query, credentials=bq_credentials)
)

In [12]:
# Left-join the general orders with their detail lines (key: no_orden)
df_mpe_ventas = df_mpe_pedidos_gral.join(
    df_mpe_pedidos_detalle,
    on='no_orden',
    how='left',
)
# Left-join the general orders with the catalogue (key: id_bodega)
df_mpe_bodegas = df_mpe_pedidos_gral.join(
    df_mpe_catalogo,
    on='id_bodega',
    how='left',
)

In [13]:
# Cast the two columns to Float64; strict=False turns values that cannot be
# converted into null instead of raising
df_mpe_ventas = df_mpe_ventas.with_columns(
    [
        pl.col(name).cast(pl.Float64, strict=False)
        for name in ('clave_producto', 'cantidad')
    ]
)

### GETM

In [14]:
def getm_pedidos_gral(url, sheet_name='Pedidos General'):
    """Load the GETM general-orders sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Pedidos General'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores and then
        shortened / de-accented, ``nombre_del_cliente_registrado`` removed,
        the five ``fecha_*`` text columns parsed as ``%d/%m/%Y`` into
        nanosecond ``pl.Datetime`` and ``cancelado`` mapped to a boolean
        (``'Cancelado'`` -> True, ``'No Cancelado'`` -> False, else null).
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Shorten headers and strip accents
    df = df.rename(
        {
            'nombre_de_tienda': 'nombre_tienda',
            'ruta_de_desarrollo':'ruta_desarrollo',
            'fecha_de_creación':'fecha_creacion',
            'hora_de_creación':'hora_creacion',
            'fecha_de_modificación':'fecha_modificacion',
            'hora_de_modificación':'hora_modificacion',
            'total_de_productos':'total_productos',
            'fecha_de_despacho':'fecha_despacho',
            'hora_de_despacho':'hora_despacho',
            'fecha_de_entrega':'fecha_entrega',
            'hora_de_entrega':'hora_entrega',
        }
    )
    # Remove the customer's name (pl.exclude is a no-op if it is absent)
    df = df.select(pl.exclude('nombre_del_cliente_registrado'))
    # Parse day/month/year text; strict=False nulls unparseable values.
    # NOTE(review): the cast to 'ns' is presumably for the later pandas
    # conversion — confirm.
    date_cols = ['fecha_creacion','fecha_modificacion','fecha_despacho','fecha_entrega','fecha_cancelado']
    df = df.with_columns(
        pl.col(date_cols)
        .str.strptime(pl.Datetime, format="%d/%m/%Y", strict=False)
        .cast(pl.Datetime)
        .dt.cast_time_unit('ns')
    )
    # Map the status text to a boolean. Written with when/then/otherwise
    # instead of the deprecated ``Expr.map_dict``; any value other than the
    # two known labels (including null) becomes null, as before.
    estado = pl.col('cancelado')
    df = df.with_columns(
        pl.when(estado == 'Cancelado').then(True)
        .when(estado == 'No Cancelado').then(False)
        .otherwise(None)
        .alias('cancelado')
    )
    return df

In [15]:
def getm_pedidos_detalle(url, sheet_name='Pedidos Detalle'):
    """Load the GETM order-detail sheet of an Excel workbook as a Polars DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, handed to ``pd.read_excel``.
    sheet_name : worksheet to read (default ``'Pedidos Detalle'``).

    Returns
    -------
    pl.DataFrame
        Headers lower-cased with spaces replaced by underscores and then
        shortened / de-accented, and ``cancelado`` mapped to a boolean
        (``'Cancelado'`` -> True, ``'No Cancelado'`` -> False, else null).
    """
    # pandas does the Excel parsing; Polars takes over from there
    df = pl.from_pandas(pd.read_excel(url, sheet_name=sheet_name))
    # Normalise headers with a plain rename mapping rather than the deprecated
    # ``Expr.map_alias`` so the function does not depend on the Polars version
    df = df.rename({name: name.lower().replace(' ', '_') for name in df.columns})
    # Shorten headers and strip accents / punctuation
    df = df.rename(
        {
            'clave_de_producto':'clave_producto',
            'clave_de_promoción':'clave_promocion',
            'tipo_de_promoción':'tipo_promocion',
            'descripción_de_promoción':'descripcion_promocion',
            'producto(s)':'productos',
            'precio_por_unidad':'precio_unidad',
            'descuento_por_unidad':'descuento_unidad',
        }
    )
    # Map the status text to a boolean. Written with when/then/otherwise
    # instead of the deprecated ``Expr.map_dict``; any value other than the
    # two known labels (including null) becomes null, as before.
    estado = pl.col('cancelado')
    df = df.with_columns(
        pl.when(estado == 'Cancelado').then(True)
        .when(estado == 'No Cancelado').then(False)
        .otherwise(None)
        .alias('cancelado')
    )
    return df

In [16]:
# Open the GETM workbook by its spreadsheet key
sheet_id = '1STkVkyDdCjEtTAgMOWbptf8oRqUBMPRV1ufi-fuY8xk'
workbook = gc.open_by_key(sheet_id)
# Pull every cell of the 'Push Notifications' worksheet (selected by name)
getm_values_push = workbook.worksheet('Push Notifications').get_all_values()
# First row is the header, the remaining rows are data; then drop the
# columns that are not needed downstream
getm_push = pd.DataFrame(
    getm_values_push[1:],
    columns=getm_values_push[0],
).drop(
    columns=['Año','Mes','Proyecto','Publicación','Caracteres','Espacios','Tipo','ID'],
)

In [17]:
# GETM push notifications clean-up
# Lower-case the headers first so the steps below can use the lower-case names
getm_push.columns = getm_push.columns.str.lower()
getm_push = (
    getm_push
    .assign(
        # parse each value as a day-first date
        fecha=lambda frame: frame['fecha'].apply(pd.to_datetime, dayfirst=True),
        # strip thousands separators (",") and convert to numbers
        clientes=lambda frame: frame['clientes'].str.replace(',', '', regex=True).apply(pd.to_numeric),
    )
    # singular / de-accented column names
    .rename(columns={'categorias':'categoria','área':'area'})
    # discard rows without a date
    .dropna(subset=['fecha'])
)

In [18]:
# GETM general orders (default sheet 'Pedidos General')
df_getm_pedidos_gral = getm_pedidos_gral(url='../getm.xlsx')

In [19]:
# GETM order details (default sheet 'Pedidos Detalle')
df_getm_pedidos_detalle = getm_pedidos_detalle(url='../getm.xlsx')

### Upload to BigQuery

**MPE**

In [None]:
#mpe_users = mpe_users.to_pandas()
#mpe_users.to_gbq('gepp-538.transformation.mpe_users',
#                    project_id='gepp-538',
#                    if_exists='replace',
#                    credentials=bq_credentials)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 5282.50it/s]


In [79]:
#mpe_gral = mpe_gral.to_pandas()
#mpe_gral.to_gbq('gepp-538.transformation.mpe_nvo',
#                    project_id='gepp-538',
#                    if_exists='replace',
#                    credentials=bq_credentials)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 5412.01it/s]


In [20]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_usuarios = df_mpe_usuarios.to_pandas()
df_mpe_usuarios.to_gbq(
    destination_table='gepp-538.transformation.mpe_usuarios',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 4369.07it/s]


In [21]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_detalle_usuarios = df_mpe_detalle_usuarios.to_pandas()
df_mpe_detalle_usuarios.to_gbq(
    destination_table='gepp-538.transformation.mpe_usuarios_detalles',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2906.66it/s]


In [22]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_pedidos_gral = df_mpe_pedidos_gral.to_pandas()
df_mpe_pedidos_gral.to_gbq(
    destination_table='gepp-538.transformation.mpe_general',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7194.35it/s]


In [23]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_pedidos_detalle = df_mpe_pedidos_detalle.to_pandas()
df_mpe_pedidos_detalle.to_gbq(
    destination_table='gepp-538.transformation.mpe_detalle',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 8355.19it/s]


In [24]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_ventas = df_mpe_ventas.to_pandas()
df_mpe_ventas.to_gbq(
    destination_table='gepp-538.transformation.mpe_ventas',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 6978.88it/s]


In [25]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_mpe_bodegas = df_mpe_bodegas.to_pandas()
df_mpe_bodegas.to_gbq(
    destination_table='gepp-538.transformation.mpe_bodegas',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 14122.24it/s]


**GETM**

In [26]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_getm_pedidos_gral = df_getm_pedidos_gral.to_pandas()
df_getm_pedidos_gral.to_gbq(
    destination_table='gepp-538.transformation.getm_general',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7244.05it/s]


In [27]:
# to_gbq is a pandas method, so convert the Polars frame first, then append
# its rows to the BigQuery table
df_getm_pedidos_detalle = df_getm_pedidos_detalle.to_pandas()
df_getm_pedidos_detalle.to_gbq(
    destination_table='gepp-538.transformation.getm_detalle',
    project_id='gepp-538',
    if_exists='append',
    credentials=bq_credentials,
)

100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 12985.46it/s]


In [28]:
# getm_push is already a pandas frame; this upload replaces the table
# contents rather than appending
getm_push.to_gbq(
    destination_table='gepp-538.transformation.getm_push_notifications',
    project_id='gepp-538',
    if_exists='replace',
    credentials=bq_credentials,
)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 6754.11it/s]


### Retrieve datasets from BigQuery

In [None]:
# SQL that pulls the whole MPE catalogue table
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo`
'''
# Run the query and keep the result as a pandas DataFrame
mpe_catalogo = pd.read_gbq(
    query,
    credentials=bq_credentials,
)

## Contact

<!-- Avatar -->
<img src="../Pictures/profile2.png" alt="me" width="75" height="80">
<!-- Text with color, font, fontsize and specific size -->
<p style="color:#323232; font-family: Helvetica; font-size: 20px;">Jesus L. Monroy<br>Economist | Data Scientist</p>
<!-- Insert url links in logos -->
<!-- Telegram -->
<a href="https://t.me/j3suslm" target="_blank" rel="noreferrer"> <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Telegram_X_2019_Logo.svg/2048px-Telegram_X_2019_Logo.png?size=16&color=3b3b3b" alt="telegram" width="30" height="22" style="padding-left:8px"/></a>
<!-- Twitter -->
<a href="https://www.twitter.com/sqlalchemist" target="_blank" rel="noreferrer"> <img src="https://toppng.com/public/uploads/preview/twitter-x-new-logo-round-icon-png-11692480241tdbz6jparr.webp?size=16&color=3b3b3b" alt="twitter" width="30" height="22" style="padding-left:8px"/></a>
<!-- Github -->
<a href="https://github.com/SqlAlchemist/My-portfolio" target="_blank" rel="noreferrer"> <img src="https://icongr.am/devicon/github-original.svg?size=16&color=3b3b3b" alt="github" width="30" height="30" style="padding-left:8px"/></a>
<!-- Linkedin -->
<a href="https://www.linkedin.com/in/j3sus-lmonroy" target="_blank" rel="noreferrer"> <img src="https://icongr.am/simple/linkedin.svg?size=16&color=3b3b3b" alt="linkedin" width="30" height="30" style="padding-left:8px"/></a>
<!-- Medium -->
<a href="https://medium.com/@jesus_lmonroy" target="_blank" rel="noreferrer"> <img src="https://cdn1.iconfinder.com/data/icons/social-media-and-logos-12/32/Logo_medium-512.png?size=55&color=3b3b3b" alt="medium" width="30" height="33" style="padding-left:8px"/></a>