## BigQuery upload – SQL Server

### Environment settings

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import gspread
import duckdb
# Import authenticator and gspread to manage g-sheets
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery
import connectorx as cx

In [2]:
# OAuth scopes requested for the Google Sheets service-account credentials
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/analytics.readonly',
]

# Path to the service-account key file; both credential objects below are built from it
api = '../APIS/gepp-538-db.json'

# Google Sheets: scoped credentials -> authorized gspread client
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes)
gc = gspread.authorize(gs_credentials)

# BigQuery: credentials from the same key file, then a client bound to the project
bq_credentials = service_account.Credentials.from_service_account_file(api)
project_id = 'gepp-538'
client = bigquery.Client(
    project=project_id,
    credentials=bq_credentials,
)

### DuckDB

In [27]:
# Detailed users export: semicolon-delimited CSV.
# ignore_errors=True asks polars to keep reading when it hits values it cannot parse.
mpe_users = pl.read_csv(
    '../usuarios.csv',
    ignore_errors=True,
    separator=';',
)

In [28]:
# Run SQL with DuckDB directly against a file on disk
# (DuckDB can read csv, parquet and json files by path).
# The query counts `Nombre` per `Origen`, largest group first.
users_by_origin_sql = '''
    SELECT
        Origen, COUNT(Nombre) AS usuarios
    FROM '../usuarios.csv'
    GROUP BY Origen
    ORDER BY usuarios DESC
    '''

# Convert the result to a polars DataFrame; as the last expression it is displayed.
# Other converters: .df() for pandas, .arrow() for Arrow, and .fetchnumpy() for NumPy arrays
duckdb.sql(users_by_origin_sql).pl()

Origen,usuarios
str,i64
"""WA""",765173
"""Ecommerce""",97913


**Create files from DuckDB queries**
* `duckdb.sql('SELECT 42').write_parquet('out.parquet')` — write to a Parquet file
* `duckdb.sql('SELECT 42').write_csv('out.csv')` — write to a CSV file
* `duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")` — copy to a Parquet file

### MPE

In [3]:
# `fecha_alta` arrives as text; parse it to Datetime.
# strict=False: values that cannot be parsed do not raise.
user_fecha_alta_expr = pl.col('fecha_alta').str.strptime(pl.Datetime, strict=False)

# Users table: `telefono` and `id_atg` are forced to text instead of being type-inferred
mpe_usuarios = pl.read_csv(
    '../mpe-usuarios.csv',
    dtypes={'telefono': pl.Utf8, 'id_atg': pl.Utf8},
).with_columns(user_fecha_alta_expr)

In [4]:
# Columns that hold timestamps stored as text in the CSV
address_date_columns = ('fecha_alta', 'fecha_alta_gepp', 'fecha_actualizacion_gepp')

# Addresses table: contact phone and postal code are read as text
mpe_usuarios_direcciones = pl.read_csv(
    '../mpe-direcciones.csv',
    dtypes={'telefono_contacto': pl.Utf8, 'codigo_postal': pl.Utf8},
).with_columns(
    # parse every date column to Datetime; strict=False does not raise on unparseable values
    pl.col(*address_date_columns).str.strptime(pl.Datetime, strict=False),
    # already read as Utf8 through `dtypes` above; the explicit cast is kept as-is
    pl.col('telefono_contacto').cast(pl.Utf8),
)

In [5]:
# Orders table: the two date columns are parsed from text to Datetime
# (strict=False does not raise on values that cannot be parsed)
order_date_columns = ('fecha_alta', 'fecha_entrega')
mpe_ordenes = pl.read_csv('../mpe-ordenes.csv').with_columns(
    pl.col(*order_date_columns).str.strptime(pl.Datetime, strict=False)
)

In [6]:
# Order-detail table, read with polars' default schema inference
mpe_ordenes_detalle = pl.read_csv('../mpe-ordenes-detalle.csv')

In [7]:
# Promotions table: `DescripcionCorta` is read as text and `ClavePromo` is cast to text.
# Rows are then de-duplicated on `ClavePromo`. keep='first' with maintain_order=True makes
# the surviving row deterministic (first occurrence, in file order); with the defaults
# polars may keep any of the duplicates, so the result could differ between runs.
mpe_promociones = (
    pl.read_csv('../mpe-promociones.csv', dtypes={'DescripcionCorta': pl.Utf8})
    .with_columns(pl.col('ClavePromo').cast(pl.Utf8))
    .unique(subset='ClavePromo', keep='first', maintain_order=True)
)

In [8]:
# Order-status table, read with default schema inference
mpe_ordenes_estatus = pl.read_csv(
    '../mpe-ordenes-estatus.csv'
)

In [9]:
# Only these three columns of the social-networks file are kept
social_network_columns = ('id_red_social', 'red_social', 'descripcion')
mpe_redes_sociales = pl.read_csv('../mpe-redes-sociales.csv').select(
    pl.col(*social_network_columns)
)

In [10]:
# Address-type table, read with default schema inference
mpe_tipo_direcciones = pl.read_csv(
    '../mpe-tipo-direcciones.csv'
)

In [11]:
# Gender table, read with default schema inference
mpe_genero = pl.read_csv(
    '../mpe-genero.csv'
)

In [12]:
# Warehouse catalogue, read with default schema inference
mpe_bodegas = pl.read_csv(
    '../mpe-catalogo-bodegas.csv'
)

In [13]:
# Column renames applied once all joins are done
sabana_renames = {'ID_Bodega': 'id_bodega_promo', 'Activo': 'promo_activa'}

# Wide table built from mpe_usuarios_direcciones, then mpe_ordenes, mpe_ordenes_detalle,
# mpe_promociones and mpe_bodegas. Every step is a left join, so each row of the
# frame on the left is kept even when it has no match on the right.
mpe_sabana = (
    mpe_usuarios_direcciones
    .join(mpe_ordenes, how='left', on='id_direccion')
    .join(mpe_ordenes_detalle, how='left', on='id_orden')
    .join(mpe_promociones, how='left', on='ClavePromo')
    .join(mpe_bodegas, how='left', on='id_bodega')
    .rename(sabana_renames)
)

### Upload to BigQuery

**MPE**

In [209]:
# Create the `database` dataset through the BigQuery API client.
# The call is commented out — presumably because the dataset already exists (see the
# saved output below) and creating it again would fail; verify before re-enabling.
# NOTE(review): `create_dataset(..., exists_ok=True)` would make the call safe to re-run — confirm.
#client.create_dataset('database')

Dataset(DatasetReference('gepp-538', 'database'))

In [224]:
# Convert the polars frame to pandas for the upload.
# Guarded so the cell can be re-run: after the first run `mpe_sabana` is already a
# pandas DataFrame, which has no `.to_pandas()` method, and an unconditional
# conversion would raise AttributeError.
if isinstance(mpe_sabana, pl.DataFrame):
    mpe_sabana = mpe_sabana.to_pandas()

# Upload to the `gepp-538.database.mpe` table; if_exists='replace' overwrites an existing table.
# NOTE(review): recent pandas releases deprecate `DataFrame.to_gbq` in favour of
# `pandas_gbq.to_gbq` — confirm the installed version before migrating.
mpe_sabana.to_gbq('gepp-538.database.mpe',
                  project_id='gepp-538',
                  if_exists='replace',
                  credentials=bq_credentials)

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7626.01it/s]


### Retrieve datasets from BigQuery

In [None]:
# SQL text sent to BigQuery: every column of the `mpe_catalogo` table
query = '''
    SELECT * FROM `gepp-538.transformation.mpe_catalogo`
'''
# Run the query with the service-account credentials and load the result into pandas
mpe_catalogo = pd.read_gbq(
    query,
    credentials=bq_credentials,
)