# IAAS - Data exploration

## Environment settings

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import gspread
import duckdb
# Import authenticator and gspread to manage g-sheets
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery

In [2]:
# OAuth scopes requested for the Google Sheets / Drive service-account session
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/analytics.readonly',
]

# Service-account key file and project shared by both Google clients below
api = '../APIS/arkham-538.json'
project_id = 'arkham-538'

# Google Sheets: build oauth2client credentials, then an authorised gspread client
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes=scopes)
gc = gspread.authorize(gs_credentials)

# BigQuery: build google-auth credentials, then a client bound to the project
bq_credentials = service_account.Credentials.from_service_account_file(api)
client = bigquery.Client(project=project_id, credentials=bq_credentials)

## Create big query schema

In [5]:
# Create the `iaas` dataset through the BigQuery client API.
# The call is commented out; presumably it was run once (the recorded output shows the dataset reference)
# and then disabled so a re-run does not try to create it again.
# NOTE(review): the upload cells in "Send to Big Query" write to the `sources` dataset, not `iaas` — confirm which is intended.
#client.create_dataset('iaas')

Dataset(DatasetReference('arkham-538', 'iaas'))

## Read data sources

### Reporte Mensual HM 01-12-2022 al 10-07-2023

#### HM-1

In [3]:
# Load the HM-1 monthly report CSV into a polars DataFrame.
# ignore_errors=True: per the polars docs, parsing continues past values that fail to convert.
hm1 = pl.read_csv(
    '~/Downloads/Project IAAS/HM1.csv',
    ignore_errors=True,
)

#### HM-2

In [4]:
# Load the HM-2 monthly report CSV into a polars DataFrame.
# ignore_errors=True: per the polars docs, parsing continues past values that fail to convert.
hm2 = pl.read_csv(
    '~/Downloads/Project IAAS/HM2.csv',
    ignore_errors=True,
)

### Reporte Mensual IAAS Infeccion y Microorganismos 01-12-2022 AL 30-06-2023

In [5]:
# Load the infection / microorganism monthly report.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
im = pl.read_excel(
    '~/Downloads/Project IAAS/Reporte Mensual IAAS Infeccion y Microorganismos 01-12-2022 AL 30-06-2023.xlsx',
    read_csv_options=dict(infer_schema_length=0),
)

### Reporte Mensual IAAS Seguimiento HOSP 01-12-2022 AL 30-06-2023

In [6]:
# Load the hospital follow-up monthly report.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
sh = pl.read_excel(
    '~/Downloads/Project IAAS/Reporte Mensual IAAS Seguimiento HOSP 01-12-2022 AL 30-06-2023.xlsx',
    read_csv_options=dict(infer_schema_length=0),
)

### Reporte Mensual IAAS Seguimiento Infecciones 01-12-2022 Al 30-06-2023

In [7]:
# Load the infection follow-up monthly report.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
si = pl.read_excel(
    '~/Downloads/Project IAAS/Reporte Mensual IAAS Seguimiento Infecciones 01-12-2022 Al 30-06-2023.xlsx',
    read_csv_options=dict(infer_schema_length=0),
)

### Reporte Mensual InformacionGeneral - 2023 10072023

#### InfoGral

In [8]:
# Load the `InfoGral` sheet of the general-information workbook.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
ig = pl.read_excel(
    '~/Downloads/Project IAAS/Reporte Mensual InformacionGeneral - 2023 10072023.xlsx',
    sheet_name='InfoGral',
    read_csv_options=dict(infer_schema_length=0),
)

#### InfoGralReg

In [9]:
# Load the `InfoGralReg` sheet of the general-information workbook.
# infer_schema_length=0 turns off dtype inference; the preview in "SQL queries to files" shows every column as str.
igr = pl.read_excel(
    '~/Downloads/Project IAAS/Reporte Mensual InformacionGeneral - 2023 10072023.xlsx',
    sheet_name='InfoGralReg',
    read_csv_options=dict(infer_schema_length=0),
)

### Antibioticos Relevantes

In [115]:
# Load the `MxRel` sheet of the relevant-antibiotics workbook.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
mxrel = pl.read_excel(
    '~/Downloads/Project IAAS/AntibioticosRelevantes.xlsx',
    sheet_name='MxRel',
    read_csv_options=dict(infer_schema_length=0),
)

In [11]:
# Load the `CLSI` sheet of the relevant-antibiotics workbook.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
clsi = pl.read_excel(
    '~/Downloads/Project IAAS/AntibioticosRelevantes.xlsx',
    sheet_name='CLSI',
    read_csv_options=dict(infer_schema_length=0),
)

In [12]:
# Load the `Med` sheet of the relevant-antibiotics workbook.
# infer_schema_length=0 turns off dtype inference (per the polars docs, columns then load as strings).
med = pl.read_excel(
    '~/Downloads/Project IAAS/AntibioticosRelevantes.xlsx',
    sheet_name='Med',
    read_csv_options=dict(infer_schema_length=0),
)

### Catalogos

In [13]:
# Load the `cat_edad` catalogue sheet with dtype inference turned off.
cat_edad = pl.read_excel(
    '~/Downloads/Project IAAS/Catalogos.xlsx',
    sheet_name='cat_edad',
    read_csv_options=dict(infer_schema_length=0),
)

In [14]:
# Load the `cat_egreso` catalogue sheet with dtype inference turned off.
cat_egreso = pl.read_excel(
    '~/Downloads/Project IAAS/Catalogos.xlsx',
    sheet_name='cat_egreso',
    read_csv_options=dict(infer_schema_length=0),
)

In [15]:
# Load the `calendario` catalogue sheet with dtype inference turned off.
calendario = pl.read_excel(
    '~/Downloads/Project IAAS/Catalogos.xlsx',
    sheet_name='calendario',
    read_csv_options=dict(infer_schema_length=0),
)

## SQL queries to files

In [16]:
# Run SQL with DuckDB. `igr` in the FROM clause refers to the dataframe variable loaded above;
# DuckDB can also read files directly (csv, parquet, json).
#
# Result converters:
#   .pl()         -> polars DataFrame (used here)
#   .df()         -> pandas DataFrame
#   .arrow()      -> Arrow table
#   .fetchnumpy() -> NumPy arrays
#
# These notes are comments rather than a trailing string literal so that the
# query result stays the last expression of the cell and is what gets displayed.
duckdb.sql(
    '''
    SELECT * 
    FROM igr
    LIMIT 5
    '''
).pl()

cvePresupuestal,feRegistro,desc_delegacion,desc_unidad,cveAno,cveMes,divEgresosPeditria,divDiasPeditria,divEgresosCirugia,divDiasCirugia,divEgresosGineco,divDiasGineco,divEgresosMedInterna,divDiasMedInterna,divTotalEgresos,divTotalDiasEstancia,UCIEgresosCuneroPato,UCIDiasCuneroPato,UCIEgresosQuemados,UCIDiasQuemados,UCIEgresosTerapiaIntermedia,UCIDiasTerapiaIntermedia,UCIEgresosAdultos,UCIDiasAdultos,UCIEgresosCoronarios,UCIDiasCoronarios,UCIEgresosNeonatales,UCIDiasNeonatales,UCIEgresosPediatrico,UCIDiasPediatrico,UCIEgresosPostQx,UCIDiasPostQx,UCIEgresosPostQxPediatrico,UCIDiasPostQxPediatrico,UCIEgresosRespiratorio,UCIDiasRespiratorio,UCITotalEgresos,…,diasEstancia_duplicated_19,cveServicio_duplicated_20,descServicio_duplicated_20,numEgresos_duplicated_20,diasEstancia_duplicated_20,cveServicio_duplicated_21,descServicio_duplicated_21,numEgresos_duplicated_21,diasEstancia_duplicated_21,cveServicio_duplicated_22,descServicio_duplicated_22,numEgresos_duplicated_22,diasEstancia_duplicated_22,cveServicio_duplicated_23,descServicio_duplicated_23,numEgresos_duplicated_23,diasEstancia_duplicated_23,cveServicio_duplicated_24,descServicio_duplicated_24,numEgresos_duplicated_24,diasEstancia_duplicated_24,cveServicio_duplicated_25,descServicio_duplicated_25,numEgresos_duplicated_25,diasEstancia_duplicated_25,cveServicio_duplicated_26,descServicio_duplicated_26,numEgresos_duplicated_26,diasEstancia_duplicated_26,cveServicio_duplicated_27,descServicio_duplicated_27,numEgresos_duplicated_27,diasEstancia_duplicated_27,cveServicio_duplicated_28,descServicio_duplicated_28,numEgresos_duplicated_28,diasEstancia_duplicated_28
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""010101012151""","""2023-06-05 09:…","""Aguascalientes…","""HGZ 1 AGUASCAL…","""2023""","""5""","""197""","""599""","""372""","""2016""","""261""","""400""","""366""","""2357""","""1196""","""5372""","""0""","""0""","""0""","""0""","""0""","""0""","""18""","""41""","""0""","""0""","""0""","""0""","""7""","""45""","""0""","""0""","""0""","""0""","""0""","""0""","""25""",…,"""33""","""51""","""ONCOLOGIA QUIR…","""18""","""40""","""61""","""CUNERO PATOLOG…","""5""","""64""","""79""","""ALOJAMIENTO CO…","""135""","""248""",,,,,,,,,,,,,,,,,,,,,,,,
"""010101012151""","""2023-05-30 12:…","""Aguascalientes…","""HGZ 1 AGUASCAL…","""2023""","""4""","""205""","""439""","""313""","""1887""","""249""","""407""","""386""","""2290""","""1153""","""5023""","""15""","""29""","""0""","""0""","""0""","""0""","""18""","""91""","""0""","""0""","""0""","""4""","""2""","""7""","""0""","""0""","""0""","""0""","""0""","""0""","""35""",…,"""27""","""46""","""CIRUGIA PLASTI…","""2""","""6""","""51""","""ONCOLOGIA QUIR…","""15""","""33""","""61""","""CUNERO PATOLOG…","""15""","""29""","""79""","""ALOJAMIENTO CO…","""124""","""173""",,,,,,,,,,,,,,,,,,,,
"""010101012151""","""2023-05-04 15:…","""Aguascalientes…","""HGZ 1 AGUASCAL…","""2023""","""3""","""177""","""624""","""285""","""1820""","""222""","""378""","""319""","""1817""","""1003""","""4639""","""0""","""0""","""0""","""0""","""0""","""0""","""19""","""98""","""0""","""0""","""0""","""0""","""2""","""53""","""0""","""0""","""0""","""0""","""0""","""0""","""21""",…,"""96""","""79""","""ALOJAMIENTO CO…","""109""","""155""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""010101012151""","""2023-03-28 12:…","""Aguascalientes…","""HGZ 1 AGUASCAL…","""2023""","""2""","""167""","""635""","""416""","""2140""","""243""","""345""","""394""","""1834""","""1220""","""4954""","""0""","""0""","""0""","""0""","""0""","""0""","""17""","""75""","""0""","""0""","""0""","""0""","""9""","""66""","""0""","""0""","""0""","""0""","""0""","""0""","""26""",…,"""23""","""51""","""ONCOLOGIA QUIR…","""15""","""57""","""61""","""CUNERO PATOLOG…","""6""","""89""","""79""","""ALOJAMIENTO CO…","""88""","""166""",,,,,,,,,,,,,,,,,,,,,,,,
"""010101012151""","""2023-03-07 12:…","""Aguascalientes…","""HGZ 1 AGUASCAL…","""2023""","""1""","""289""","""1667""","""230""","""385""","""345""","""1917""","""206""","""645""","""1070""","""4614""","""0""","""0""","""0""","""0""","""0""","""0""","""31""","""123""","""0""","""0""","""0""","""0""","""14""","""126""","""0""","""0""","""0""","""0""","""0""","""0""","""45""",…,"""16""","""51""","""ONCOLOGIA QUIR…","""7""","""51""","""61""","""CUNERO PATOLOG…","""15""","""120""","""79""","""ALOJAMIENTO CO…","""103""","""145""",,,,,,,,,,,,,,,,,,,,,,,,


## Send to Big Query

**Tables to migrate to Big Query**:

* hm1: Reporte Mensual HM1 
* hm2: Reporte Mensual HM2

* im:  Reporte mensual Infeccion Microorganismos
* sh:  Reporte Mensual Seguimiento Hospital
* si:  Reporte Mensual Seguimiento Infecciones
* ig:  Reporte Mensual Informacion General (sheet InfoGral)
* igr: Reporte Mensual Informacion General (sheet InfoGralReg)
* mxrel
* clsi
* med
* cat_edad
* cat_egreso
* calendario

In [17]:
# Upload HM-1 to BigQuery, replacing the destination table if it already exists.
# Convert only while `hm1` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(hm1, pl.DataFrame):
    hm1 = hm1.to_pandas()
hm1.to_gbq('arkham-538.sources.hm1_mensual',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  hm1.to_gbq('arkham-538.sources.hm1_mensual',
1it [03:15, 196.00s/it]


In [18]:
# Upload HM-2 to BigQuery, replacing the destination table if it already exists.
# Convert only while `hm2` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(hm2, pl.DataFrame):
    hm2 = hm2.to_pandas()
hm2.to_gbq('arkham-538.sources.hm2_mensual',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  hm2.to_gbq('arkham-538.sources.hm2_mensual',
1it [03:13, 193.86s/it]


In [19]:
# Upload the infection / microorganism report to BigQuery (table replaced if present).
# Convert only while `im` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(im, pl.DataFrame):
    im = im.to_pandas()
im.to_gbq('arkham-538.sources.infeccion_microorganismos',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  im.to_gbq('arkham-538.sources.infeccion_microorganismos',
1it [00:40, 40.16s/it]


In [20]:
# Upload the hospital follow-up report to BigQuery (table replaced if present).
# Convert only while `sh` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(sh, pl.DataFrame):
    sh = sh.to_pandas()
sh.to_gbq('arkham-538.sources.seg_hosp',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  sh.to_gbq('arkham-538.sources.seg_hosp',
1it [00:53, 53.23s/it]


In [21]:
# Upload the infection follow-up report to BigQuery (table replaced if present).
# Convert only while `si` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(si, pl.DataFrame):
    si = si.to_pandas()
si.to_gbq('arkham-538.sources.seg_infecciones',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  si.to_gbq('arkham-538.sources.seg_infecciones',
1it [00:42, 42.77s/it]


In [22]:
# Upload the InfoGral sheet to BigQuery (table replaced if present).
# Convert only while `ig` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(ig, pl.DataFrame):
    ig = ig.to_pandas()
ig.to_gbq('arkham-538.sources.info_gral',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  ig.to_gbq('arkham-538.sources.info_gral',
1it [00:09,  9.48s/it]


In [23]:
# Upload the InfoGralReg sheet to BigQuery (table replaced if present).
# Convert only while `igr` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(igr, pl.DataFrame):
    igr = igr.to_pandas()
igr.to_gbq('arkham-538.sources.info_gral_reg',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  igr.to_gbq('arkham-538.sources.info_gral_reg',
1it [00:04,  4.94s/it]


In [27]:
# Upload the MxRel sheet to BigQuery (table replaced if present).
# Convert only while `mxrel` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
# NOTE(review): the destination table is `xmlr` while the dataframe is `mxrel` — possibly a
# transposition typo; confirm against downstream queries before renaming.
if isinstance(mxrel, pl.DataFrame):
    mxrel = mxrel.to_pandas()
mxrel.to_gbq('arkham-538.sources.xmlr',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  mxrel.to_gbq('arkham-538.sources.xmlr',
1it [00:04,  4.72s/it]


In [25]:
# Upload the CLSI sheet to BigQuery (table replaced if present).
# Convert only while `clsi` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(clsi, pl.DataFrame):
    clsi = clsi.to_pandas()
clsi.to_gbq('arkham-538.sources.clsi',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  clsi.to_gbq('arkham-538.sources.clsi',
1it [00:03,  3.40s/it]


In [28]:
# Upload the Med sheet to BigQuery (table replaced if present).
# Convert only while `med` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(med, pl.DataFrame):
    med = med.to_pandas()
med.to_gbq('arkham-538.sources.med',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  med.to_gbq('arkham-538.sources.med',
1it [00:05,  5.55s/it]


In [29]:
# Upload the cat_edad catalogue to BigQuery (table replaced if present).
# Convert only while `cat_edad` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(cat_edad, pl.DataFrame):
    cat_edad = cat_edad.to_pandas()
cat_edad.to_gbq('arkham-538.sources.cat_edad',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  cat_edad.to_gbq('arkham-538.sources.cat_edad',
1it [00:02,  2.89s/it]


In [30]:
# Upload the cat_egreso catalogue to BigQuery (table replaced if present).
# Convert only while `cat_egreso` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(cat_egreso, pl.DataFrame):
    cat_egreso = cat_egreso.to_pandas()
cat_egreso.to_gbq('arkham-538.sources.cat_egreso',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  cat_egreso.to_gbq('arkham-538.sources.cat_egreso',
1it [00:02,  2.99s/it]


In [31]:
# Upload the calendario catalogue to BigQuery (table replaced if present).
# Convert only while `calendario` is still a polars frame: an unconditional `.to_pandas()`
# raises AttributeError once the name is rebound to pandas (cell re-run).
if isinstance(calendario, pl.DataFrame):
    calendario = calendario.to_pandas()
calendario.to_gbq('arkham-538.sources.calendario',
                    project_id='arkham-538',
                    if_exists='replace',
                    credentials=bq_credentials)

  calendario.to_gbq('arkham-538.sources.calendario',
1it [00:02,  2.91s/it]


## Query from Big Query

In [32]:
# SQL text for a five-row preview of the uploaded InfoGralReg table.
# Built from adjacent literals; the resulting string keeps the same
# leading/trailing newlines and indentation as a triple-quoted block.
query = (
    '\n'
    '    SELECT * FROM `arkham-538.sources.info_gral_reg`\n'
    '    LIMIT 5\n'
)

In [33]:
# Run the query on BigQuery with the service-account credentials and
# materialise the result as a pandas DataFrame
info_gral_reg = pd.read_gbq(
    query,
    credentials=bq_credentials,
)
# Last expression of the cell: rendered as the cell output
info_gral_reg

Unnamed: 0,cvePresupuestal,feRegistro,desc_delegacion,desc_unidad,cveAno,cveMes,divEgresosPeditria,divDiasPeditria,divEgresosCirugia,divDiasCirugia,...,numEgresos_duplicated_26,diasEstancia_duplicated_26,cveServicio_duplicated_27,descServicio_duplicated_27,numEgresos_duplicated_27,diasEstancia_duplicated_27,cveServicio_duplicated_28,descServicio_duplicated_28,numEgresos_duplicated_28,diasEstancia_duplicated_28
0,20501142151,2023-07-05 11:37:01.110,Baja California,HGOMF 7 TIJUANA,2023,5,341,176,0,0,...,,,,,,,,,,
1,20501142151,2023-07-04 15:45:47.817,Baja California,HGOMF 7 TIJUANA,2023,4,380,1063,0,0,...,,,,,,,,,,
2,20501142151,2023-07-04 15:15:12.840,Baja California,HGOMF 7 TIJUANA,2023,3,403,913,0,0,...,,,,,,,,,,
3,20501142151,2023-03-10 17:21:57.053,Baja California,HGOMF 7 TIJUANA,2023,2,465,1102,0,0,...,,,,,,,,,,
4,20501142151,2023-02-08 16:26:06.800,Baja California,HGOMF 7 TIJUANA,2023,1,374,861,0,0,...,,,,,,,,,,


## Ingestion to Clever Cloud

In [7]:
# Import sqlalchemy library
import os
import json
from urllib.parse import quote
import sqlalchemy
from sqlalchemy.types import (Integer, Float, Boolean, DateTime, String, Date, TIMESTAMP)

# path to credentials folder
filename = '../APIS/credentials.json'
# read json file
with open(filename) as f:
    keys = json.load(f)
# read credentials
hostname = keys['clevercloud_host']
dbname = keys['clevercloud_db']
uname = keys['clevercloud_user']
pwd = keys['clevercloud_passw']
port = keys['clevercloud_port']

# Create engine.
# User and password are percent-encoded so characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters; values without special characters
# produce exactly the same URL as plain concatenation. Formatting (instead of
# '+' concatenation) also accepts a port stored as a number in the JSON file.
db_url = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}'.format(
    user=quote(uname, safe=''),
    password=quote(pwd, safe=''),
    host=hostname,
    port=port,
    db=dbname,
)
engine = sqlalchemy.create_engine(db_url, echo=False)
# Create connection by sqlalchemy
conn = engine.connect()

In [20]:
# Drop every target table (CASCADE also removes dependent views) before re-loading.
# The statements are issued in the same order as the table list below.
for _table in ('hm1', 'hm2', 'im', 'sh', 'si', 'ig', 'igr', 'mxrel',
               'clsi', 'med', 'cat_edad', 'cat_egreso', 'calendario'):
    conn.execute('DROP TABLE IF EXISTS ' + _table + ' CASCADE;')

In [23]:
# Migrate dataframes to PostgreSQL on server.
# Convert only while `hm1` is still a polars frame: it is already pandas when the
# BigQuery upload cell ran earlier in the same kernel, and an unconditional
# `.to_pandas()` would then raise AttributeError.
if isinstance(hm1, pl.DataFrame):
    hm1 = hm1.to_pandas()
hm1.to_sql('hm1', conn, if_exists='replace', index=False);

In [25]:
# Write HM-2 to PostgreSQL. Convert only while `hm2` is still a polars frame
# (it is already pandas if the BigQuery upload cell ran in this kernel).
if isinstance(hm2, pl.DataFrame):
    hm2 = hm2.to_pandas()
hm2.to_sql('hm2', conn, if_exists='replace', index=False);

In [27]:
# Write the infection / microorganism report to PostgreSQL. Convert only while `im`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(im, pl.DataFrame):
    im = im.to_pandas()
im.to_sql('im', conn, if_exists='replace', index=False);

In [29]:
# Write the hospital follow-up report to PostgreSQL. Convert only while `sh`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(sh, pl.DataFrame):
    sh = sh.to_pandas()
sh.to_sql('sh', conn, if_exists='replace', index=False);

In [30]:
# Write the infection follow-up report to PostgreSQL. Convert only while `si`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(si, pl.DataFrame):
    si = si.to_pandas()
si.to_sql('si', conn, if_exists='replace', index=False);

In [31]:
# Write the InfoGral sheet to PostgreSQL. Convert only while `ig`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(ig, pl.DataFrame):
    ig = ig.to_pandas()
ig.to_sql('ig', conn, if_exists='replace', index=False);

In [32]:
# Write the InfoGralReg sheet to PostgreSQL. Convert only while `igr`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(igr, pl.DataFrame):
    igr = igr.to_pandas()
igr.to_sql('igr', conn, if_exists='replace', index=False);

In [33]:
# Write the MxRel sheet to PostgreSQL. Convert only while `mxrel`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(mxrel, pl.DataFrame):
    mxrel = mxrel.to_pandas()
mxrel.to_sql('mxrel', conn, if_exists='replace', index=False);

In [34]:
# Write the CLSI sheet to PostgreSQL. Convert only while `clsi`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(clsi, pl.DataFrame):
    clsi = clsi.to_pandas()
clsi.to_sql('clsi', conn, if_exists='replace', index=False);

In [35]:
# Write the Med sheet to PostgreSQL. Convert only while `med`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(med, pl.DataFrame):
    med = med.to_pandas()
med.to_sql('med', conn, if_exists='replace', index=False);

In [36]:
# Write the cat_edad catalogue to PostgreSQL. Convert only while `cat_edad`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(cat_edad, pl.DataFrame):
    cat_edad = cat_edad.to_pandas()
cat_edad.to_sql('cat_edad', conn, if_exists='replace', index=False);

In [37]:
# Write the cat_egreso catalogue to PostgreSQL. Convert only while `cat_egreso`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(cat_egreso, pl.DataFrame):
    cat_egreso = cat_egreso.to_pandas()
cat_egreso.to_sql('cat_egreso', conn, if_exists='replace', index=False);

In [38]:
# Write the calendario catalogue to PostgreSQL. Convert only while `calendario`
# is still a polars frame (it is already pandas if the BigQuery upload cell ran).
if isinstance(calendario, pl.DataFrame):
    calendario = calendario.to_pandas()
calendario.to_sql('calendario', conn, if_exists='replace', index=False);

In [8]:
# Close the connection (this hands it back to the engine's connection pool)
conn.close()
# Dispose of the engine as well so the pooled connections to the hosted
# database are actually closed; the engine builds a fresh pool if used again.
engine.dispose()

---

### Contact

<img src="../Pictures/PMcard.png" style="width:250px;height:85px;">