In [1]:
import dagster
import sys
import pandas as pd
import pendulum
sys.path.append("/home/lmoraes/maestro")
from pathlib import Path
import tempfile

In [6]:
import repositories.capturas.br_rj_riodejaneiro_onibus_rdo.registros as pipeline
import repositories.capturas.solids as solids
import repositories.capturas.resources as resources
from repositories.helpers.helpers import read_config


In [92]:
# Load the run configuration from the BRT RDO "registros" YAML file, then
# override the timezone entry of the ``timezone_config`` resource.
config = read_config(
    "/home/lmoraes/maestro/repositories/capturas/br_rj_riodejaneiro_brt_rdo/registros.yaml"
)
config["resources"]["timezone_config"]["config"]["timezone"] = "America/Sao_Paulo"

In [121]:
@dagster.composite_solid
def set_header_jupyter(file_path, header):
    """Chain the ``delete_xls_header`` and ``set_xls_header`` pipeline solids.

    ``file_path`` is fed to ``pipeline.delete_xls_header``; its output, together
    with ``header``, is fed to ``pipeline.set_xls_header``, whose output becomes
    the output of this composite solid.
    """
    headerless_path = pipeline.delete_xls_header(file_path)
    return pipeline.set_xls_header(headerless_path, header)


@dagster.solid(
    required_resource_keys={"basedosdados_config", "timezone_config"},
)
def pre_treatment_br_rj_riodejaneiro_brt_rdo_jupyter(context, file_path):
    """Read the raw ``;``-delimited CSV at ``file_path`` and normalise its columns.

    The columns are relabelled positionally with ``pipeline.ORIGINAL_HEADER``,
    renamed through ``pipeline.column_mapping`` and reindexed to
    ``pipeline.ORDERED_HEADER``. A ``timestamp_captura`` column holding the
    current time in the timezone of the ``timezone_config`` resource is then
    added to every row.

    Returns the resulting DataFrame.
    """
    tz_name = context.resources.timezone_config["timezone"]

    # Relabel the raw columns, then map them to the target names and order.
    raw = pd.read_csv(file_path, delimiter=";")
    raw.columns = pipeline.ORIGINAL_HEADER
    treated = raw.rename(columns=pipeline.column_mapping).reindex(
        columns=pipeline.ORDERED_HEADER
    )

    # Stamp every row with the capture time (timezone-aware ISO string parsed by pandas).
    treated["timestamp_captura"] = pd.to_datetime(pendulum.now(tz_name).isoformat())
    context.log.debug(", ".join(treated.columns))

    return treated

@dagster.pipeline(
    mode_defs=[
        dagster.ModeDefinition(
            "dev",
            resource_defs={
                "basedosdados_config": resources.basedosdados_config,
                "timezone_config": resources.timezone_config,
                "discord_webhook": resources.discord_webhook,
            },
        ),
    ],
)
def br_rj_riodejaneiro_brt_rdo_registros_jupyter():
    """Notebook variant of the "registros" pipeline, with a single ``dev`` mode.

    Wiring: ``parse_file_path_and_partitions`` -> ``get_file_from_storage`` ->
    ``pre_treatment_br_rj_riodejaneiro_brt_rdo_jupyter`` -> ``save_treated_local``.
    The header-rewrite and BigQuery steps are left commented out below.
    """
    filename, filetype, file_path, partitions = pipeline.parse_file_path_and_partitions()

    raw_file_path = pipeline.get_file_from_storage(
        file_path=file_path,
        filename=filename,
        partitions=partitions,
        filetype=filetype,
    )

    # Header rewriting is currently disabled:
    # header = pipeline.get_header()
    # raw_header_file_path = set_header_jupyter(raw_file_path, header)

    treated_df = pre_treatment_br_rj_riodejaneiro_brt_rdo_jupyter(raw_file_path)

    # Kept bound to a name because the disabled steps below refer to it.
    treated_file_path = pipeline.save_treated_local(treated_df, file_path)

    # TODO: rework the upload function so that it only uploads to staging
    # upload_to_bigquery(treated_file_path, raw_file_path, partitions)
    # pipeline.create_table_bq(treated_file_path)

In [122]:
# Run the pipeline defined in this notebook using the run config held in `config`.
result = dagster.execute_pipeline(
    br_rj_riodejaneiro_brt_rdo_registros_jupyter,
    run_config=config,
)

2021-03-22 18:38:18 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - ENGINE_EVENT - Starting initialization of resources [basedosdados_config, io_manager, timezone_config].
2021-03-22 18:38:18 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - ENGINE_EVENT - Finished initialization of resources [basedosdados_config, io_manager, timezone_config].
2021-03-22 18:38:18 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - PIPELINE_START - Started execution of pipeline "br_rj_riodejaneiro_brt_rdo_registros_jupyter".
2021-03-22 18:38:18 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - ENGINE_EVENT - Executing steps in process (pid: 950)
2021-03-22 18:38:18 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df803

In [108]:
# Fetch the DataFrame returned by the pre-treatment solid from the pipeline result.
df = result.output_for_solid('pre_treatment_br_rj_riodejaneiro_brt_rdo_jupyter')

2021-03-22 18:34:28 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - 2b6f7383-170c-496f-ba4f-730a64709ec5 - 950 - ENGINE_EVENT - Starting initialization of resources [basedosdados_config, io_manager, timezone_config].
2021-03-22 18:34:28 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - 2b6f7383-170c-496f-ba4f-730a64709ec5 - 950 - ENGINE_EVENT - Finished initialization of resources [basedosdados_config, io_manager, timezone_config].


In [123]:
# Fetch the output of the `get_file_from_storage` solid; the next cell shows it
# to be the local path of the raw CSV.
file_path = result.output_for_solid('get_file_from_storage')

2021-03-22 21:03:51 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - ENGINE_EVENT - Starting initialization of resources [basedosdados_config, io_manager, timezone_config].
2021-03-22 21:03:51 - dagster - DEBUG - br_rj_riodejaneiro_brt_rdo_registros_jupyter - a8ccbe55-b1d2-405b-a8ea-df8036fc8cec - 950 - ENGINE_EVENT - Finished initialization of resources [basedosdados_config, io_manager, timezone_config].


In [124]:
# Display the path returned by `get_file_from_storage`.
file_path

'/home/lmoraes/maestro/notebooks/data/raw/br_rj_riodejaneiro_brt_rdo/registros/year=2021/month=03/RDO012021_v1.csv'

In [112]:
# Preview a single column of `df`.
# NOTE(review): 'GRATUIDADE TOTAL' looks like a pre-rename column label, while the
# pre-treatment solid as currently defined renames and reindexes the columns, so
# this lookup may fail after Restart & Run All. Confirm against
# pipeline.ORDERED_HEADER.
df['GRATUIDADE TOTAL']

0         4
1        21
2        20
3        52
4        45
       ... 
3911    596
3912    644
3913    689
3914    424
3915    322
Name: GRATUIDADE TOTAL, Length: 3916, dtype: int64

In [125]:
from openpyxl import load_workbook

In [61]:
# NOTE(review): `ws` is not defined in any visible cell (`load_workbook` is
# imported but never called here). Presumably it was a worksheet created in a
# since-deleted cell; confirm and restore its definition before re-running.
ws.cell(2,22).value

In [127]:
# Load the CSV from the staging folder with every column read as text, then
# show its first record. Note that this rebinds `file_path` and `df`.
file_path = (
    "/home/lmoraes/maestro/notebooks/data/staging/br_rj_riodejaneiro_brt_rdo/"
    "registros/year=2021/month=03/RDO012021_v1.csv"
)
df = pd.read_csv(file_path, dtype=str)
df.iloc[0]

operadora                                                   221000050
linha                                                           TCLJO
servico_tipo                                                        A
servico_termo                                                       1
tipo_veiculo                                                        1
data_ano                                                         2021
data_mes                                                            1
data_dia                                                            1
tarifa_codigo                                                       A
tarifa_valor                                                     4,05
frota_determinada                                                   0
frota_licenciada                                                    0
frota_operante                                                      0
viagem_realizada                                                    0
km                  