In [1]:
!pip install schedule



In [2]:
# Execute the shared notebook in this kernel's namespace.
# NOTE(review): presumably this is where `get_or_create_session_spark`
# (used by job_raw_preview below) is defined - confirm.
%run ../spark_session.ipynb

In [3]:
import time
import requests
import schedule
from pyspark.sql.functions import explode, col, to_timestamp

In [4]:
def explode_data(df):
    """Expand the nested ``ps`` array and its inner ``vs`` array into rows.

    Args:
        df: Spark DataFrame with an array column ``ps`` whose elements are
            structs containing an array field ``vs``.

    Returns:
        DataFrame in which ``ps`` holds a single element of the original
        array and a new column ``vs`` holds a single element of ``ps.vs``;
        one output row per (``ps`` element, ``vs`` element) pair.
    """
    # First level: one row per element of the `ps` array.
    per_ps = df.withColumn("ps", explode(df["ps"]))
    # Second level: one row per element of the `vs` array nested in `ps`.
    return per_ps.withColumn("vs", explode(per_ps["ps"]["vs"]))

In [5]:
def output_formatter(df):
    """Flatten the ``ps`` and ``vs`` structs into named top-level columns.

    Expects the frame produced by ``explode_data``: each row carries one
    ``ps`` struct and one ``vs`` struct (a single element of ``ps.vs``).

    The ``vs``-derived fields are read from the per-row ``vs`` struct.
    Reading them from ``ps.vs`` (still an array on every row) and exploding
    each resulting array independently would build a cartesian product of
    the fields, multiplying rows and pairing values that belong to
    different ``vs`` elements.

    Args:
        df: Spark DataFrame with columns ``hr``, ``ps`` (struct with fields
            ``cp``, ``np``, ``py``, ``px``) and ``vs`` (struct with fields
            ``p``, ``t``, ``a``, ``ta``, ``py``, ``px``).

    Returns:
        DataFrame with ``hr`` renamed to ``hora``, the struct fields exposed
        as top-level columns, ``data_hora`` converted with ``to_timestamp``,
        and the ``ps`` / ``vs`` struct columns dropped.
    """
    df = (
        df.withColumnRenamed("hr", "hora")
        # Fields of the `ps` struct.
        .withColumn("codigo_parada", col("ps.cp"))
        .withColumn("nome_parada", col("ps.np"))
        .withColumn("latitude_parada", col("ps.py"))
        .withColumn("longitude_parada", col("ps.px"))
        # Fields of the already-exploded `vs` struct: one element per row,
        # so no further explode is needed and the values stay aligned.
        .withColumn("prefixo_veiculo", col("vs.p"))
        .withColumn("previsao_chegada_parada", col("vs.t"))
        .withColumn("flag_acessivel", col("vs.a"))
        .withColumn("data_hora", col("vs.ta"))
        .withColumn("latitude_veiculo", col("vs.py"))
        .withColumn("longitude_veiculo", col("vs.px"))
    )

    df = df.withColumn("data_hora", to_timestamp(col("data_hora")))

    return df.drop("ps", "vs")

In [6]:
def job_raw_preview():
    """Load the raw preview Delta table, flatten it and overwrite the trusted copy.

    Steps:
      1. Obtain a Spark session via ``get_or_create_session_spark``.
      2. Read the Delta table at ``s3a://raw/preview``.
      3. Apply ``explode_data`` followed by ``output_formatter``.
      4. Overwrite the Delta table at ``s3a://trusted/preview``.

    The session is stopped in a ``finally`` block so it is released even
    when reading, transforming or writing raises; the exception still
    propagates to the caller.
    """
    print("Criando sessão spark e carregando dados /raw/preview")
    spark = get_or_create_session_spark("raw_trusted_preview")
    try:
        df = spark.read.format("delta").load("s3a://raw/preview")

        print("Aplicando tratamentos")
        df = explode_data(df)
        df = output_formatter(df)

        print("Salvando dados!")
        df.write.format("delta").mode("overwrite").save("s3a://trusted/preview")
    finally:
        # Always release the session, including on failure.
        spark.stop()
    print("Processamento finalizado!")

In [None]:
# Tag identifying this notebook's job on the global default scheduler.
PREVIEW_JOB_TAG = "raw_trusted_preview"

# `schedule` keeps jobs in module-level state that survives across cell
# executions. Remove any job left over from a previous run of this cell so
# re-executing it does not run the pipeline more than once per interval.
schedule.clear(PREVIEW_JOB_TAG)
schedule.every(5).minutes.do(job_raw_preview).tag(PREVIEW_JOB_TAG)

# Blocks this cell indefinitely: poll once per second for due jobs.
while True:
    schedule.run_pending()
    time.sleep(1)

Criando sessão spark e carregando dados /raw/preview
Obtendo sessão spark raw_trusted_preview
Sessão Spark obtida com sucesso! raw_trusted_preview
Aplicando tratamentos
Salvando dados!
Processamento finalizado!
