<a href="https://colab.research.google.com/github/Rogerio-mack/data-engineering/blob/main/how_airflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 !pip install apache-airflow

Collecting apache-airflow
  Downloading apache_airflow-2.10.5-py3-none-any.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic<2.0,>=1.13.1 (from apache-airflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting argcomplete>=1.10 (from apache-airflow)
  Downloading argcomplete-3.6.1-py3-none-any.whl.metadata (16 kB)
Collecting asgiref>=2.3.0 (from apache-airflow)
  Downloading asgiref-3.8.1-py3-none-any.whl.metadata (9.3 kB)
Collecting colorlog>=6.8.2 (from apache-airflow)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting configupdater>=3.1.1 (from apache-airflow)
  Downloading ConfigUpdater-3.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting connexion<3.0,>=2.14.2 (from connexion[flask]<3.0,>=2.14.2->apache-airflow)
  Downloading connexion-2.14.2-py2.py3-none-any.whl.metadata (28 kB)
Collecting cron-descriptor>=1.2.24 (from

# Original data: **NFL Stats**

In [None]:
import pandas as pd

# Source table for the whole notebook: NFL offensive stats hosted on GitHub.
NFL_STATS_URL = 'https://github.com/Rogerio-mack/IMT_CD_2025/raw/refs/heads/main/data/nfl_offensive_stats.csv'
df = pd.read_csv(NFL_STATS_URL)

# Preview the first rows, then report the (rows, columns) shape.
display(df.head())
print(df.shape)

Unnamed: 0,game_id,player_id,position,player,team,pass_cmp,pass_att,pass_yds,pass_td,pass_int,...,OT,Roof,Surface,Temperature,Humidity,Wind_Speed,Vegas_Line,Vegas_Favorite,Over_Under,game_date
0,201909050chi,RodgAa00,QB,Aaron Rodgers,GNB,18,30,203,1,0,...,False,outdoors,grass,65,69,10,-3.5,CHI,47.0,9/5/2019
1,201909050chi,JoneAa00,RB,Aaron Jones,GNB,0,0,0,0,0,...,False,outdoors,grass,65,69,10,-3.5,CHI,47.0,9/5/2019
2,201909050chi,ValdMa00,WR,Marquez Valdes-Scantling,GNB,0,0,0,0,0,...,False,outdoors,grass,65,69,10,-3.5,CHI,47.0,9/5/2019
3,201909050chi,AdamDa01,WR,Davante Adams,GNB,0,0,0,0,0,...,False,outdoors,grass,65,69,10,-3.5,CHI,47.0,9/5/2019
4,201909050chi,GrahJi00,TE,Jimmy Graham,GNB,0,0,0,0,0,...,False,outdoors,grass,65,69,10,-3.5,CHI,47.0,9/5/2019


(19973, 69)


# Definindo a DAG

In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
import pandas as pd

def extract(
    url='https://github.com/Rogerio-mack/IMT_CD_2025/raw/refs/heads/main/data/nfl_offensive_stats.csv',
    output_path='/tmp/dados_extraidos.csv',
):
    """Read the source CSV and persist a fixed subset of its columns.

    Called with no arguments it behaves exactly as before: it reads the NFL
    offensive stats file and writes the subset to ``/tmp/dados_extraidos.csv``.

    Args:
        url: Source CSV location; anything ``pandas.read_csv`` accepts
            (remote URL or local path).
        output_path: Destination of the extracted CSV (written without the
            index column).

    Raises:
        KeyError: If any of the selected columns is missing from the source.
    """
    selected_columns = [
        'game_id', 'vis_team', 'home_team', 'vis_score', 'home_score',
        'Roof', 'Surface', 'Temperature', 'Humidity', 'Wind_Speed',
    ]

    df = pd.read_csv(url)
    df = df[selected_columns]

    df.to_csv(output_path, index=False)
    print("Extração concluída.")

def transform(
    input_path='/tmp/dados_extraidos.csv',
    output_path='/tmp/dados_transformados.csv',
):
    """Clean the extracted CSV and write the transformed version.

    Steps: drop rows with any missing value, format ``Temperature`` as a
    string with a ``°F`` suffix, cast ``Wind_Speed`` to integer, remove
    duplicate rows and reset the index.

    Args:
        input_path: CSV to read; defaults to the file written by the
            extraction step.
        output_path: Destination of the transformed CSV (written without the
            index column).

    Raises:
        ValueError: If ``Wind_Speed`` holds a value that cannot be cast to
            an integer.
    """
    df = pd.read_csv(input_path).dropna()

    # A single missing Temperature in the input makes pandas read the whole
    # column as float, which would render labels as "65.0°F". When every
    # remaining value is a whole number, cast back to int so the label is
    # "65°F" regardless of whether missing values were present.
    temperature = df['Temperature']
    if pd.api.types.is_float_dtype(temperature) and (temperature % 1 == 0).all():
        temperature = temperature.astype('int64')

    df = (
        df.assign(
            Temperature=temperature.astype('str') + '°F',
            Wind_Speed=df['Wind_Speed'].astype('int'),
        )
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df.to_csv(output_path, index=False)
    print("Transformação concluída.")

def load(
    input_path='/tmp/dados_transformados.csv',
    output_path='/tmp/dados_finais.csv',
):
    """Read the transformed CSV and write it to its final destination.

    The data is round-tripped through pandas (read, then written without the
    index column); no further transformation is applied.

    Args:
        input_path: CSV to read; defaults to the file written by the
            transformation step.
        output_path: Destination of the final CSV.
    """
    df = pd.read_csv(input_path)

    df.to_csv(output_path, index=False)
    print("Carregamento concluído.")

# Daily ETL pipeline: extract -> transform -> load, one PythonOperator per step.
with DAG(
    'etl_simples',
    start_date=datetime(2025, 4, 2),
    # `schedule` replaces `schedule_interval`, which is deprecated since
    # Airflow 2.4 and no longer accepted by Airflow 3.
    schedule='@daily',
    # Do not back-fill runs between start_date and today.
    catchup=False
) as dag:

    tarefa_extracao = PythonOperator(
        task_id='extrair_dados',
        python_callable=extract
    )

    tarefa_transformacao = PythonOperator(
        task_id='transformar_dados',
        python_callable=transform
    )

    tarefa_carregamento = PythonOperator(
        task_id='carregar_dados',
        python_callable=load
    )

    # Task dependencies: each step runs only after the previous one.
    tarefa_extracao >> tarefa_transformacao >> tarefa_carregamento


# Executando a DAG (Colab)




In [None]:
# Invoke each operator's execute() directly with an empty context,
# in the same order as the DAG dependency chain.
for tarefa in (tarefa_extracao, tarefa_transformacao, tarefa_carregamento):
    tarefa.execute(context={})

Extração concluída.
[2025-04-03T02:25:59.348+0000] {python.py:240} INFO - Done. Returned value was: None
Transformação concluída.
[2025-04-03T02:25:59.406+0000] {python.py:240} INFO - Done. Returned value was: None
Carregamento concluído.
[2025-04-03T02:25:59.418+0000] {python.py:240} INFO - Done. Returned value was: None


In [None]:
# Call the plain Python callables back to back, without going through the operators.
for etapa in (extract, transform, load):
    etapa()

Extração concluída.
Transformação concluída.
Carregamento concluído.


# Saídas da DAG

In [None]:
# Inspect the intermediate file written by the extraction step.
extracted_csv = '/tmp/dados_extraidos.csv'
df_extraidos = pd.read_csv(extracted_csv)
display(df_extraidos.head())

Unnamed: 0,game_id,vis_team,home_team,vis_score,home_score,Roof,Surface,Temperature,Humidity,Wind_Speed
0,201909050chi,GNB,CHI,10,3,outdoors,grass,65,69,10
1,201909050chi,GNB,CHI,10,3,outdoors,grass,65,69,10
2,201909050chi,GNB,CHI,10,3,outdoors,grass,65,69,10
3,201909050chi,GNB,CHI,10,3,outdoors,grass,65,69,10
4,201909050chi,GNB,CHI,10,3,outdoors,grass,65,69,10


In [None]:
# Inspect the final file written by the load step.
final_csv = '/tmp/dados_finais.csv'
df_finais = pd.read_csv(final_csv)
display(df_finais.head())

Unnamed: 0,game_id,vis_team,home_team,vis_score,home_score,Roof,Surface,Temperature,Humidity,Wind_Speed
0,201909050chi,GNB,CHI,10,3,outdoors,grass,65°F,69,10
1,201909080car,LAR,CAR,30,27,outdoors,grass,87°F,53,3
2,201909080cle,TEN,CLE,43,13,outdoors,grass,71°F,55,10
3,201909080crd,DET,ARI,27,27,retractable roof (closed),grass,72°F,45,0
4,201909080dal,NYG,DAL,17,35,retractable roof (closed),fieldturf,72°F,45,0


# Melhorias

* **Tratamento de exceções**

> Pode ser feito com blocos `try`/`except` para tratar situações de erro nos dados, como formatos inválidos. Suponha, por exemplo, um erro na conversão `df['Wind_Speed'].astype('int')` por haver um valor não numérico na entrada.

* **Carga dos dados tratados**

> O load dos dados poderia ser feito em um banco de dados, para processamento futuro.

* **Log & report do processamento**

> Cada etapa pode ter logs e/ou reports do processamento, informando registros processados, registros de saída, valores sumários (por exemplo, pense no total em R$ das vendas processadas em um dado arquivo).

- Facilitar o reaproveitamento da DAG em outros projetos.

etc.
