# 4 - Modelagem com Machine Learning

P1: É possível predizer a taxa de poluição emitida no ar nos próximos anos?\
P2: A estação do ano influencia a quantidade de poluentes no ar de alguma forma?

1 - tentar fazer previsões utilizando modelos lineares para ambas as perguntas 1 e 2

2 - tentar verificar se eles conseguem prever com corretude anos posteriores ao treinamento do modelo.

3 - Aplicar modelos mais complexos para as mesmas coisas que a regressão linear tentou verificar

4 - verificar se as regressões mais complexas conseguem prever melhor que a regressão linear

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import duckdb
import pandas as pd
import numpy as np
import os 

# Indicator IDs of individual pollutants; the names suggest PM2.5, NO2 and O3
# (presumably values of the dataset's IndicatorID column -- TODO confirm).
PM25_ID = 365
NO2_ID = 375
O3_ID = 386

# Passed as `split_year` to linear_regression: rows with Year < SPLIT_YEAR are
# used for training and rows with Year >= SPLIT_YEAR for testing.
SPLIT_YEAR = 2020

# Folder the parquet input files are read from.
OUT_FOLDER = "./out"

In [46]:
def _resolve_column(frame: pd.DataFrame, wanted: str) -> str:
    """Return the label of the column in *frame* that matches *wanted*, ignoring case.

    An exact match is preferred; otherwise the first column whose lowercase
    form equals ``wanted.lower()`` is returned.

    Raises:
        KeyError: if no column matches.
    """
    if wanted in frame.columns:
        return wanted
    lowered = wanted.lower()
    for column in frame.columns:
        if str(column).lower() == lowered:
            return column
    raise KeyError(f"Column '{wanted}' not found (case-insensitive) in {list(frame.columns)}")


def linear_regression(data: pd.DataFrame, features: list, target_col: str, split_year: int, pollutant_id: int) -> None:
    """Fit and report a linear regression for one indicator, split by year.

    Rows of *data* belonging to *pollutant_id* are split into a training set
    (year < *split_year*) and a test set (year >= *split_year*). Features are
    standardized with a scaler fitted on the training set only, a
    ``LinearRegression`` is fitted, and R², RMSE and the coefficients are
    printed. Nothing is returned.

    The indicator-id, year and name columns are looked up case-insensitively
    (``IndicatorID``/``indicatorid``, ``Year``/``year``, ``name``/``Name``), so
    the caller does not have to provide both spellings.

    Args:
        data: Frame holding the id, year and name columns plus every column
            listed in *features* and *target_col*.
        features: Names of the predictor columns.
        target_col: Name of the column to predict.
        split_year: First year of the test period.
        pollutant_id: Indicator id used to filter *data*.

    Raises:
        KeyError: if a required column is missing from *data*.
    """
    id_col = _resolve_column(data, 'IndicatorID')
    year_col = _resolve_column(data, 'Year')
    name_col = _resolve_column(data, 'name')
    feature_cols = list(features)

    # Drop only rows with NaNs in columns the model actually uses (e.g. the
    # leading rows of lag features); NaNs in unrelated columns must not
    # silently discard observations.
    required_cols = list(dict.fromkeys([*feature_cols, target_col, year_col]))
    df_filtered = data[data[id_col] == pollutant_id].dropna(subset=required_cols).copy()

    if df_filtered.empty:
        print(f"Dados insuficientes para o Indicator ID {pollutant_id} apÃ³s remoÃ§Ã£o de NaNs.")
        return

    # Train/test split by year
    train_rows = df_filtered[df_filtered[year_col] < split_year]
    test_rows = df_filtered[df_filtered[year_col] >= split_year]

    # Fitting the scaler/model on an empty partition would raise; report and
    # skip this indicator instead, mirroring the empty-data guard above.
    if train_rows.empty or test_rows.empty:
        print(f"Dados insuficientes para o Indicator ID {pollutant_id}: treino ou teste vazio com split_year={split_year}.")
        return

    X_train, y_train = train_rows[feature_cols], train_rows[target_col]
    X_test, y_test = test_rows[feature_cols], test_rows[target_col]

    # Standardization: statistics come from the training set only and are
    # then applied unchanged to the test set.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Model training
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Prediction and evaluation on the held-out years
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Coefficients of the model fitted on standardized features, largest first
    betas = pd.Series(model.coef_, index=feature_cols).sort_values(ascending=False)

    # Output
    # NOTE(review): the "AR(3) + Year" label in the header is hard-coded and
    # does not reflect the `features` actually passed in.
    pollutant_name = df_filtered[name_col].iloc[0]  # name of the first row of this indicator
    print("------------------------------------------------------------------")
    print(f"## ðŸ“ˆ Resultados (RL - AR(3) + Year) | {pollutant_name} (Citywide)")
    print("------------------------------------------------------------------")
    print(f"PerÃ­odo Treino: {df_filtered[year_col].min()} - {split_year-1} | Teste: {split_year} - {df_filtered[year_col].max()}")
    print(f"R-quadrado (RÂ²): {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print("\n### Coeficientes (Betas Padronizados):")
    print(betas)
    print("------------------------------------------------------------------")

# Análise de P1 a partir dos dados citywide


In [None]:
# Load the seasonal citywide air-quality table from OUT_FOLDER.
# The SQL below refers to this DataFrame by its Python variable name, so
# renaming the variable requires updating the query text as well.
P1_seasonal_citywide = pd.read_parquet(os.path.join(OUT_FOLDER, "seasonal_citywide_air_quality.parquet"))

# Query outline:
#   base   -> average of datavalue per (indicatorid, name, year)
#   select -> that yearly average as `Current`, plus the values of the 1, 2 and
#             3 preceding rows of the same indicator (ordered by year) as
#             Lag1/Lag2/Lag3. The first rows of each indicator therefore have
#             NULL lags.
# NOTE(review): the string is an f-string but contains no placeholders.
sql_query = f"""
with base as(
    select 
        indicatorid,
        name,
        year,
        AVG(datavalue) AS DataValue
    from P1_seasonal_citywide
    group by indicatorid, name, year
)
select
    indicatorid,
    name,
    year,
    DataValue as Current,
    lag(DataValue, 1) over (
            partition by indicatorid
            order by year
        ) as Lag1,
    lag(DataValue, 2) over (
            partition by indicatorid
            order by year
        ) as Lag2,
    lag(DataValue, 3) over (
            partition by indicatorid
            order by year
        ) as Lag3
from base
order by year;
"""
P2_seasonal_citywide = duckdb.query(sql_query).to_df()

# Add a lowercase copy of every column that does not already have one.
# `linear_regression` reads both capitalized ('IndicatorID', 'Year') and
# lowercase ('name', 'year') column names, so both spellings must exist.
for col in list(P2_seasonal_citywide.columns):
    lower = col.lower()
    if lower not in P2_seasonal_citywide.columns:
        P2_seasonal_citywide[lower] = P2_seasonal_citywide[col]

# Model inputs: the three lag columns predict the current yearly average.
# NOTE(review): an earlier comment here and the report header printed by
# `linear_regression` ("AR(3) + Year") both mention a year feature, but
# FEATURES contains only the lags -- confirm which is intended.
# INDICATOR_IDS assumes the query result exposes the id column with the
# capitalized name 'IndicatorID' -- TODO confirm the casing DuckDB returns.
FEATURES = ['Lag1','Lag2','Lag3']
TARGET_COL = 'Current'
INDICATOR_IDS = P2_seasonal_citywide['IndicatorID'].unique()

# Fit and print one regression report per indicator found in the data.
for pollutant_id in INDICATOR_IDS:
    linear_regression(
        data=P2_seasonal_citywide,
        features=FEATURES,
        target_col=TARGET_COL,
        split_year=SPLIT_YEAR,
        pollutant_id=pollutant_id
    )


------------------------------------------------------------------
## ðŸ“ˆ Resultados (RL - AR(3) + Year) | Nitrogen dioxide (NO2) (Citywide)
------------------------------------------------------------------
PerÃ­odo Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (RÂ²): 0.314
RMSE: 0.431

### Coeficientes (Betas Padronizados):
Lag3    0.144332
Lag1   -0.788555
Lag2   -0.809236
Year   -1.968363
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## ðŸ“ˆ Resultados (RL - AR(3) + Year) | Ozone (O3) (Citywide)
------------------------------------------------------------------
PerÃ­odo Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (RÂ²): -2.185
RMSE: 4.387

### Coeficientes (Betas Padronizados):
Lag1   -0.871304
Lag2   -0.901139
Lag3   -0.923537
Year   -1.396787
dtype: float64
------------------------------------------------------------------
-----------------------------------------