# 4 - Modelagem com Machine Learning

P1: É possível predizer a taxa de poluição emitida no ar nos próximos anos?\
P2: A estação do ano influencia a quantidade de poluentes no ar de alguma forma?

1 - tentar fazer previsões utilziando modelos lineares para ambos perguntas 1 e 2

2 - tentar verificar se eles conseguem prever com corretude anos posteriores ao treinamento do modelo.

3 - Aplicar modelos mais complexos para as mesmas coisas que a regressão linear tentou verificar

4 - verificar se as regressões mais complexas conseguem prever melhor que a regressão linear 

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_regression, r_regression
from sklearn.impute import SimpleImputer
import duckdb
import pandas as pd
import numpy as np
import os 

PM25_ID = 365
NO2_ID = 375
O3_ID = 386

SPLIT_YEAR=2020

OUT_FOLDER = "./out"

In [2]:
def calc_features(features: list, target_col: str, df: pd.DataFrame):
    # Complex features example (uses df_PM25 / FEATURES_COMPLEXAS)
    X = df[features]
    y = df[target_col]

    imputer = SimpleImputer(strategy='mean')
    X_imp = imputer.fit_transform(X)
    selector = SelectKBest(f_regression, k=3).fit(X_imp, y)
    selected_complex = [f for f, keep in zip(features, selector.get_support()) if keep]
    feat_scores = pd.Series(selector.scores_, index=features)
    feat_scores.sort_values(ascending=False, inplace=True)

    # Selected features ordered by score
    selected_sorted = [f for f in feat_scores.index if f in selected_complex]

    # print(f"\n## Feature Selection - {df['Name'].iloc[0]} (Citywide) ###")
    # print("Selected (complex) ordenadas por score:", selected_sorted)
    # print("Scores (ordenados):", feat_scores.to_dict())

    return feat_scores

def linear_regression(data: pd.DataFrame, features: list, target_col: str, split_year: int, pollutant_id: int):
    df_filtered = data[data['IndicatorID'] == pollutant_id].dropna().copy()

    # Separação Treino/Teste por ano
    X_train = df_filtered[df_filtered['Year'] < split_year][features]
    y_train = df_filtered[df_filtered['Year'] < split_year][target_col]
    X_test = df_filtered[df_filtered['Year'] >= split_year][features]
    y_test = df_filtered[df_filtered['Year'] >= split_year][target_col]
    
    # Padronização (Scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Treinamento do Modelo
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    
    # Predição e Avaliação
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Coeficientes (Betas Padronizados)
    betas = pd.Series(model.coef_, index=features).sort_values(ascending=False)
    
    # Output
    pollutant_name = df_filtered['Name'].iloc[0] # Pega o nome do poluente
    print("-------------------------------------------------")
    print(f"## Resultados RL | {pollutant_name} (Citywide)")
    print(f"Período Treino: {df_filtered['Year'].min()} - {split_year-1} | Teste: {split_year} - {df_filtered['Year'].max()}")
    print(f"R-quadrado (R²): {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print("\n## Coeficientes:")
    print(betas)
    print("-------------------------------------------------")

def rf_regression(data: pd.DataFrame, features: list, target_col: str, split_year: int, pollutant_id: int):
    df_filtered = data[data['IndicatorID'] == pollutant_id].copy()

    # Separação Treino/Teste por ano
    X_train = df_filtered[df_filtered['Year'] < split_year][features]
    y_train = df_filtered[df_filtered['Year'] < split_year][target_col]
    X_test = df_filtered[df_filtered['Year'] >= split_year][features]
    y_test = df_filtered[df_filtered['Year'] >= split_year][target_col]
    
    # Treinamento do Modelo
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predição e Avaliação
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Importância das Features
    importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
    
    # Output
    pollutant_name = df_filtered['Name'].iloc[0] # Pega o nome do poluente
    print("-------------------------------------------------")
    print(f"## Resultados RF | {pollutant_name} (Citywide)")
    print(f"Período Treino: {df_filtered['Year'].min()} - {split_year-1} | Teste: {split_year} - {df_filtered['Year'].max()}")
    print(f"R-quadrado (R²): {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print("\n## Importância das Features:")
    print(importances)
    print("-------------------------------------------------")

def gradient_boosting_regression(data: pd.DataFrame, features: list, target_col: str, split_year: int, pollutant_id: int):
    df_filtered = data[data['IndicatorID'] == pollutant_id].dropna().copy()

    # Separação Treino/Teste por ano
    X_train = df_filtered[df_filtered['Year'] < split_year][features]
    y_train = df_filtered[df_filtered['Year'] < split_year][target_col]
    X_test = df_filtered[df_filtered['Year'] >= split_year][features]
    y_test = df_filtered[df_filtered['Year'] >= split_year][target_col]
    
    # Treinamento do Modelo
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)
    
    # Predição e Avaliação
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Importância das Features
    importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
    
    # Output
    pollutant_name = df_filtered['Name'].iloc[0] # Pega o nome do poluente
    print("-------------------------------------------------")
    print(f"## Resultados GB | {pollutant_name} (Citywide)")
    print(f"Período Treino: {df_filtered['Year'].min()} - {split_year-1} | Teste: {split_year} - {df_filtered['Year'].max()}")
    print(f"R-quadrado (R²): {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print("\n## Importância das Features:")
    print(importances)
    print("-------------------------------------------------")


# Feature selection de dados Citywide utilizando Pearson


In [3]:
P1_seasonal_citywide = pd.read_parquet(os.path.join(OUT_FOLDER, "seasonal_citywide_air_quality.parquet"))

sql_query_simples = f"""
with base as(
    select 
        indicatorid,
        name,
        year,
        AVG(datavalue) AS DataValue
    from P1_seasonal_citywide
    group by indicatorid, name, year
)
select
    indicatorid,
    name,
    year,
    DataValue as Current,
    lag(DataValue, 1) over (
            partition by indicatorid
            order by year
        ) as Lag1,
    lag(DataValue, 2) over (
            partition by indicatorid
            order by year
        ) as Lag2,
    lag(DataValue, 3) over (
            partition by indicatorid
            order by year
        ) as Lag3
from base
order by year;
"""

sql_query_complexa = f"""
WITH base AS (
    SELECT 
        indicatorid,
        name,
        year,
        AVG(datavalue) AS DataValue
    FROM P1_seasonal_citywide
    GROUP BY indicatorid, name, year
),
lag_no2 AS (
    SELECT
				indicatorid,
				name,
        year,
        DataValue,
        LAG(DataValue, 1) OVER (ORDER BY year) AS Lag1,
        LAG(DataValue, 2) OVER (ORDER BY year) AS Lag2,
        LAG(DataValue, 3) OVER (ORDER BY year) AS Lag3
    FROM base
    WHERE indicatorid = 375
),
lag_pm25 as (
	  SELECT
				indicatorid,
				name,
        year,
        DataValue,
        LAG(DataValue, 1) OVER (ORDER BY year) AS Lag1,
        LAG(DataValue, 2) OVER (ORDER BY year) AS Lag2,
        LAG(DataValue, 3) OVER (ORDER BY year) AS Lag3
    FROM base
    WHERE indicatorid = 365
),
lag_o3 as (
	  SELECT
				indicatorid,
				name,
        year,
        DataValue,
        LAG(DataValue, 1) OVER (ORDER BY year) AS Lag1,
        LAG(DataValue, 2) OVER (ORDER BY year) AS Lag2,
        LAG(DataValue, 3) OVER (ORDER BY year) AS Lag3
    FROM base
    WHERE indicatorid = 386
)
SELECT
    b.indicatorid,
    b.name,
    b.year,
    b.DataValue AS Current,
    no2.Lag1 AS Lag1_NO2,
    no2.Lag2 AS Lag2_NO2,
    no2.Lag3 AS Lag3_NO2,
		pm25.Lag1 AS Lag1_PM25,
    pm25.Lag2 AS Lag2_PM25,
    pm25.Lag3 AS Lag3_PM25,
		o3.Lag1 AS Lag1_O3,
		o3.Lag2 AS Lag2_O3,
    o3.Lag3 AS Lag3_O3
FROM base b
INNER JOIN lag_no2 no2 ON b.year = no2.year
INNER JOIN lag_pm25 pm25 ON b.year = pm25.year
INNER JOIN lag_o3 o3 ON b.year = o3.year
ORDER BY b.name, b.year;
"""

df_query_complexa = duckdb.query(sql_query_complexa).to_df()
df_query_simples = duckdb.query(sql_query_simples).to_df()

TARGET_COL = 'Current'
FEATURES_SIMPLES = ['Lag1','Lag2','Lag3','Year']
FEATURES_COMPLEXAS = [
    'Lag1_NO2', 'Lag2_NO2', 'Lag3_NO2',
    'Lag1_PM25', 'Lag2_PM25', 'Lag3_PM25',
    'Lag1_O3', 'Lag2_O3', 'Lag3_O3',
    'Year'
]

df_PM25_complexa = df_query_complexa[df_query_complexa['IndicatorID'] == PM25_ID]
df_NO2_complexa = df_query_complexa[df_query_complexa['IndicatorID'] == NO2_ID]
df_O3_complexa = df_query_complexa[df_query_complexa['IndicatorID'] == O3_ID]

df_PM25_simples = df_query_simples[df_query_simples['IndicatorID'] == PM25_ID]
df_NO2_simples = df_query_simples[df_query_simples['IndicatorID'] == NO2_ID]
df_O3_simples = df_query_simples[df_query_simples['IndicatorID'] == O3_ID]


In [4]:
for df in [df_PM25_simples, df_NO2_simples, df_O3_simples]:
    df_filtered = df[df['Year'] < SPLIT_YEAR] #evitar data leakage
    res = calc_features(
        features=FEATURES_SIMPLES,
        target_col=TARGET_COL,
        df=df_filtered
    )

    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print(res.head(1))
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

    linear_regression(
        data=df,
        features=res.index.tolist()[:1],
        target_col=TARGET_COL,
        split_year=SPLIT_YEAR,
        pollutant_id=df['IndicatorID'].iloc[0]
    )

    print("###########################################################################\n")

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Year    58.613714
dtype: float64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-------------------------------------------------
## Resultados RL | Fine particles (PM 2.5) (Citywide)
Período Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -2.948
RMSE: 0.732

## Coeficientes:
Year   -0.85887
dtype: float64
-------------------------------------------------
###########################################################################

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Year    39.368027
dtype: float64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-------------------------------------------------
## Resultados RL | Nitrogen dioxide (NO2) (Citywide)
Período Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): 0.188
RMSE: 0.469

## Coeficientes:
Year   -0.83841
dtype: float64
-------------------------------------------------
###################################################

In [5]:
for df in [df_PM25_complexa, df_NO2_complexa, df_O3_complexa]:
    df_filtered = df[df['Year'] < SPLIT_YEAR] #evitar data leakage
    res = calc_features(
        features=FEATURES_COMPLEXAS,
        target_col=TARGET_COL,
        df=df_filtered
    )
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print(res.head(3))
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

    linear_regression(
        data=df,
        features=res.index.tolist()[:3],
        target_col=TARGET_COL,
        split_year=SPLIT_YEAR,
        pollutant_id=df['IndicatorID'].iloc[0]
    )
    rf_regression(
        data=df,
        features=res.index.tolist()[:3],
        target_col=TARGET_COL,
        split_year=SPLIT_YEAR,
        pollutant_id=df['IndicatorID'].iloc[0]
    )
    gradient_boosting_regression(
        data=df,
        features=res.index.tolist()[:3],
        target_col=TARGET_COL,
        split_year=SPLIT_YEAR,
        pollutant_id=df['IndicatorID'].iloc[0]
    )
    print("###########################################################################\n")

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Year         58.613714
Lag1_PM25     7.044944
Lag2_NO2      6.884790
dtype: float64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-------------------------------------------------
## Resultados RL | Fine particles (PM 2.5) (Citywide)
Período Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -5.755
RMSE: 0.957

## Coeficientes:
Lag2_NO2    -0.373348
Lag1_PM25   -0.557036
Year        -1.646957
dtype: float64
-------------------------------------------------
-------------------------------------------------
## Resultados RF | Fine particles (PM 2.5) (Citywide)
Período Treino: 2009 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -3.859
RMSE: 0.812

## Importância das Features:
Year         0.490233
Lag1_PM25    0.272626
Lag2_NO2     0.237141
dtype: float64
-------------------------------------------------
-------------------------------------------------
## Resultados GB | Fine particles (PM 2.5) (Citywide)
Período 