# 4 - Modelagem com Machine Learning

P1: É possível predizer a taxa de poluição emitida no ar nos próximos anos?\
P2: A estação do ano influencia a quantidade de poluentes no ar de alguma forma?

1 - tentar fazer previsões utilziando modelos lineares para ambos perguntas 1 e 2

2 - tentar verificar se eles conseguem prever com corretude anos posteriores ao treinamento do modelo.

3 - Aplicar modelos mais complexos para as mesmas coisas que a regressão linear tentou verificar

4 - verificar se as regressões mais complexas conseguem prever melhor que a regressão linear 

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import os 

PM25_ID = 365
NO2_ID = 375
O3_ID = 386

split_year = 2020

out_folder = "./out"

# Análise de P1 a partir dos dados citywide


In [13]:
P1_seasonal_citywide = pd.read_parquet(os.path.join(out_folder, "seasonal_citywide_air_quality.parquet"))

PM25_seasonal_citywide = P1_seasonal_citywide[P1_seasonal_citywide['IndicatorID'] == PM25_ID].copy()
NO2_seasonal_citywide = P1_seasonal_citywide[P1_seasonal_citywide['IndicatorID'] == NO2_ID].copy()

PM25_seasonal_citywide.sort_values(by=['Year', 'Season'], inplace=True)
NO2_seasonal_citywide.sort_values(by=['Year', 'Season'], inplace=True)

PM25_annual_citywide = PM25_seasonal_citywide.groupby('Year')['DataValue'].mean().reset_index()
NO2_annual_citywide = NO2_seasonal_citywide.groupby('Year')['DataValue'].mean().reset_index()
O3_annual_citywide = P1_seasonal_citywide[P1_seasonal_citywide['IndicatorID'] == O3_ID].copy()
O3_annual_citywide.drop(['Season', 'MeasureInfo', 'Name', 'IndicatorID', 'Measure'], axis=1, inplace=True)
O3_annual_citywide.sort_values(by=['Year'], inplace=True)

# Análise de P1 a partir dos dados borough


In [None]:
P1_seasonal_borough = pd.read_parquet(os.path.join(out_folder, "seasonal_borough_air_quality.parquet"))

P1_seasonal_borough.groupby(['Year', 'GeoPlaceName', 'IndicatorID'])['DataValue'].mean().reset_index()

P1_seasonal_borough.sort_values(by=['Year', 'GeoPlaceName', 'IndicatorID'], inplace=True)

P1_seasonal_borough

# PM25_seasonal_borough = P1_seasonal_borough[P1_seasonal_borough['IndicatorID'] == PM25_ID].copy()
# NO2_seasonal_borough = P1_seasonal_borough[P1_seasonal_borough['IndicatorID'] == NO2_ID].copy()
# O3_seasonal_borough = P1_seasonal_borough[P1_seasonal_borough['IndicatorID'] == O3_ID].copy()

# PM25_seasonal_borough = PM25_seasonal_borough.groupby(['Year', 'GeoPlaceName'])['DataValue'].mean().reset_index()
# NO2_seasonal_borough = NO2_seasonal_borough.groupby(['Year', 'GeoPlaceName'])['DataValue'].mean().reset_index()
# O3_seasonal_borough = O3_seasonal_borough.groupby(['Year', 'GeoPlaceName'])['DataValue'].mean().reset_index()

# PM25_bronx = PM25_seasonal_borough[PM25_seasonal_borough['GeoPlaceName'] == 'Bronx'].copy()
# NO2_bronx = NO2_seasonal_borough[NO2_seasonal_borough['GeoPlaceName'] == 'Bronx'].copy()
# O3_bronx = O3_seasonal_borough[O3_seasonal_borough['GeoPlaceName'] == 'Bronx'].copy()

# PM25_brooklyn = PM25_seasonal_borough[PM25_seasonal_borough['GeoPlaceName'] == 'Brooklyn'].copy()
# NO2_brooklyn = NO2_seasonal_borough[NO2_seasonal_borough['GeoPlaceName'] == 'Brooklyn'].copy()
# O3_brooklyn = O3_seasonal_borough[O3_seasonal_borough['GeoPlaceName'] == 'Brooklyn'].copy()

# PM25_manhattan = PM25_seasonal_borough[PM25_seasonal_borough['GeoPlaceName'] == 'Manhattan'].copy()
# NO2_manhattan = NO2_seasonal_borough[NO2_seasonal_borough['GeoPlaceName'] == 'Manhattan'].copy()
# O3_manhattan = O3_seasonal_borough[O3_seasonal_borough['GeoPlaceName'] == 'Manhattan'].copy()

# PM25_queens = PM25_seasonal_borough[PM25_seasonal_borough['GeoPlaceName'] == 'Queens'].copy()
# NO2_queens = NO2_seasonal_borough[NO2_seasonal_borough['GeoPlaceName'] == 'Queens'].copy()
# O3_queens = O3_seasonal_borough[O3_seasonal_borough['GeoPlaceName'] == 'Queens'].copy()

# PM25_staten_island = PM25_seasonal_borough[PM25_seasonal_borough['GeoPlaceName'] == 'Staten Island'].copy()
# NO2_staten_island = NO2_seasonal_borough[NO2_seasonal_borough['GeoPlaceName'] == 'Staten Island'].copy()
# O3_staten_island = O3_seasonal_borough[O3_seasonal_borough['GeoPlaceName'] == 'Staten Island'].copy()

Unnamed: 0,IndicatorID,Name,Measure,MeasureInfo,Season,Year,GeoPlaceName,DataValue
0,365,Fine particles (PM 2.5),Mean,mcg/m3,Summer,2009,Bronx,10.730000
75,365,Fine particles (PM 2.5),Mean,mcg/m3,Winter,2009,Bronx,14.080000
150,375,Nitrogen dioxide (NO2),Mean,ppb,Summer,2009,Bronx,20.830000
225,375,Nitrogen dioxide (NO2),Mean,ppb,Winter,2009,Bronx,29.290000
300,386,Ozone (O3),Mean,ppb,Summer,2009,Bronx,25.300000
...,...,...,...,...,...,...,...,...
74,365,Fine particles (PM 2.5),Mean,mcg/m3,Summer,2023,Staten Island,8.378828
149,365,Fine particles (PM 2.5),Mean,mcg/m3,Winter,2023,Staten Island,5.524704
224,375,Nitrogen dioxide (NO2),Mean,ppb,Summer,2023,Staten Island,9.286177
299,375,Nitrogen dioxide (NO2),Mean,ppb,Winter,2023,Staten Island,17.322713


In [15]:
def auto_regressive_features(data: pd.DataFrame, lags: int):
    df = data.copy()
    for lag in range(1, lags + 1):
        df[f'lag_{lag}'] = df['DataValue'].shift(lag)
    df.dropna(inplace=True)
    return df


def linear_regression(data: pd.DataFrame, split: int, features: list):
    X_train = data[data['Year'] < split][features]
    y_train = data[data['Year'] < split]['DataValue']
    X_test = data[data['Year'] >= split][features]
    y_test = data[data['Year'] >= split]['DataValue']
    
    scaler = StandardScaler()
    
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    coeficientes = pd.Series(model.coef_, index=features).sort_values(ascending=False)

    print(f"------------------------------------------------------------------")
    print(f"## Linear Regression {features})")
    print(f"------------------------------------------------------------------")
    print(f"Período Treino: {data['Year'].min()} - {split-1} | Teste: {split} - {data['Year'].max()}")
    print(f"R-quadrado (R²): {r2:.3f}")
    print(f"RMSE: {rmse:.2f}")
    print("\n### Coeficientes (Betas Padronizados):")
    print(coeficientes)
    print("------------------------------------------------------------------")

def random_forest_regression(data: pd.DataFrame, split: int, features: list, target_col: str, pollutant_name: str):
    X_train = data[data['Year'] < split][features]
    y_train = data[data['Year'] < split][target_col]
    X_test = data[data['Year'] >= split][features]
    y_test = data[data['Year'] >= split][target_col]
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {
        'n_estimators': [50, 100, 200], 
        'max_depth': [3, 5, 7],         
    }
    
    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42), 
        param_grid, 
        cv=2, 
        scoring='r2',
    )

    grid_search.fit(X_train_scaled, y_train)
    
    model = grid_search.best_estimator_
    
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

    print(f"------------------------------------------------------------------")
    print(f"## RF {pollutant_name})")
    print(f"------------------------------------------------------------------")
    print(f"Período Treino: {data['Year'].min()} - {split-1} | Teste: {split} - {data['Year'].max()}")
    print(f"Melhores Hiperparâmetros: {grid_search.best_params_}")
    print(f"R-quadrado (R²): {r2:.3f}")
    print(f"RMSE: {rmse:.2f}")
    print("\n### Importância das Features:")
    print(importance)
    print("------------------------------------------------------------------")

In [16]:
PM25_annual_citywide = auto_regressive_features(PM25_annual_citywide, lags=1)
NO2_annual_citywide = auto_regressive_features(NO2_annual_citywide, lags=1)
O3_annual_citywide = auto_regressive_features(O3_annual_citywide, lags=1)

linear_regression(PM25_annual_citywide, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_annual_citywide, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_annual_citywide, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2010 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -6.953
RMSE: 1.04

### Coeficientes (Betas Padronizados):
lag_1   -0.639459
Year    -1.795130
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2010 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -0.333
RMSE: 0.60

### Coeficientes (Betas Padronizados):
lag_1   -0.699286
Year    -2.065656
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
-------------------------------------------------------------

In [17]:
PM25_bronx = auto_regressive_features(PM25_bronx, lags=1)
NO2_bronx = auto_regressive_features(NO2_bronx, lags=1)
O3_bronx = auto_regressive_features(O3_bronx, lags=1)

linear_regression(PM25_bronx, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_bronx, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_bronx, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -23.327
RMSE: 1.63

### Coeficientes (Betas Padronizados):
lag_1   -0.754183
Year    -2.023109
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): 0.783
RMSE: 0.34

### Coeficientes (Betas Padronizados):
lag_1   -0.495012
Year    -1.692829
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
-------------------------------------------------------------

In [18]:
PM25_brooklyn = auto_regressive_features(PM25_brooklyn, lags=1)
NO2_brooklyn = auto_regressive_features(NO2_brooklyn, lags=1)
O3_brooklyn = auto_regressive_features(O3_brooklyn, lags=1)

linear_regression(PM25_brooklyn, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_brooklyn, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_brooklyn, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -1.756
RMSE: 0.68

### Coeficientes (Betas Padronizados):
lag_1   -0.351511
Year    -1.126682
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2012 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): 0.098
RMSE: 0.75

### Coeficientes (Betas Padronizados):
lag_1   -0.57714
Year    -1.41952
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
----------------------------------------------------------------

In [19]:
PM25_manhattan = auto_regressive_features(PM25_manhattan, lags=1)
NO2_manhattan = auto_regressive_features(NO2_manhattan, lags=1)
O3_manhattan = auto_regressive_features(O3_manhattan, lags=1)

linear_regression(PM25_manhattan, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_manhattan, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_manhattan, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -6.676
RMSE: 0.88

### Coeficientes (Betas Padronizados):
lag_1   -0.419058
Year    -1.727199
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): 0.695
RMSE: 0.71

### Coeficientes (Betas Padronizados):
lag_1   -0.509461
Year    -2.841297
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
--------------------------------------------------------------

In [20]:
PM25_queens = auto_regressive_features(PM25_queens, lags=1)
NO2_queens = auto_regressive_features(NO2_queens, lags=1)
O3_queens = auto_regressive_features(O3_queens, lags=1)

linear_regression(PM25_queens, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_queens, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_queens, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -11.835
RMSE: 1.28

### Coeficientes (Betas Padronizados):
lag_1   -0.601576
Year    -1.652030
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): 0.411
RMSE: 0.33

### Coeficientes (Betas Padronizados):
lag_1   -0.515014
Year    -1.700798
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
-------------------------------------------------------------

In [21]:
PM25_staten_island = auto_regressive_features(PM25_staten_island, lags=1)
NO2_staten_island = auto_regressive_features(NO2_staten_island, lags=1)
O3_staten_island = auto_regressive_features(O3_staten_island, lags=1)

linear_regression(PM25_staten_island, split=split_year, features=['lag_1', 'Year'])
linear_regression(NO2_staten_island, split=split_year, features=['lag_1', 'Year'])
linear_regression(O3_staten_island, split=split_year, features=['lag_1', 'Year'])

------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -7.895
RMSE: 1.40

### Coeficientes (Betas Padronizados):
lag_1   -0.547213
Year    -1.703977
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
------------------------------------------------------------------
Período Treino: 2011 - 2019 | Teste: 2020 - 2023
R-quadrado (R²): -4.302
RMSE: 1.06

### Coeficientes (Betas Padronizados):
lag_1   -0.543811
Year    -1.361905
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## Linear Regression ['lag_1', 'Year'])
-------------------------------------------------------------

In [22]:
PM25_brooklyn = auto_regressive_features(PM25_brooklyn, lags=1)
NO2_brooklyn = auto_regressive_features(NO2_brooklyn, lags=1)
O3_brooklyn = auto_regressive_features(O3_brooklyn, lags=1)

random_forest_regression(PM25_brooklyn, split=split_year, features=['lag_1', 'Year'], target_col='DataValue', pollutant_name='PM2.5 - Brooklyn')
random_forest_regression(NO2_brooklyn, split=split_year, features=['lag_1', 'Year'], target_col='DataValue', pollutant_name='NO2 - Brooklyn')
random_forest_regression(O3_brooklyn, split=split_year, features=['lag_1', 'Year'], target_col='DataValue', pollutant_name='O3 - Brooklyn')

------------------------------------------------------------------
## RF PM2.5 - Brooklyn)
------------------------------------------------------------------
Período Treino: 2013 - 2019 | Teste: 2020 - 2023
Melhores Hiperparâmetros: {'max_depth': 3, 'n_estimators': 100}
R-quadrado (R²): -3.602
RMSE: 0.88

### Importância das Features:
Year     0.634432
lag_1    0.365568
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## RF NO2 - Brooklyn)
------------------------------------------------------------------
Período Treino: 2013 - 2019 | Teste: 2020 - 2023
Melhores Hiperparâmetros: {'max_depth': 3, 'n_estimators': 100}
R-quadrado (R²): -1.271
RMSE: 1.19

### Importância das Features:
Year     0.580835
lag_1    0.419165
dtype: float64
------------------------------------------------------------------
------------------------------------------------------------------
## RF O3 - Brooklyn)
----