In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import seaborn as sns

# Notebook-wide display configuration: seaborn grid style for any plots.
sns.set(style="whitegrid")

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings statsmodels emits while fitting.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the train and test CSVs with identical options: the `id` column becomes
# the index and pandas is asked to parse that index as dates.
train, test = (
    pd.read_csv(csv_path, index_col='id', parse_dates=True)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

### 3. Modeling

#### Baseline Model
- [ ] **Baseline (Naive) Model**: Implement a simple baseline model, such as predicting the last known value or average to set a benchmark for evaluation.

#### Model Selection and Training
- [ ] **ARIMA / Exponential Smoothing**
- [ ] **LSTM/GRU**
- [ ] **Other Deep Learning**

#### Hyperparameter Tuning
- [ ] **Grid/Random Search**: Perform hyperparameter tuning using time-series cross-validation (`TimeSeriesSplit`).

#### Model Evaluation
- [ ] **Define Evaluation Metrics**: MAE
- [ ] **Evaluate on Validation Set**: Assess each model’s performance on the validation set and compare it with the baseline.
- [ ] **Residual Analysis**: Plot and analyze residuals to check for patterns or biases.

#### Forecasting
- [ ] **Make Predictions**: Generate predictions for the future time points provided in `test.csv`.

In [3]:
# Target columns: one model is fitted and one submission column is produced
# for each name listed here.
pollutants = [
    'valeur_NO2',
    'valeur_CO',
    'valeur_O3',
    'valeur_PM10',
    'valeur_PM25',
]

In [4]:
# Build the submission frame with a single `id` column holding the test
# timestamps rendered as 'YYYY-MM-DD HH' strings (default integer row index).
formatted_ids = test.index.strftime('%Y-%m-%d %H')
submission = pd.DataFrame({'id': formatted_ids})

In [5]:
# Create one float column per pollutant, filled with 0.0 as a placeholder
# until the per-pollutant predictions are written in.
for target_column in pollutants:
    submission[target_column] = 0.0

In [7]:
def train_and_evaluate(train_data, pollutant, n_splits=5, order=(1, 1, 1)):
    """Cross-validate an ARIMA model on one column and return its mean MAE.

    The rows of ``train_data`` are split with ``TimeSeriesSplit``; for every
    split an ARIMA model is fitted on the training rows and used to forecast
    as many steps as there are validation rows.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame containing the ``pollutant`` column, in time order.
    pollutant : str
        Name of the column to model.
    n_splits : int, default 5
        Number of splits passed to ``TimeSeriesSplit``.
    order : tuple, default (1, 1, 1)
        ARIMA ``(p, d, q)`` order passed to every fold's model.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    series = train_data[pollutant]
    fold_maes = []

    for train_idx, val_idx in tscv.split(train_data):
        train_fold = series.iloc[train_idx]
        val_fold = series.iloc[val_idx]

        model_fit = ARIMA(train_fold, order=order).fit()

        # Forecast by step count instead of by index label: the validation
        # rows are scored positionally against the next len(val_fold) steps,
        # so this does not depend on the index carrying a usable frequency.
        val_pred = model_fit.forecast(steps=len(val_fold))

        # Compare as plain arrays so index labels play no role in the score.
        fold_maes.append(
            mean_absolute_error(val_fold.to_numpy(), np.asarray(val_pred))
        )

    return np.mean(fold_maes)

In [8]:
# Train, evaluate, and predict for each pollutant

for pollutant in pollutants:
    print(f"Training model for {pollutant}...")
    # The cross-validated error is only reported; it does not influence the
    # final fit below.
    cv_mae = train_and_evaluate(train, pollutant)
    print(f"Cross-validated MAE for {pollutant}: {cv_mae}")

    # Train final model on all training data for test prediction
    model = ARIMA(train[pollutant], order=(1, 1, 1))
    model_fit = model.fit()
    # Predict over the timestamp range spanned by the test index.
    predictions = model_fit.predict(start=test.index[0], end=test.index[-1])

    # Positional assignment: `submission` rows were built from `test.index`,
    # so they are in the same order as the predictions.
    submission[pollutant] = predictions.values
    

Training model for valeur_NO2...
Cross-validated MAE for valeur_NO2: 12.525199946955812
Training model for valeur_CO...
Cross-validated MAE for valeur_CO: 0.05586427821321005
Training model for valeur_O3...
Cross-validated MAE for valeur_O3: 35.59010791092384
Training model for valeur_PM10...
Cross-validated MAE for valeur_PM10: 12.60774234744063
Training model for valeur_PM25...
Cross-validated MAE for valeur_PM25: 7.441696901953859


In [10]:
# Write the submission with a path relative to the notebook so it runs on any
# machine; ../data is the directory train.csv and test.csv are read from.
SUBMISSION_PATH = '../data/submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)


In [9]:
# Display the submission frame as this cell's output for a visual sanity check.
submission

Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
0,2024-09-03 23,17.6,0.184364,42.794022,9.184447,4.556043
1,2024-09-04 00,17.6,0.184626,44.767295,9.793872,4.563498
2,2024-09-04 01,17.6,0.184814,45.531681,10.336498,4.562233
3,2024-09-04 02,17.6,0.184950,45.827782,10.819648,4.562448
4,2024-09-04 03,17.6,0.185048,45.942483,11.249840,4.562411
...,...,...,...,...,...,...
499,2024-09-24 18,17.6,0.185301,46.015009,14.744424,4.562417
500,2024-09-24 19,17.6,0.185301,46.015009,14.744424,4.562417
501,2024-09-24 20,17.6,0.185301,46.015009,14.744424,4.562417
502,2024-09-24 21,17.6,0.185301,46.015009,14.744424,4.562417
