In [2]:
!pip install darts





In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Silence library warnings so cell output stays readable
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the training and test tables from the shared data directory
train_processed_df = pd.read_csv(filepath_or_buffer='../data/train_with_weather.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/test.csv')

### 3. Modeling

#### Baseline Model
- [ ] **Baseline (Naive) Model**: Implement a simple baseline model, such as predicting the last known value or average to set a benchmark for evaluation.

#### Model Selection and Training
- [ ] **ARIMA / Exponential Smoothing**
- [ ] **LSTM/GRU**
- [ ] **Other Deep Learning**

#### Hyperparameter Tuning
- [ ] **Grid/Random Search**: Perform hyperparameter tuning using time-series cross-validation (e.g. scikit-learn's `TimeSeriesSplit`).

#### Model Evaluation
- [ ] **Define Evaluation Metrics**: MAE
- [ ] **Evaluate on Validation Set**: Assess each model’s performance on the validation set and compare it with the baseline.
- [ ] **Residual Analysis**: Plot and analyze residuals to check for patterns or biases.

#### Forecasting
- [ ] **Make Predictions**: Generate predictions for the future time points provided in `test.csv`.

In [13]:
from darts import TimeSeries
from darts.models import ARIMA, ExponentialSmoothing, Prophet, NBEATSModel, Theta, XGBModel, RNNModel, LightGBMModel, CatBoostModel
from darts.metrics import mae
from sklearn.model_selection import TimeSeriesSplit

# Convert train and test data into Darts TimeSeries format
pollutants = ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']

# One univariate series per pollutant, keyed by its column name.
train_series = {
    pollutant: TimeSeries.from_dataframe(train_processed_df, time_col='id', value_cols=pollutant)
    for pollutant in pollutants
}

# Default series used by the single-model cell further down; reuse the NO2
# series built above instead of converting the DataFrame a second time.
series = train_series['valeur_NO2']
test_series = TimeSeries.from_dataframe(test_df, time_col='id')

In [None]:
# Function to train and evaluate using Darts with multiple models
def _build_candidate_models():
    """Return a fresh, untrained instance of every candidate model, keyed by name."""
    return {
        "ARIMA": ARIMA(p=1, d=1, q=1),
        "ExponentialSmoothing": ExponentialSmoothing(),
        "Prophet": Prophet(),
        "NBEATS": NBEATSModel(input_chunk_length=30, output_chunk_length=10, n_epochs=10, random_state=42),
        # NOTE(review): RNNModel's `training_length` is left at its default here;
        # confirm against the darts docs that it is compatible with
        # input_chunk_length=30 for the installed version.
        "LSTM": RNNModel(model='LSTM', input_chunk_length=30, n_epochs=10, random_state=42),
        "Theta": Theta()
    }


def train_and_evaluate_darts(pollutant, n_splits=5):
    """Cross-validate every candidate model on one pollutant column.

    The series is built from the module-level ``train_processed_df`` (time
    column ``'id'``) and cut into expanding-window folds with
    ``TimeSeriesSplit``. In each fold every model is fitted on the training
    slice, forecasts as many steps as the validation slice contains, and is
    scored with MAE.

    Parameters
    ----------
    pollutant : str
        Name of the value column in ``train_processed_df`` to model.
    n_splits : int, default 5
        Number of folds handed to ``TimeSeriesSplit``.

    Returns
    -------
    dict
        Maps model name to the list of per-fold MAE scores, in fold order.
    """
    series = TimeSeries.from_dataframe(train_processed_df, time_col='id', value_cols=pollutant)

    maes = {}

    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Split a plain index array rather than the TimeSeries itself: only the
    # fold sizes are needed, and this does not depend on how scikit-learn
    # introspects a darts object.
    for train_index, test_index in tscv.split(np.arange(len(series))):
        # Training indices always start at 0 and the validation indices follow
        # directly, so the fold is fully described by the two lengths.
        split_point = len(train_index)
        train = series[:split_point]
        test = series[split_point: split_point + len(test_index)]

        # Build new model objects for every fold so no fitted state (for
        # example neural-network weights) is carried over from the previous
        # fold into this one.
        for model_name, model in _build_candidate_models().items():

            # Fit the model
            model.fit(train)

            # Make predictions
            forecast = model.predict(len(test))

            # Evaluate the model
            score = mae(test, forecast)
            maes.setdefault(model_name, []).append(score)

    return maes


# Cross-validate the candidate models on the NO2 column only
cv_mae = train_and_evaluate_darts('valeur_NO2')

13:58:27 - cmdstanpy - INFO - Chain [1] start processing
13:58:28 - cmdstanpy - INFO - Chain [1] done processing
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | stacks        | ModuleList       | 6.2 M 
---------------------------------------------------
6.2 M     Trainable params
1.5 K     Non-trainable params
6.2 M     Total params
24.975    Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | rnn           | LSTM             | 2.8 K 
4 | V             | Linear           | 26    
---------------------------------------------------
2.8 K     Trainable params
0         Non-trainable params
2.8 K     Total params
0.011     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Execute the cross-validation
# Keep the scores of every pollutant: `cv_mae` on its own is overwritten on
# each iteration and would only hold the last pollutant after the loop.
cv_mae_by_pollutant = {}
for pollutant in pollutants:
    print(f"Training model for {pollutant}...")
    cv_mae = train_and_evaluate_darts(pollutant)
    cv_mae_by_pollutant[pollutant] = cv_mae
    print(f"Cross-validated MAE for {pollutant}: {cv_mae}")

In [16]:
# Train final model on all training data for test prediction
# Take the NO2 series from `train_series` explicitly so this cell does not
# depend on whatever the mutable global `series` currently holds.
best_model = ExponentialSmoothing()
best_model.fit(train_series['valeur_NO2'])
predictions = best_model.predict(len(test_df))
predictions

In [None]:
# Train final models on full training data and make predictions on test set
for pollutant in pollutants:
    best_model = ExponentialSmoothing()
    series = TimeSeries.from_dataframe(train_processed_df, time_col='id', value_cols=pollutant)
    best_model.fit(series)
    # Forecast one step per row of the test frame.
    predictions = best_model.predict(len(test_df))
    predictions_df = predictions.pd_dataframe()
    # Read the numbers from the pandas frame. On a darts TimeSeries `.values`
    # is a method, so `predictions[pollutant].values` would store the bound
    # method object in the column instead of the forecast array.
    test_df[pollutant] = predictions_df[pollutant].values


In [None]:
# Display the test frame; once the forecasting loop above has run it carries
# one forecast column per pollutant.
test_df

In [None]:
# Save the test frame, including the forecast columns, to CSV.
# index=False keeps the auto-generated integer row index (created when the
# test CSV was read without an index column) out of the file, so no extra
# unnamed column is written.
test_df.to_csv('../data/predictions.csv', index=False)