In [12]:
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from meteostat import Point, Daily
import holidays
from vacances_scolaires_france import SchoolHolidayDates
import datetime

# Set plot style
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load Data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [11]:
# Preview the data
print("Train Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)
train_df.head()

Train Data Shape: (40991, 6)
Test Data Shape: (504, 1)


Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
0,2020-01-01 00,42.9,0.718,15.7,73.1,64.4
1,2020-01-01 01,33.6,0.587,10.1,74.8,66.0
2,2020-01-01 02,29.3,,5.1,51.0,44.9
3,2020-01-01 03,30.5,0.246,7.2,27.7,25.1
4,2020-01-01 04,29.3,0.204,8.3,15.3,13.6


In [27]:
train_df['id'] = pd.to_datetime(train_df['id'])  # Ensure 'id' column is datetime

In [37]:
# Initialize SchoolHolidayDates
d = SchoolHolidayDates()

# Collect holiday dates for Zone C from 2020 to 2024
holiday_list = []
for year in range(2020, 2025):  # Loop through each year
    holidays_for_year = d.holidays_for_year_and_zone(year, 'C')
    for date in holidays_for_year.keys():
        holiday_list.append({'holiday_date': pd.to_datetime(date), 'is_holiday_zone_c': True})

# Create a DataFrame from the collected holiday dates
holiday_dates_zone_c = pd.DataFrame(holiday_list)

In [38]:
# Merge holiday information with train data
merged_df = pd.merge(train_df, holiday_dates_zone_c, left_on='id', right_on='holiday_date', how='left')

# Fill NaN in 'is_holiday_zone_c' with False, assuming non-holidays were not matched
merged_df['is_holiday_zone_c'] = merged_df['is_holiday_zone_c'].fillna(False)

# Drop unnecessary columns
merged_df = merged_df.drop(columns=['air_quality_time', 'holiday_date','level_0','index'])

# Save to CSV
merged_df.to_csv('../data/train_with_holidays.csv', index=False)

# Display the final merged DataFrame
merged_df

Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday_zone_c
0,2020-01-01 00:00:00,42.9,0.718,15.7,73.1,64.4,True
1,2020-01-01 01:00:00,33.6,0.587,10.1,74.8,66.0,False
2,2020-01-01 02:00:00,29.3,,5.1,51.0,44.9,False
3,2020-01-01 03:00:00,30.5,0.246,7.2,27.7,25.1,False
4,2020-01-01 04:00:00,29.3,0.204,8.3,15.3,13.6,False
...,...,...,...,...,...,...,...
40986,2024-09-03 18:00:00,,0.222,55.1,12.0,5.3,False
40987,2024-09-03 19:00:00,,0.245,48.2,13.4,7.0,False
40988,2024-09-03 20:00:00,,0.234,44.5,12.4,7.1,False
40989,2024-09-03 21:00:00,,0.225,25.9,10.6,5.4,False


### 3. Modeling

#### Baseline Model
- [ ] **Baseline (Naive) Model**: Implement a simple baseline model, such as predicting the last known value or average to set a benchmark for evaluation.

#### Model Selection and Training
- [ ] **ARIMA / Exponential Smoothing**
- [ ] **LSTM/GRU**
- [ ] **Other Deep Learning**

#### Hyperparameter Tuning
- [ ] **Grid/Random Search**: Perform hyperparameter tuning using time-series cross-validation ().

#### Model Evaluation
- [ ] **Define Evaluation Metrics**: MAE
- [ ] **Evaluate on Validation Set**: Assess each model’s performance on the validation set and compare it with the baseline.
- [ ] **Residual Analysis**: Plot and analyze residuals to check for patterns or biases.

#### Forecasting
- [ ] **Make Predictions**: Generate predictions for the future time points provided in `test.csv`.