In [75]:
import pandas as pd

# Load the training table from train.csv in the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")

In [76]:
# Print the schema summary: column dtypes, non-null counts and memory usage.
train_df.info()
# Bare last expression: rendered as the cell's output (a truncated preview of the frame).
train_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40991 entries, 0 to 40990
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           40991 non-null  object 
 1   valeur_NO2   37694 non-null  float64
 2   valeur_CO    28462 non-null  float64
 3   valeur_O3    40298 non-null  float64
 4   valeur_PM10  33824 non-null  float64
 5   valeur_PM25  39200 non-null  float64
dtypes: float64(5), object(1)
memory usage: 1.9+ MB


Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
0,2020-01-01 00,42.9,0.718,15.7,73.1,64.4
1,2020-01-01 01,33.6,0.587,10.1,74.8,66.0
2,2020-01-01 02,29.3,,5.1,51.0,44.9
3,2020-01-01 03,30.5,0.246,7.2,27.7,25.1
4,2020-01-01 04,29.3,0.204,8.3,15.3,13.6
...,...,...,...,...,...,...
40986,2024-09-03 18,,0.222,55.1,12.0,5.3
40987,2024-09-03 19,,0.245,48.2,13.4,7.0
40988,2024-09-03 20,,0.234,44.5,12.4,7.1
40989,2024-09-03 21,,0.225,25.9,10.6,5.4


In [77]:
# Load the test table from test.csv in the working directory.
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [78]:
# Print the schema summary: column dtypes, non-null counts and memory usage.
test_df.info()
# Bare last expression: rendered as the cell's output (a truncated preview of the frame).
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      504 non-null    object
dtypes: object(1)
memory usage: 4.1+ KB


Unnamed: 0,id
0,2024-09-03 23
1,2024-09-04 00
2,2024-09-04 01
3,2024-09-04 02
4,2024-09-04 03
...,...
499,2024-09-24 18
500,2024-09-24 19
501,2024-09-24 20
502,2024-09-24 21


In [79]:
# Hourly weather variable names, grouped by theme.
# NOTE(review): the download cell repeats these same lists inside `param_groups`,
# and no cell visible in this notebook reads these three variables - confirm
# whether they are still needed.

# Surface temperature, humidity, pressure, wind and precipitation.
basic_params = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "pressure_msl",
    "wind_speed_10m",
    "wind_direction_10m",
    "precipitation",
]

# Cloud cover by layer and radiation components.
cloud_radiation_params = [
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "shortwave_radiation",
    "direct_radiation",
    "diffuse_radiation",
    "global_tilted_irradiance",
]

# Everything else: winds at other heights, gusts, evapotranspiration,
# precipitation types, weather code and visibility.
other_params = [
    "wind_speed_80m",
    "wind_speed_120m",
    "wind_speed_180m",
    "wind_direction_80m",
    "wind_direction_120m",
    "wind_direction_180m",
    "wind_gusts_10m",
    "vapour_pressure_deficit",
    "cape",
    "evapotranspiration",
    "et0_fao_evapotranspiration",
    "snowfall",
    "rain",
    "showers",
    "weather_code",
    "visibility",
]

In [80]:
from functools import reduce

import requests
import pandas as pd

# --- Request configuration ---
LAT = 48.8566   # Paris latitude
LON = 2.3522    # Paris longitude
START_DATE = "2020-01-01"
END_DATE = "2024-09-24"
# Sent to the API as the "timezone" query parameter.
# NOTE(review): confirm the `id` hours in train/test use the same clock as the
# returned "time" values before joining on them.
TIMEZONE = "Europe/Paris"

ARCHIVE_URL = "https://archive-api.open-meteo.com/v1/archive"
REQUEST_TIMEOUT_S = 60  # without a timeout, requests.get can block the cell forever

# Parameter groups: one HTTP request is issued per group.
param_groups = {
    "basic": [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "apparent_temperature", "pressure_msl", "wind_speed_10m",
        "wind_direction_10m", "precipitation",
    ],
    "cloud_radiation": [
        "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
        "shortwave_radiation", "direct_radiation", "diffuse_radiation",
        "global_tilted_irradiance",
    ],
    "other": [
        "wind_speed_80m", "wind_speed_120m", "wind_speed_180m",
        "wind_direction_80m", "wind_direction_120m", "wind_direction_180m",
        "wind_gusts_10m", "vapour_pressure_deficit", "cape",
        "evapotranspiration", "et0_fao_evapotranspiration", "snowfall",
        "rain", "showers", "weather_code", "visibility",
    ],
}


def _fetch_hourly(group_name, params):
    """Download one parameter group and return it as a time-indexed DataFrame.

    Args:
        group_name: Label used only in progress/error messages.
        params: List of hourly variable names to request.

    Returns:
        A DataFrame indexed by the parsed "time" column, or None when the
        request failed, the API reported an error, or no hourly data came back.
        Failures are printed rather than raised so the remaining groups are
        still attempted.
    """
    try:
        response = requests.get(
            ARCHIVE_URL,
            params={
                "latitude": LAT,
                "longitude": LON,
                "start_date": START_DATE,
                "end_date": END_DATE,
                "hourly": ",".join(params),
                "timezone": TIMEZONE,
            },
            timeout=REQUEST_TIMEOUT_S,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON.
        print(f"Request for {group_name} failed ({exc}), skipping...")
        return None

    if not isinstance(data, dict):
        print(f"Unexpected response for {group_name}, skipping...")
        return None

    if not response.ok or data.get("error"):
        # Surface the server's explanation instead of reporting "no data".
        reason = data.get("reason", f"HTTP {response.status_code}")
        print(f"API error for {group_name}: {reason}, skipping...")
        return None

    if "hourly" not in data or not data["hourly"]:
        print(f"No data returned for {group_name}, skipping...")
        return None

    df = pd.DataFrame(data["hourly"])
    df["time"] = pd.to_datetime(df["time"])
    return df.set_index("time")


dfs = []
failed_groups = []

for group_name, params in param_groups.items():
    print(f"Downloading {group_name} parameters...")
    df = _fetch_hourly(group_name, params)
    if df is None:
        failed_groups.append(group_name)
        continue
    dfs.append(df)

if not dfs:
    # Fail loudly: otherwise the bare `weather` below raises NameError on a
    # fresh kernel, or silently shows a stale frame from an earlier run.
    raise RuntimeError("No data downloaded.")

# Merge all groups on time
weather = reduce(lambda left, right: left.join(right, how="outer"), dfs)
weather.to_csv("weather.csv")
print("Done! Saved as weather.csv")
if failed_groups:
    print(f"Warning: weather.csv is missing these groups: {', '.join(failed_groups)}")

weather


Downloading basic parameters...
Downloading cloud_radiation parameters...
Downloading other parameters...
Done! Saved as weather.csv


Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,wind_speed_10m,wind_direction_10m,precipitation,cloud_cover,cloud_cover_low,...,wind_gusts_10m,vapour_pressure_deficit,cape,evapotranspiration,et0_fao_evapotranspiration,snowfall,rain,showers,weather_code,visibility
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:00:00,0.9,100,0.9,-1.6,1032.4,3.6,135,0.0,100,100,...,12.2,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 01:00:00,-0.1,99,-0.2,-2.7,1032.8,2.5,82,0.0,100,100,...,5.8,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 02:00:00,2.6,98,2.4,0.3,1032.4,3.7,119,0.0,100,97,...,7.6,0.01,,,0.00,0.0,0.0,0.0,3,
2020-01-01 03:00:00,2.1,100,2.1,-0.2,1032.1,3.8,131,0.0,95,95,...,7.2,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 04:00:00,1.9,100,1.9,-0.6,1032.0,4.9,163,0.0,94,94,...,8.6,0.00,,,0.00,0.0,0.0,0.0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-24 19:00:00,16.7,79,12.9,15.6,1007.3,13.9,253,0.2,100,100,...,28.4,0.41,,,0.06,0.0,0.2,0.0,51,
2024-09-24 20:00:00,16.1,82,13.0,15.4,1007.7,11.3,253,0.2,97,65,...,24.1,0.33,,,0.04,0.0,0.2,0.0,51,
2024-09-24 21:00:00,15.4,86,13.0,15.1,1008.3,8.9,243,0.1,30,5,...,20.9,0.25,,,0.00,0.0,0.1,0.0,51,
2024-09-24 22:00:00,14.5,90,12.8,14.2,1008.6,8.1,225,0.0,45,21,...,15.1,0.17,,,0.00,0.0,0.0,0.0,1,


In [81]:
!pip install holidays



In [82]:
# Turn the "YYYY-MM-DD HH" strings in 'id' into timestamps and index the frame by them.
# The 'id' column itself is kept.
train_stamps = pd.to_datetime(train_df['id'], format="%Y-%m-%d %H")
train_df = train_df.assign(datetime=train_stamps).set_index('datetime')

In [83]:
import pandas as pd
import holidays

# The .year / .date / .weekday accessors below need a DatetimeIndex
train_df.index = pd.to_datetime(train_df.index)

# France holiday calendar covering every year present in the index
first_year = train_df.index.year.min()
last_year = train_df.index.year.max()
fr_holidays = holidays.France(years=range(first_year, last_year + 1))

# Only the holiday dates (the mapping's keys) are needed for membership tests
holiday_dates = set(fr_holidays.keys())

# is_holiday: 1 when the row's calendar day is one of the holiday dates, else 0
row_days = pd.Index(train_df.index.date)
train_df['is_holiday'] = row_days.isin(holiday_dates).astype("int64")

# is_weekend: 1 when weekday is 5 or 6, else 0 (independent of is_holiday)
on_weekend = train_df.index.weekday >= 5
train_df['is_weekend'] = on_weekend.astype(int)

# Check result
train_df.tail(50)

Unnamed: 0_level_0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday,is_weekend
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-09-01 21:00:00,2024-09-01 21,,0.224,52.5,11.2,6.7,0,1
2024-09-01 22:00:00,2024-09-01 22,,0.229,38.8,9.4,5.7,0,1
2024-09-01 23:00:00,2024-09-01 23,,0.205,45.0,9.5,5.9,0,1
2024-09-02 00:00:00,2024-09-02 00,,0.148,58.9,16.4,7.4,0,0
2024-09-02 01:00:00,2024-09-02 01,,0.139,62.2,13.4,6.5,0,0
2024-09-02 02:00:00,2024-09-02 02,,0.14,62.5,9.7,5.8,0,0
2024-09-02 03:00:00,2024-09-02 03,,0.141,64.2,10.7,6.4,0,0
2024-09-02 04:00:00,2024-09-02 04,,0.148,61.4,10.2,6.9,0,0
2024-09-02 05:00:00,2024-09-02 05,,0.168,57.4,11.7,7.9,0,0
2024-09-02 06:00:00,2024-09-02 06,,0.211,34.1,21.9,11.2,0,0


In [84]:
# Turn the "YYYY-MM-DD HH" strings in 'id' into timestamps and index the frame by them.
# The 'id' column itself is kept.
test_stamps = pd.to_datetime(test_df['id'], format="%Y-%m-%d %H")
test_df = test_df.assign(datetime=test_stamps).set_index('datetime')

In [85]:
# The .year / .date / .weekday accessors below need a DatetimeIndex
test_df.index = pd.to_datetime(test_df.index)

# France holiday calendar covering every year present in the index
first_year = test_df.index.year.min()
last_year = test_df.index.year.max()
fr_holidays = holidays.France(years=range(first_year, last_year + 1))

# Only the holiday dates (the mapping's keys) are needed for membership tests
holiday_dates = set(fr_holidays.keys())

# is_holiday: 1 when the row's calendar day is one of the holiday dates, else 0
row_days = pd.Index(test_df.index.date)
test_df['is_holiday'] = row_days.isin(holiday_dates).astype("int64")

# is_weekend: 1 when weekday is 5 or 6, else 0 (independent of is_holiday)
on_weekend = test_df.index.weekday >= 5
test_df['is_weekend'] = on_weekend.astype(int)

# Check result
test_df.tail(50)

Unnamed: 0_level_0,id,is_holiday,is_weekend
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-09-22 21:00:00,2024-09-22 21,0,1
2024-09-22 22:00:00,2024-09-22 22,0,1
2024-09-22 23:00:00,2024-09-22 23,0,1
2024-09-23 00:00:00,2024-09-23 00,0,0
2024-09-23 01:00:00,2024-09-23 01,0,0
2024-09-23 02:00:00,2024-09-23 02,0,0
2024-09-23 03:00:00,2024-09-23 03,0,0
2024-09-23 04:00:00,2024-09-23 04,0,0
2024-09-23 05:00:00,2024-09-23 05,0,0
2024-09-23 06:00:00,2024-09-23 06,0,0


In [86]:
import pandas as pd

# --- Step 1: align both frames on an index named 'datetime' ---
# train_df: rebuild the timestamp index from the "YYYY-MM-DD HH" strings in 'id'
train_stamps = pd.to_datetime(train_df['id'], format="%Y-%m-%d %H")
train_df = train_df.assign(datetime=train_stamps).set_index('datetime')

# weather: coerce the index to datetime and give it the same name as train_df's index
weather.index = pd.to_datetime(weather.index).rename('datetime')

# --- Step 2: left join keeps every train_df row and attaches the weather columns ---
train_merged_df = train_df.join(weather, how='left')

# --- Step 3: show the merged frame ---
train_merged_df

Unnamed: 0_level_0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday,is_weekend,temperature_2m,relative_humidity_2m,...,wind_gusts_10m,vapour_pressure_deficit,cape,evapotranspiration,et0_fao_evapotranspiration,snowfall,rain,showers,weather_code,visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:00:00,2020-01-01 00,42.9,0.718,15.7,73.1,64.4,1,0,0.9,100,...,12.2,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 01:00:00,2020-01-01 01,33.6,0.587,10.1,74.8,66.0,1,0,-0.1,99,...,5.8,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 02:00:00,2020-01-01 02,29.3,,5.1,51.0,44.9,1,0,2.6,98,...,7.6,0.01,,,0.00,0.0,0.0,0.0,3,
2020-01-01 03:00:00,2020-01-01 03,30.5,0.246,7.2,27.7,25.1,1,0,2.1,100,...,7.2,0.00,,,0.00,0.0,0.0,0.0,3,
2020-01-01 04:00:00,2020-01-01 04,29.3,0.204,8.3,15.3,13.6,1,0,1.9,100,...,8.6,0.00,,,0.00,0.0,0.0,0.0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-03 18:00:00,2024-09-03 18,,0.222,55.1,12.0,5.3,0,0,21.4,56,...,25.2,1.13,,,0.17,0.0,0.0,0.0,3,
2024-09-03 19:00:00,2024-09-03 19,,0.245,48.2,13.4,7.0,0,0,21.1,57,...,18.4,1.07,,,0.09,0.0,0.0,0.0,3,
2024-09-03 20:00:00,2024-09-03 20,,0.234,44.5,12.4,7.1,0,0,19.8,64,...,9.0,0.83,,,0.06,0.0,0.0,0.0,3,
2024-09-03 21:00:00,2024-09-03 21,,0.225,25.9,10.6,5.4,0,0,18.8,67,...,13.0,0.70,,,0.05,0.0,0.0,0.0,3,


In [87]:
import pandas as pd

# --- Step 1: align both frames on an index named 'datetime' ---
# test_df: rebuild the timestamp index from the "YYYY-MM-DD HH" strings in 'id'
test_stamps = pd.to_datetime(test_df['id'], format="%Y-%m-%d %H")
test_df = test_df.assign(datetime=test_stamps).set_index('datetime')

# weather: coerce the index to datetime and give it the same name as test_df's index
weather.index = pd.to_datetime(weather.index).rename('datetime')

# --- Step 2: left join keeps every test_df row and attaches the weather columns ---
test_merged_df = test_df.join(weather, how='left')

# --- Step 3: show the merged frame ---
test_merged_df

Unnamed: 0_level_0,id,is_holiday,is_weekend,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,wind_speed_10m,wind_direction_10m,...,wind_gusts_10m,vapour_pressure_deficit,cape,evapotranspiration,et0_fao_evapotranspiration,snowfall,rain,showers,weather_code,visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-03 23:00:00,2024-09-03 23,0,0,15.9,79,12.2,15.7,1016.8,6.0,245,...,10.1,0.38,,,0.00,0.0,0.0,0.0,3,
2024-09-04 00:00:00,2024-09-04 00,0,0,16.1,79,12.4,15.8,1016.8,6.8,288,...,10.8,0.38,,,0.00,0.0,0.0,0.0,3,
2024-09-04 01:00:00,2024-09-04 01,0,0,15.9,86,13.5,15.7,1017.0,8.6,303,...,14.4,0.26,,,0.00,0.0,0.0,0.0,3,
2024-09-04 02:00:00,2024-09-04 02,0,0,15.4,91,14.0,15.5,1017.1,8.6,303,...,14.8,0.15,,,0.00,0.0,0.0,0.0,1,
2024-09-04 03:00:00,2024-09-04 03,0,0,17.4,91,16.0,18.1,1016.9,8.7,300,...,16.2,0.17,,,0.00,0.0,0.3,0.0,51,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-24 18:00:00,2024-09-24 18,0,0,17.2,76,13.0,15.8,1007.0,16.1,250,...,32.8,0.47,,,0.10,0.0,0.0,0.0,3,
2024-09-24 19:00:00,2024-09-24 19,0,0,16.7,79,12.9,15.6,1007.3,13.9,253,...,28.4,0.41,,,0.06,0.0,0.2,0.0,51,
2024-09-24 20:00:00,2024-09-24 20,0,0,16.1,82,13.0,15.4,1007.7,11.3,253,...,24.1,0.33,,,0.04,0.0,0.2,0.0,51,
2024-09-24 21:00:00,2024-09-24 21,0,0,15.4,86,13.0,15.1,1008.3,8.9,243,...,20.9,0.25,,,0.00,0.0,0.1,0.0,51,


In [74]:
# Persist the merged training frame (index included) next to the notebook.
# NOTE(review): this cell's execution count (74) is lower than the merge cell's (86);
# re-run it after the merge so the CSV reflects the current train_merged_df.
train_merged_df.to_csv("train_merged_df.csv")

In [88]:
# Persist the merged test frame (index included) next to the notebook.
test_merged_df.to_csv("test_merged_df.csv")