# Engineering weather-based features and age buckets

In [6]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import pandas as pd
import warnings
from src.config import (WEATHER_FILE, WEATHER_QUARTER_FILE,)
from src.config import AGE_CLEAN_FILE, MASTER_DF_FILE, AGE_BUCKET_FILE


# Suppress every Python warning for the rest of the session to keep cell output clean.
# NOTE(review): no category is given, so this also hides pandas diagnostics
# (e.g. SettingWithCopyWarning) — consider narrowing the filter.
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Weather features

In [4]:
# Read the daily weather observations; the first CSV column is used as the row index.
weather_daily_df = pd.read_csv(filepath_or_buffer=WEATHER_FILE, index_col=0)
weather_daily_df.head(n=5)

Unnamed: 0,region,date,temp_max_c,temp_min_c,sunshine_duration_s,windspeed_mean_kmh,precipitation_sum_mm
0,Arcos de Valdevez,2019-01-01,16.4,6.9,29242.46,4.5,0.0
1,Arcos de Valdevez,2019-01-02,14.6,4.5,29295.18,5.0,0.0
2,Arcos de Valdevez,2019-01-03,13.9,8.5,29352.68,2.8,0.0
3,Arcos de Valdevez,2019-01-04,14.5,7.2,29414.8,3.0,0.0
4,Arcos de Valdevez,2019-01-05,14.8,6.7,29481.41,4.4,0.0


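The data is daily, whereas all of the other data we work with is either yearly or quarterly. The first step is to parse the dates, keep only observations from 2019-10-01 onwards, add a quarter column, and rename the `region` column to `municipality` so that it matches the other datasets.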

In [None]:
# Parse the date strings so they can be compared and converted to quarters.
weather_daily_df["date"] = pd.to_datetime(weather_daily_df["date"])

# Keep only observations from 2019-10-01 onwards.
# .copy() makes the filtered frame an independent object, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
weather_daily_df = weather_daily_df.loc[weather_daily_df["date"] >= "2019-10-01"].copy()

# Period strings look like "2020Q1"; reorder them into the "1Q 2020" label format.
weather_daily_df["quarter_year"] = (
    weather_daily_df["date"]
    .dt.to_period("Q")
    .astype(str)
    .str.replace(r"(\d{4})Q(\d)", r"\2Q \1", regex=True)
)

# Use "municipality" as the name of the location column.
weather_daily_df = weather_daily_df.rename(columns={"region": "municipality"})

Convert the sunshine duration from seconds to hours, so it's more interpretable and human readable.

In [4]:
# Express the sunshine duration in hours (3600 seconds per hour) and
# discard the original seconds column.
weather_daily_df = weather_daily_df.assign(
    sunshine_duration_h=weather_daily_df["sunshine_duration_s"].div(3600)
).drop(columns=["sunshine_duration_s"])

Add helper columns that will be used to calculate the desired comfort metrics (number of rainy/sunny/etc. days per municipality per quarter).

In [5]:
# Helper columns
# Per-day boolean comfort flags plus the daily mean temperature, added in one pass.
weather_daily_df = weather_daily_df.assign(
    # any precipitation at all counts as a rainy day
    is_rainy=lambda day: day["precipitation_sum_mm"].gt(0),
    # at least 8 hours of sunshine
    is_sunny=lambda day: day["sunshine_duration_h"].ge(8),
    # mean wind speed above 30 km/h
    is_windy=lambda day: day["windspeed_mean_kmh"].gt(30.0),
    # exactly zero precipitation
    no_rain=lambda day: day["precipitation_sum_mm"].eq(0),
    # the whole daily temperature range stays within 17–28 °C
    is_temp_ok=lambda day: day["temp_min_c"].ge(17) & day["temp_max_c"].le(28),
    # midpoint between the daily maximum and minimum
    temp_mean_c=lambda day: day["temp_max_c"].add(day["temp_min_c"]).div(2),
)

Group by municipality and quarter, aggregate everything, and save the file.

In [None]:
# Collapse the daily rows into one row per municipality and quarter.
weather_quarter_df = (
    weather_daily_df
    .groupby(["municipality", "quarter_year"])
    .agg(
        # quarter-level totals and daily averages
        total_sunshine_h=("sunshine_duration_h", "sum"),
        mean_sunshine_h=("sunshine_duration_h", "mean"),
        windspeed_mean_kmh=("windspeed_mean_kmh", "mean"),
        total_precipitation_mm=("precipitation_sum_mm", "sum"),
        mean_precipitation_mm=("precipitation_sum_mm", "mean"),
        # features: summing a boolean flag counts the days on which it is set
        windy_days=("is_windy", "sum"),
        rainy_days=("is_rainy", "sum"),
        sunny_days=("is_sunny", "sum"),
        warm_days=("is_temp_ok", "sum"),
    )
    .reset_index()
)

# Preview the result for a single municipality.
weather_quarter_df.loc[weather_quarter_df["municipality"].eq("Aveiro")]

Unnamed: 0,municipality,quarter_year,total_sunshine_h,mean_sunshine_h,windspeed_mean_kmh,total_precipitation_mm,mean_precipitation_mm,windy_days,rainy_days,sunny_days,warm_days
814,Aveiro,1Q 2020,694.762311,7.634751,12.8,234.2,2.573626,0,46,60,0
815,Aveiro,1Q 2021,678.421267,7.538014,13.444444,342.1,3.801111,2,45,56,0
816,Aveiro,1Q 2022,720.760794,8.008453,11.788889,170.5,1.894444,0,28,67,0
817,Aveiro,1Q 2023,725.64195,8.062688,12.898889,273.0,3.033333,2,42,66,0
818,Aveiro,1Q 2024,649.753275,7.140146,14.617582,591.3,6.497802,4,54,48,0
819,Aveiro,1Q 2025,628.320958,6.981344,13.076667,674.8,7.497778,2,59,46,0
820,Aveiro,2Q 2020,1032.213533,11.343006,13.40989,258.4,2.83956,0,49,79,5
821,Aveiro,2Q 2021,1058.906956,11.63634,13.431868,204.6,2.248352,0,45,80,2
822,Aveiro,2Q 2022,1056.350025,11.608242,14.840659,204.8,2.250549,2,47,81,7
823,Aveiro,2Q 2023,1059.420886,11.641988,13.843956,193.0,2.120879,3,39,85,12


In [7]:
# Persist the quarterly weather features without writing the row index.
weather_quarter_df.to_csv(path_or_buf=WEATHER_QUARTER_FILE, index=False)

## Age buckets

In [7]:
# Load the cleaned age table (one row per municipality and year, one column per age range).
age_df = pd.read_csv(filepath_or_buffer=AGE_CLEAN_FILE)
age_df

Unnamed: 0,municipality,year,10 - 14,15 - 19,20 - 24,25 - 29,30 - 34,35 - 39,40 - 44,45 - 49,5 - 9,50 - 54,55 - 59,60 - 64,65 - 69,70 - 74,75 - 79,80 - 84,85 ou mais,Menos de 5
0,Abrantes,2019,1529.0,1667.0,1512.0,1286.0,1338.0,1792.0,2382.0,2420.0,1165.0,2567.0,2796.0,2726.0,2590.0,2355.0,1875.0,1833.0,2020.0,1078.0
1,Abrantes,2020,1453.0,1623.0,1609.0,1266.0,1345.0,1686.0,2215.0,2465.0,1135.0,2493.0,2744.0,2772.0,2666.0,2336.0,1943.0,1737.0,2004.0,1066.0
2,Abrantes,2021,1388.0,1603.0,1628.0,1289.0,1328.0,1620.0,2042.0,2491.0,1113.0,2458.0,2686.0,2788.0,2713.0,2329.0,1988.0,1670.0,2022.0,1016.0
3,Abrantes,2022,1308.0,1606.0,1618.0,1348.0,1259.0,1623.0,1954.0,2445.0,1102.0,2490.0,2621.0,2768.0,2670.0,2416.0,2018.0,1593.0,2004.0,1015.0
4,Abrantes,2023,1250.0,1608.0,1659.0,1408.0,1362.0,1522.0,1876.0,2454.0,1133.0,2438.0,2600.0,2760.0,2706.0,2425.0,2087.0,1553.0,1967.0,1003.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1535,Óbidos,2019,512.0,590.0,560.0,458.0,550.0,654.0,814.0,892.0,499.0,846.0,922.0,870.0,866.0,765.0,575.0,461.0,429.0,449.0
1536,Óbidos,2020,539.0,580.0,586.0,515.0,541.0,694.0,834.0,893.0,509.0,915.0,879.0,889.0,891.0,807.0,612.0,491.0,415.0,447.0
1537,Óbidos,2021,551.0,576.0,656.0,578.0,555.0,719.0,842.0,919.0,508.0,917.0,896.0,928.0,890.0,845.0,635.0,504.0,417.0,474.0
1538,Óbidos,2022,589.0,575.0,666.0,665.0,606.0,722.0,851.0,937.0,503.0,968.0,904.0,944.0,890.0,868.0,674.0,496.0,447.0,522.0


All of the age ranges can be grouped into buckets based on life stages (mentioned and explained in `02_eda_add.ipynb`).

In [8]:
# Life-stage buckets: new column name -> source age-range columns summed into it.
age_bucket_mapping = {
    "< 5": ["Menos de 5"],
    "5 - 19": ["5 - 9", "10 - 14", "15 - 19"],
    "20 - 34": ["20 - 24", "25 - 29", "30 - 34"],
    "35 - 54": ["35 - 39", "40 - 44", "45 - 49", "50 - 54"],
    "55 - 64": ["55 - 59", "60 - 64"],
    "> 65": ["65 - 69", "70 - 74", "75 - 79", "80 - 84", "85 ou mais"]
}

# Every source column that is folded into a bucket. Deriving the list from the
# mapping (rather than from a name heuristic) guarantees that exactly the
# columns that were summed are the ones removed afterwards.
cols_to_drop = [col for old_buckets in age_bucket_mapping.values() for col in old_buckets]

# Fail loudly instead of silently summing a bucket from only part of its
# ranges. This also catches re-running the cell after the source columns
# have already been replaced by the buckets.
missing_cols = [col for col in cols_to_drop if col not in age_df.columns]
if missing_cols:
    raise KeyError(
        f"Age columns missing from age_df: {missing_cols}. "
        "Reload the age data before running this cell again."
    )

# Row-wise totals per bucket, computed from the untouched source columns.
bucket_totals = {
    new_bucket: age_df[old_buckets].sum(axis=1)
    for new_bucket, old_buckets in age_bucket_mapping.items()
}

# Replace the fine-grained ranges with the buckets; the bucket columns are
# appended after the remaining columns in mapping order.
age_df = age_df.drop(columns=cols_to_drop).assign(**bucket_totals)
age_df

Unnamed: 0,municipality,year,< 5,5 - 19,20 - 34,35 - 54,55 - 64,> 65
0,Abrantes,2019,1078.0,4361.0,4136.0,9161.0,5522.0,10673.0
1,Abrantes,2020,1066.0,4211.0,4220.0,8859.0,5516.0,10686.0
2,Abrantes,2021,1016.0,4104.0,4245.0,8611.0,5474.0,10722.0
3,Abrantes,2022,1015.0,4016.0,4225.0,8512.0,5389.0,10701.0
4,Abrantes,2023,1003.0,3991.0,4429.0,8290.0,5360.0,10738.0
...,...,...,...,...,...,...,...,...
1535,Óbidos,2019,449.0,1601.0,1568.0,3206.0,1792.0,3096.0
1536,Óbidos,2020,447.0,1628.0,1642.0,3336.0,1768.0,3216.0
1537,Óbidos,2021,474.0,1635.0,1789.0,3397.0,1824.0,3291.0
1538,Óbidos,2022,522.0,1667.0,1937.0,3478.0,1848.0,3375.0


In [9]:
# Persist the bucketed age table without writing the row index.
age_df.to_csv(path_or_buf=AGE_BUCKET_FILE, index=False)