# Feature engineering based on the weather data

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import warnings
from src.config import (WEATHER_FILE, WEATHER_QUARTER_FILE)


# Blanket filter: with no category or module given, every warning raised through the
# `warnings` module is hidden for the rest of the kernel session, including pandas
# warnings emitted by later cells.
# NOTE(review): consider narrowing this to the specific categories that are known noise.
warnings.filterwarnings("ignore")
# sns.set_style("whitegrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Read the daily weather observations; the first CSV column is used as the row index.
weather_daily_df = pd.read_csv(filepath_or_buffer=WEATHER_FILE, index_col=0)

# Preview the first five rows.
weather_daily_df.head(n=5)

Unnamed: 0,region,date,temp_max_c,temp_min_c,sunshine_duration_s,windspeed_mean_kmh,precipitation_sum_mm
0,Arcos de Valdevez,2019-01-01,16.4,6.9,29242.46,4.5,0.0
1,Arcos de Valdevez,2019-01-02,14.6,4.5,29295.18,5.0,0.0
2,Arcos de Valdevez,2019-01-03,13.9,8.5,29352.68,2.8,0.0
3,Arcos de Valdevez,2019-01-04,14.5,7.2,29414.8,3.0,0.0
4,Arcos de Valdevez,2019-01-05,14.8,6.7,29481.41,4.4,0.0


The data is daily, while all of the other data we work with is either yearly or quarterly. The first step is to parse the dates, keep only the rows dated 2019-10-01 or later, add the quarter column, and unify the municipality column name.

In [None]:
# Earliest date kept in the analysis (first day of Q4 2019); earlier rows are dropped.
WEATHER_START_DATE = "2019-10-01"

# Parse the dates first so the comparison and the .dt accessor operate on real timestamps.
weather_daily_df["date"] = pd.to_datetime(weather_daily_df["date"])

# .copy() detaches the filtered rows from the frame they were sliced from, so the column
# assignment below is not a chained assignment on a slice (SettingWithCopyWarning).
weather_daily_df = weather_daily_df[weather_daily_df["date"] >= WEATHER_START_DATE].copy()

# Quarterly periods render as "2019Q4"; the regex reorders them into the "4Q 2019" label.
weather_daily_df["quarter_year"] = (
    weather_daily_df["date"]
    .dt.to_period("Q")
    .astype(str)
    .str.replace(r"(\d{4})Q(\d)", r"\2Q \1", regex=True)
)

# Use "municipality" as the name of the location column.
weather_daily_df = weather_daily_df.rename(columns={"region": "municipality"})

Convert the sunshine duration from seconds to hours so that it is easier to read and interpret.

In [4]:
SECONDS_PER_HOUR = 3600

# Guard on the source column so that re-running this cell is a no-op instead of a
# KeyError: after the first run "sunshine_duration_s" has already been replaced.
if "sunshine_duration_s" in weather_daily_df.columns:
    weather_daily_df = (
        weather_daily_df
        # Add the duration in hours as a new trailing column...
        .assign(sunshine_duration_h=weather_daily_df["sunshine_duration_s"] / SECONDS_PER_HOUR)
        # ...and remove the original seconds column.
        .drop(columns=["sunshine_duration_s"])
    )

Add the helper columns that will be used to calculate the desired comfort metrics (the number of rainy, sunny, etc. days per municipality per quarter).

In [5]:
# Daily comfort flags plus the mean daily temperature, all derived from existing columns.
precipitation_mm = weather_daily_df["precipitation_sum_mm"]
sunshine_h = weather_daily_df["sunshine_duration_h"]
wind_kmh = weather_daily_df["windspeed_mean_kmh"]
temp_low_c = weather_daily_df["temp_min_c"]
temp_high_c = weather_daily_df["temp_max_c"]

weather_daily_df = weather_daily_df.assign(
    # any precipitation at all
    is_rainy=precipitation_mm.gt(0),
    # at least 8 hours of sunshine
    is_sunny=sunshine_h.ge(8),
    # mean wind speed above 30 km/h
    is_windy=wind_kmh.gt(30.0),
    # exactly zero precipitation
    no_rain=precipitation_mm.eq(0),
    # the whole day stays within 17..28 degrees C (min not below 17, max not above 28)
    is_temp_ok=temp_low_c.ge(17) & temp_high_c.le(28),
    # midpoint of the daily minimum and maximum
    temp_mean_c=temp_high_c.add(temp_low_c).div(2),
)

Group the daily rows by municipality and quarter, aggregate them into quarterly features, and then save the result to a file.

In [None]:
# Named aggregations for the quarterly table: output column -> (daily column, reducer).
quarterly_aggregations = {
    # quarter-level totals and per-day averages
    "total_sunshine_h": ("sunshine_duration_h", "sum"),
    "mean_sunshine_h": ("sunshine_duration_h", "mean"),
    "windspeed_mean_kmh": ("windspeed_mean_kmh", "mean"),
    "total_precipitation_mm": ("precipitation_sum_mm", "sum"),
    "mean_precipitation_mm": ("precipitation_sum_mm", "mean"),
    # day-count features: each source column is summed over the quarter
    "windy_days": ("is_windy", "sum"),
    "rainy_days": ("is_rainy", "sum"),
    "sunny_days": ("is_sunny", "sum"),
    "warm_days": ("is_temp_ok", "sum"),
}

# One row per (municipality, quarter_year) pair; reset_index turns the group keys
# back into regular columns.
daily_by_quarter = weather_daily_df.groupby(["municipality", "quarter_year"])
weather_quarter_df = daily_by_quarter.agg(**quarterly_aggregations).reset_index()

# Display the rows of a single municipality.
weather_quarter_df.loc[weather_quarter_df["municipality"] == "Aveiro"]

Unnamed: 0,municipality,quarter_year,total_sunshine_h,mean_sunshine_h,windspeed_mean_kmh,total_precipitation_mm,mean_precipitation_mm,windy_days,rainy_days,sunny_days,warm_days
814,Aveiro,1Q 2020,694.762311,7.634751,12.8,234.2,2.573626,0,46,60,0
815,Aveiro,1Q 2021,678.421267,7.538014,13.444444,342.1,3.801111,2,45,56,0
816,Aveiro,1Q 2022,720.760794,8.008453,11.788889,170.5,1.894444,0,28,67,0
817,Aveiro,1Q 2023,725.64195,8.062688,12.898889,273.0,3.033333,2,42,66,0
818,Aveiro,1Q 2024,649.753275,7.140146,14.617582,591.3,6.497802,4,54,48,0
819,Aveiro,1Q 2025,628.320958,6.981344,13.076667,674.8,7.497778,2,59,46,0
820,Aveiro,2Q 2020,1032.213533,11.343006,13.40989,258.4,2.83956,0,49,79,5
821,Aveiro,2Q 2021,1058.906956,11.63634,13.431868,204.6,2.248352,0,45,80,2
822,Aveiro,2Q 2022,1056.350025,11.608242,14.840659,204.8,2.250549,2,47,81,7
823,Aveiro,2Q 2023,1059.420886,11.641988,13.843956,193.0,2.120879,3,39,85,12


In [7]:
# Write the quarterly features to disk; the row index is not written.
weather_quarter_df.to_csv(path_or_buf=WEATHER_QUARTER_FILE, index=False)