In [37]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error


def normalize_name(s: str) -> str:
    """Uppercase, strip accents, trim spaces."""
    if pd.isna(s):
        return s
    s = str(s).strip().upper()
    s = "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )
    return s


def parse_quarter_label(label: str):
    """
    Convert things like '1ºT2025', '4ºT 2024' → (year, quarter).
    """
    lab = str(label).replace(" ", "")
    m = re.match(r"(\d)ºT(\d{4})", lab)
    if not m:
        return None, None
    q = int(m.group(1))
    year = int(m.group(2))
    return year, q


In [15]:
house = pd.read_csv("data/processed/house_pricing_clean.csv", encoding="latin-1", sep=";")

income = pd.read_csv("data/processed/total_average_income_by_municipality.csv", encoding="utf-8")

osm = pd.read_csv("data/processed/osm_services_counts.csv", encoding="utf-8")

weather = pd.read_csv("data/processed/weather_data_2019_2025_batched.csv", encoding="utf-8")

In [16]:
house["code_len"] = house["COD (NUTS2024)"].astype(str).str.len()
house_mun = house[house["code_len"] == 7].copy()

house_mun["muni_norm"] = house_mun["Designação"].apply(normalize_name)

quarter_cols = [c for c in house_mun.columns if re.match(r"\d", c)]

house_long = house_mun.melt(
    id_vars=["Designação", "COD (NUTS2024)", "muni_norm"],
    value_vars=quarter_cols,
    var_name="quarter_label",
    value_name="price_sqm"
)

house_long["year"], house_long["quarter"] = zip(
    *house_long["quarter_label"].map(parse_quarter_label)
)

house_long = house_long[
    (house_long["year"] >= 2019) & (house_long["year"] <= 2023)
].copy()

house_long.head()


Unnamed: 0,Designação,COD (NUTS2024),muni_norm,quarter_label,price_sqm,year,quarter
1525,Arcos de Valdevez,1111601,ARCOS DE VALDEVEZ,4ºT 2023,813.0,2023,4
1526,Caminha,1111602,CAMINHA,4ºT 2023,1318.0,2023,4
1527,Melgaï¿½o,1111603,MELGAI¿½O,4ºT 2023,404.0,2023,4
1528,Monï¿½ï¿½o,1111604,MONI¿½I¿½O,4ºT 2023,910.0,2023,4
1529,Paredes de Coura,1111605,PAREDES DE COURA,4ºT 2023,723.0,2023,4


In [17]:
income["muni_norm"] = income["Region"].apply(normalize_name)

income_total = income[
    (income["Scope"] == "Município") &
    (income["Education_Level"] == "Total")
].copy()

income_total = income_total[["Year", "muni_norm", "Value"]]
income_total.rename(columns={"Value": "avg_income_eur"}, inplace=True)

df = house_long.merge(
    income_total,
    left_on=["muni_norm", "year"],
    right_on=["muni_norm", "Year"],
    how="left"
)

df.drop(columns=["Year"], inplace=True)
df.head()


Unnamed: 0,Designação,COD (NUTS2024),muni_norm,quarter_label,price_sqm,year,quarter,avg_income_eur
0,Arcos de Valdevez,1111601,ARCOS DE VALDEVEZ,4ºT 2023,813.0,2023,4,1140.4
1,Caminha,1111602,CAMINHA,4ºT 2023,1318.0,2023,4,1069.7
2,Melgaï¿½o,1111603,MELGAI¿½O,4ºT 2023,404.0,2023,4,
3,Monï¿½ï¿½o,1111604,MONI¿½I¿½O,4ºT 2023,910.0,2023,4,
4,Paredes de Coura,1111605,PAREDES DE COURA,4ºT 2023,723.0,2023,4,1137.3


In [18]:
osm["muni_norm"] = osm["area"].apply(normalize_name)

df = df.merge(
    osm.drop(columns=["area"]),
    on="muni_norm",
    how="left"
)

df.head()


Unnamed: 0,Designação,COD (NUTS2024),muni_norm,quarter_label,price_sqm,year,quarter,avg_income_eur,cinema,college,...,library,mall,museum,pharmacy,police,post_office,school,station,theatre,university
0,Arcos de Valdevez,1111601,ARCOS DE VALDEVEZ,4ºT 2023,813.0,2023,4,1140.4,1.0,0.0,...,4.0,2.0,5.0,10.0,2.0,4.0,15.0,1.0,1.0,0.0
1,Caminha,1111602,CAMINHA,4ºT 2023,1318.0,2023,4,1069.7,,,...,,,,,,,,,,
2,Melgaï¿½o,1111603,MELGAI¿½O,4ºT 2023,404.0,2023,4,,,,...,,,,,,,,,,
3,Monï¿½ï¿½o,1111604,MONI¿½I¿½O,4ºT 2023,910.0,2023,4,,,,...,,,,,,,,,,
4,Paredes de Coura,1111605,PAREDES DE COURA,4ºT 2023,723.0,2023,4,1137.3,1.0,0.0,...,1.0,0.0,1.0,4.0,1.0,1.0,7.0,1.0,0.0,0.0


In [19]:
weather["date"] = pd.to_datetime(weather["date"])
weather["region_norm"] = weather["region"].apply(normalize_name)
weather["year"] = weather["date"].dt.year
weather["quarter"] = weather["date"].dt.quarter

weather_agg = (
    weather
    .groupby(["region_norm", "year", "quarter"])
    .agg({
        "temp_max_c": "mean",
        "temp_min_c": "mean",
        "sunshine_duration_s": "sum",
        "windspeed_mean_kmh": "mean",
        "precipitation_sum_mm": "sum",
    })
    .reset_index()
    .rename(columns={
        "temp_max_c": "temp_max_c_mean",
        "temp_min_c": "temp_min_c_mean",
        "sunshine_duration_s": "sunshine_s_sum",
        "windspeed_mean_kmh": "wind_kmh_mean",
        "precipitation_sum_mm": "precip_mm_sum",
    })
)

weather_agg.head()


Unnamed: 0,region_norm,year,quarter,temp_max_c_mean,temp_min_c_mean,sunshine_s_sum,wind_kmh_mean,precip_mm_sum
0,ABRANTES,2019,1,16.955556,6.701111,2696041.86,12.391111,118.5
1,ABRANTES,2019,2,23.658242,12.114286,3522534.82,14.416484,156.3
2,ABRANTES,2019,3,30.602174,16.773913,3744447.13,14.261957,35.8
3,ABRANTES,2019,4,18.417391,10.655435,2153462.98,13.745652,367.7
4,ABRANTES,2020,1,16.898901,8.16044,2415578.24,12.257143,176.0


In [20]:
df = df.merge(
    weather_agg,
    left_on=["muni_norm", "year", "quarter"],
    right_on=["region_norm", "year", "quarter"],
    how="left"
)

df.drop(columns=["region_norm"], inplace=True)
df.head()

Unnamed: 0,Designação,COD (NUTS2024),muni_norm,quarter_label,price_sqm,year,quarter,avg_income_eur,cinema,college,...,post_office,school,station,theatre,university,temp_max_c_mean,temp_min_c_mean,sunshine_s_sum,wind_kmh_mean,precip_mm_sum
0,Arcos de Valdevez,1111601,ARCOS DE VALDEVEZ,4ºT 2023,813.0,2023,4,1140.4,1.0,0.0,...,4.0,15.0,1.0,1.0,0.0,17.747826,11.431522,1755076.37,9.177174,1405.1
1,Caminha,1111602,CAMINHA,4ºT 2023,1318.0,2023,4,1069.7,,,...,,,,,,17.836957,12.041304,1827696.25,12.670652,1219.6
2,Melgaï¿½o,1111603,MELGAI¿½O,4ºT 2023,404.0,2023,4,,,,...,,,,,,,,,,
3,Monï¿½ï¿½o,1111604,MONI¿½I¿½O,4ºT 2023,910.0,2023,4,,,,...,,,,,,,,,,
4,Paredes de Coura,1111605,PAREDES DE COURA,4ºT 2023,723.0,2023,4,1137.3,1.0,0.0,...,1.0,7.0,1.0,0.0,0.0,15.554348,9.643478,1699687.58,10.283696,1237.1


In [21]:
df = df[df["price_sqm"].notna() & (df["price_sqm"] > 0)].copy()
df["log_price_sqm"] = np.log(df["price_sqm"])

service_cols = [
    "cinema", "college", "courthouse", "fire_station", "hospital",
    "kindergarten", "library", "mall", "museum", "pharmacy", "police",
    "post_office", "school", "station", "theatre", "university",
]

weather_cols = [
    "temp_max_c_mean", "temp_min_c_mean",
    "sunshine_s_sum", "wind_kmh_mean", "precip_mm_sum",
]

numeric_features = ["avg_income_eur", "year"] + service_cols + weather_cols
categorical_features = ["quarter"]

model_df = df[["log_price_sqm"] + numeric_features + categorical_features].dropna()

X = model_df[numeric_features + categorical_features]
y = model_df["log_price_sqm"]

print("Observations in regression dataset:", len(model_df))
model_df.head()


Observations in regression dataset: 2550


Unnamed: 0,log_price_sqm,avg_income_eur,year,cinema,college,courthouse,fire_station,hospital,kindergarten,library,...,school,station,theatre,university,temp_max_c_mean,temp_min_c_mean,sunshine_s_sum,wind_kmh_mean,precip_mm_sum,quarter
0,6.700731,1140.4,2023,1.0,0.0,2.0,3.0,4.0,11.0,4.0,...,15.0,1.0,1.0,0.0,17.747826,11.431522,1755076.37,9.177174,1405.1,4
4,6.583409,1137.3,2023,1.0,0.0,1.0,1.0,0.0,3.0,1.0,...,7.0,1.0,0.0,0.0,15.554348,9.643478,1699687.58,10.283696,1237.1,4
5,6.632002,1053.7,2023,1.0,0.0,2.0,3.0,4.0,11.0,3.0,...,16.0,1.0,1.0,0.0,17.81087,10.809783,1796511.15,8.804348,1300.5,4
6,7.028201,1148.3,2023,0.0,0.0,1.0,2.0,3.0,15.0,2.0,...,39.0,2.0,1.0,2.0,17.918478,11.628261,1848039.31,11.211957,1132.9,4
8,7.161622,1341.1,2023,2.0,0.0,2.0,5.0,3.0,20.0,2.0,...,66.0,16.0,1.0,4.0,17.380435,12.923913,1884254.38,16.102174,948.8,4


In [26]:
service_cols = [
    "cinema", "college", "courthouse", "fire_station", "hospital",
    "kindergarten", "library", "mall", "museum", "pharmacy", "police",
    "post_office", "school", "station", "theatre", "university",
]

weather_cols = [
    "temp_max_c_mean", "temp_min_c_mean",
    "sunshine_s_sum", "wind_kmh_mean", "precip_mm_sum",
]

df_master = df.copy()
df_master[service_cols] = df_master[service_cols].fillna(0)

df_master = df_master.dropna(subset=["avg_income_eur"] + weather_cols)

print("Rows before:", len(df))
print("Rows after cleaning:", len(df_master))
print("Municipalities after cleaning:", df_master["muni_norm"].nunique())

df_master.head()


Rows before: 5185
Rows after cleaning: 3315
Municipalities after cleaning: 194


Unnamed: 0,Designação,COD (NUTS2024),muni_norm,quarter_label,price_sqm,year,quarter,avg_income_eur,cinema,college,...,school,station,theatre,university,temp_max_c_mean,temp_min_c_mean,sunshine_s_sum,wind_kmh_mean,precip_mm_sum,log_price_sqm
0,Arcos de Valdevez,1111601,ARCOS DE VALDEVEZ,4ºT 2023,813.0,2023,4,1140.4,1.0,0.0,...,15.0,1.0,1.0,0.0,17.747826,11.431522,1755076.37,9.177174,1405.1,6.700731
1,Caminha,1111602,CAMINHA,4ºT 2023,1318.0,2023,4,1069.7,0.0,0.0,...,0.0,0.0,0.0,0.0,17.836957,12.041304,1827696.25,12.670652,1219.6,7.183871
4,Paredes de Coura,1111605,PAREDES DE COURA,4ºT 2023,723.0,2023,4,1137.3,1.0,0.0,...,7.0,1.0,0.0,0.0,15.554348,9.643478,1699687.58,10.283696,1237.1,6.583409
5,Ponte da Barca,1111606,PONTE DA BARCA,4ºT 2023,759.0,2023,4,1053.7,1.0,0.0,...,16.0,1.0,1.0,0.0,17.81087,10.809783,1796511.15,8.804348,1300.5,6.632002
6,Ponte de Lima,1111607,PONTE DE LIMA,4ºT 2023,1128.0,2023,4,1148.3,0.0,0.0,...,39.0,2.0,1.0,2.0,17.918478,11.628261,1848039.31,11.211957,1132.9,7.028201


In [27]:
# Make sure the folder exists
os.makedirs("data/processed", exist_ok=True)

output_path = "data/processed/master_dataframe.csv"
df_master.to_csv(output_path, index=False)

print("Saved master dataframe to:", output_path)

Saved master dataframe to: data/processed/master_dataframe.csv


In [28]:
df_master = pd.read_csv("data/processed/master_dataframe.csv")


In [None]:
numeric_features = ["avg_income_eur", "year"] + service_cols + weather_cols
categorical_features = ["quarter"]

model_df = df_master[["log_price_sqm"] + numeric_features + categorical_features].dropna()

X = model_df[numeric_features + categorical_features]
y = model_df["log_price_sqm"]

print("Final regression sample size:", len(model_df))
model_df.head()

Final regression sample size: 3315


Unnamed: 0,log_price_sqm,avg_income_eur,year,cinema,college,courthouse,fire_station,hospital,kindergarten,library,...,school,station,theatre,university,temp_max_c_mean,temp_min_c_mean,sunshine_s_sum,wind_kmh_mean,precip_mm_sum,quarter
0,6.700731,1140.4,2023,1.0,0.0,2.0,3.0,4.0,11.0,4.0,...,15.0,1.0,1.0,0.0,17.747826,11.431522,1755076.37,9.177174,1405.1,4
1,7.183871,1069.7,2023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17.836957,12.041304,1827696.25,12.670652,1219.6,4
2,6.583409,1137.3,2023,1.0,0.0,1.0,1.0,0.0,3.0,1.0,...,7.0,1.0,0.0,0.0,15.554348,9.643478,1699687.58,10.283696,1237.1,4
3,6.632002,1053.7,2023,1.0,0.0,2.0,3.0,4.0,11.0,3.0,...,16.0,1.0,1.0,0.0,17.81087,10.809783,1796511.15,8.804348,1300.5,4
4,7.028201,1148.3,2023,0.0,0.0,1.0,2.0,3.0,15.0,2.0,...,39.0,2.0,1.0,2.0,17.918478,11.628261,1848039.31,11.211957,1132.9,4


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [38]:
sgd = SGDRegressor(
    loss="squared_error",
    penalty="l2",
    learning_rate="constant",
    max_iter=5000,
    tol=1e-3,
    random_state=42
)

In [41]:
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("regressor", sgd)
])



In [42]:
param_grid = {
    "regressor__eta0": [1e-4, 5e-4, 1e-3, 5e-3, 1e-2],
    "regressor__alpha": [1e-6, 1e-5, 1e-4, 1e-3],
}

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=5,              # 5-fold CV
    scoring="r2",
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best CV score (R²):", grid.best_score_)
print("Best params:", grid.best_params_)

best_model = grid.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best CV score (R²): 0.5695946997049441
Best params: {'regressor__alpha': 0.001, 'regressor__eta0': 0.005}


In [44]:
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Test R²: {r2:.3f}")
print(f"Test RMSE on log(price_sqm): {rmse:.3f}")

Test R²: 0.525
Test RMSE on log(price_sqm): 0.159


In [45]:
feature_names = best_model.named_steps["preprocess"].get_feature_names_out()
coefs = best_model.named_steps["regressor"].coef_

coef_df = (
    pd.DataFrame({"feature": feature_names, "coef": coefs})
      .sort_values("coef", ascending=False)
)

In [46]:
print("\nTop positive coefficients:")
print(coef_df.head(15))

print("\nTop negative coefficients:")
print(coef_df.tail(15))


Top positive coefficients:
                 feature      coef
19  num__temp_min_c_mean  0.538554
6          num__hospital  0.170539
13      num__post_office  0.152615
0    num__avg_income_eur  0.148639
20   num__sunshine_s_sum  0.143375
7      num__kindergarten  0.136941
14           num__school  0.127701
12           num__police  0.084661
15          num__station  0.070733
1              num__year  0.046397
9              num__mall  0.038741
2            num__cinema  0.032919
11         num__pharmacy  0.009812
3           num__college -0.001324
8           num__library -0.009741

Top negative coefficients:
                 feature      coef
2            num__cinema  0.032919
11         num__pharmacy  0.009812
3           num__college -0.001324
8           num__library -0.009741
22    num__precip_mm_sum -0.021071
10           num__museum -0.046467
17       num__university -0.083045
21    num__wind_kmh_mean -0.099853
4        num__courthouse -0.114866
16          num__theatre -0.130684