In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import warnings

from src.config import (
    AGE_BUCKET_FILE,
    DENSITY_CLEAN_FILE,
    HOUSE_CLEAN_FILE,
    INCOME_CLEAN_TOTAL,
    MASTER_DF_FILE,
    SERVICES_COLUMNS,
    SERVICES_FILE,
    WEATHER_QUARTER_FILE,
)

warnings.filterwarnings("ignore")

In [2]:
# Read every input table from the paths declared in src.config.
# The files are read in the same order as the names on the left-hand side.
(
    df_house,
    df_weather,
    df_income,
    df_services,
    df_density,
    df_age,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        HOUSE_CLEAN_FILE,
        WEATHER_QUARTER_FILE,
        INCOME_CLEAN_TOTAL,
        SERVICES_FILE,
        DENSITY_CLEAN_FILE,
        AGE_BUCKET_FILE,
    )
]

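Unify the column names so that every dataset shares the `municipality` key, and strip the bracketed suffix from the income dataset's region labels.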

In [3]:
# Services table: its key column is called 'area'; rename it to the shared key name.
df_services = df_services.rename({'area': 'municipality'}, axis='columns')

# Income table:
#   1. rename the key and value columns to the names used by the merge below,
#   2. remove a bracketed suffix (plus the whitespace before it) from the key
#      and trim surrounding whitespace,
#   3. keep only the three columns needed downstream.
df_income = (
    df_income
    .rename({'region': 'municipality', 'value': 'avg_income'}, axis='columns')
    .assign(
        municipality=lambda income: (
            income['municipality']
            .str.replace(r'\s*\[.*\]', '', regex=True)
            .str.strip()
        )
    )
    .loc[:, ['year', 'municipality', 'avg_income']]
)

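Add a temporal feature: the ordinal position of each quarter, from 1 (Q4 2019) to 22 (Q1 2025). The mapping assumes that the quarter columns of `df_house` are ordered from newest to oldest.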

In [4]:
# Map each quarter label (every df_house column except the first) to a rank:
# the last column gets 1, the one before it 2, and so on up to the second column.
quarter_mapper = dict(
    zip(
        df_house.columns[:0:-1],      # all columns but the first, walked last-to-first
        range(1, df_house.shape[1]),  # ranks 1 .. number of quarter columns
    )
)

Merge all of the datasets into one.

In [5]:
# Build the master table: one row per (municipality, quarter) with the house price,
# the quarterly weather features, the yearly income / density / age features and the
# per-municipality service counts.
df_final = (
    df_house
    # Wide -> long: every column except 'Designation' becomes one 'quarter_year' row.
    .melt(id_vars='Designation', var_name='quarter_year', value_name='house_price')
    .rename(columns={'Designation': 'municipality'})
    # Default (inner) merge: only municipality/quarter pairs present in the weather
    # table are kept.
    .merge(df_weather, on=['municipality', 'quarter_year'])
    .assign(
        # 'quarter_year' is split on whitespace: the last token is the year,
        # the first token with its 'Q' removed is the quarter number.
        year=lambda x: x['quarter_year'].str.split().str[-1].astype(int),
        quarter_num=lambda x: x['quarter_year'].str.split().str[0].str.replace('Q', '', regex=False).astype(int),
        quarter_ord=lambda x: x['quarter_year'].map(quarter_mapper),
    )
    # Filter BEFORE taking the logarithm, so np.log never receives zero, negative or
    # missing prices (previously the log was computed first and the invalid rows were
    # only dropped afterwards). The retained rows and their values are unchanged.
    .query('year <= 2023 and house_price > 0')
    .assign(log_price_sqm=lambda x: np.log(x['house_price']))
    # Yearly features are attached on (municipality, year); unmatched rows get NaN.
    .merge(df_income, on=['municipality', 'year'], how='left')
    .merge(df_density, on=['municipality', 'year'], how='left')
    .merge(df_age, on=['municipality', 'year'], how='left')
    # Service counts have no time dimension: attached on municipality only.
    .merge(df_services, on='municipality', how='left')
    .drop(columns=['quarter_year'])
    # Drop rows with a missing value in any column listed in SERVICES_COLUMNS.
    .dropna(subset=SERVICES_COLUMNS)
)

df_final

Unnamed: 0,municipality,house_price,total_sunshine_h,mean_sunshine_h,windspeed_mean_kmh,total_precipitation_mm,mean_precipitation_mm,windy_days,rainy_days,sunny_days,...,library,mall,museum,pharmacy,police,post_office,school,station,theatre,university
0,Arcos de Valdevez,813.000000,487.521214,5.299144,9.177174,1405.1,15.272826,0,57,34,...,4.0,2.0,5.0,10.0,2.0,4.0,15.0,1.0,1.0,0.0
4,Paredes de Coura,723.000000,472.135439,5.131907,10.283696,1237.1,13.446739,0,57,34,...,1.0,0.0,1.0,4.0,1.0,1.0,7.0,1.0,0.0,0.0
5,Ponte da Barca,759.000000,499.030875,5.424249,8.804348,1300.5,14.135870,0,57,35,...,3.0,2.0,7.0,10.0,2.0,3.0,16.0,1.0,1.0,0.0
6,Ponte de Lima,1128.000000,513.344253,5.579829,11.211957,1132.9,12.314130,2,58,38,...,2.0,2.0,3.0,13.0,6.0,2.0,39.0,2.0,1.0,2.0
7,Valença,945.000000,549.046297,5.967895,13.494565,386.7,4.203261,1,54,27,...,3.0,1.0,3.0,7.0,6.0,4.0,23.0,5.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,Ponta do Sol,1111.000000,835.713408,9.083841,11.677174,207.4,2.254348,1,62,74,...,1.0,0.0,0.0,2.0,2.0,2.0,10.0,0.0,0.0,0.0
5179,Porto Moniz,737.688811,823.469272,8.950753,19.947826,307.7,3.344565,20,65,78,...,1.0,0.0,1.0,1.0,1.0,2.0,3.0,4.0,0.0,0.0
5182,Santa Cruz,1130.000000,859.389375,9.341189,7.531522,131.7,1.431522,0,49,84,...,4.0,13.0,4.0,12.0,6.0,6.0,24.0,4.0,3.0,0.0
5183,Santana,844.000000,1020.573564,11.093191,7.280435,176.7,1.920652,0,35,87,...,1.0,0.0,3.0,4.0,1.0,4.0,10.0,2.0,3.0,0.0


In [6]:
# Persist the master table. `index=True` is the pandas default, spelled out here:
# the row index is written as the first, header-less CSV column.
# NOTE(review): that index is the gapped one left after the dropna above; readers of
# this file need `index_col=0`, otherwise it shows up as an 'Unnamed: 0' column.
# Confirm with downstream consumers before switching to `index=False`.
df_final.to_csv(MASTER_DF_FILE, index=True)

# Cell output: the final column set.
df_final.columns

Index(['municipality', 'house_price', 'total_sunshine_h', 'mean_sunshine_h',
       'windspeed_mean_kmh', 'total_precipitation_mm', 'mean_precipitation_mm',
       'windy_days', 'rainy_days', 'sunny_days', 'warm_days', 'year',
       'quarter_num', 'quarter_ord', 'log_price_sqm', 'avg_income',
       'people/km2', '< 5', '5 - 19', '20 - 34', '35 - 54', '55 - 64', '> 65',
       'cinema', 'college', 'courthouse', 'fire_station', 'hospital',
       'kindergarten', 'library', 'mall', 'museum', 'pharmacy', 'police',
       'post_office', 'school', 'station', 'theatre', 'university'],
      dtype='object')