In [11]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from src.config import AGE_CLEAN_FILE, MASTER_DF_FILE

In [12]:
# Collapse the granular age bands of the cleaned age-distribution table into
# six broader buckets and write the result back to AGE_CLEAN_FILE.
#
# NOTE: this cell overwrites the file it reads. Without the guard below, a
# second run would find none of the granular columns, build an empty frame and
# save it over the data.
age_dist_df = pd.read_csv(AGE_CLEAN_FILE, index_col=[0, 1])  # municipality, year as index
print("Original columns:")
print(age_dist_df.columns.tolist())
print(f"Shape: {age_dist_df.shape}\n")

# New bucket label -> original columns that are summed into it
age_bucket_mapping = {
    "< 5": ["Menos de 5"],
    "5 - 19": ["5 - 9", "10 - 14", "15 - 19"],
    "20 - 34": ["20 - 24", "25 - 29", "30 - 34"],
    "35 - 54": ["35 - 39", "40 - 44", "45 - 49", "50 - 54"],
    "55 - 64": ["55 - 59", "60 - 64"],
    "65+": ["65 - 69", "70 - 74", "75 - 79", "80 - 84", "85 ou mais"]
}

# Check the input before touching anything on disk
source_cols = [col for old_buckets in age_bucket_mapping.values() for col in old_buckets]
present_cols = [col for col in source_cols if col in age_dist_df.columns]
missing_cols = [col for col in source_cols if col not in age_dist_df.columns]

if not present_cols:
    # Re-run case: the file no longer holds any granular column, so reuse it
    # as-is instead of overwriting it with an empty table.
    print("⚠ No granular age columns found - file looks already aggregated; leaving it untouched")
    new_age_dist_df = age_dist_df.reset_index()
elif missing_cols:
    # A partial match would silently produce buckets summed from incomplete data
    raise ValueError(f"Cannot aggregate age buckets, missing source columns: {missing_cols}")
else:
    # Create new dataframe with aggregated age buckets
    new_age_dist_df = pd.DataFrame(index=age_dist_df.index)

    for new_bucket, old_buckets in age_bucket_mapping.items():
        # Keep the file's column order so the log shows what was actually summed
        matching_cols = [col for col in age_dist_df.columns if col in old_buckets]
        new_age_dist_df[new_bucket] = age_dist_df[matching_cols].sum(axis=1)
        print(f"✓ Created '{new_bucket}' from {matching_cols}")

    print(f"\nNew columns: {new_age_dist_df.columns.tolist()}")
    print(f"New shape: {new_age_dist_df.shape}\n")

    # Reset index to have municipality and year as columns
    new_age_dist_df = new_age_dist_df.reset_index()

    # Save transformed age distribution
    new_age_dist_df.to_csv(AGE_CLEAN_FILE, index=False)
    print(f"✓ Saved transformed age distribution to {AGE_CLEAN_FILE}")

print("\nFirst few rows:")
print(new_age_dist_df.head(10))

Original columns:
['10 - 14', '15 - 19', '20 - 24', '25 - 29', '30 - 34', '35 - 39', '40 - 44', '45 - 49', '5 - 9', '50 - 54', '55 - 59', '60 - 64', '65 - 69', '70 - 74', '75 - 79', '80 - 84', '85 ou mais', 'Menos de 5']
Shape: (1540, 18)

✓ Created '< 5' from ['Menos de 5']
✓ Created '5 - 19' from ['10 - 14', '15 - 19', '5 - 9']
✓ Created '20 - 34' from ['20 - 24', '25 - 29', '30 - 34']
✓ Created '35 - 54' from ['35 - 39', '40 - 44', '45 - 49', '50 - 54']
✓ Created '55 - 64' from ['55 - 59', '60 - 64']
✓ Created '65+' from ['65 - 69', '70 - 74', '75 - 79', '80 - 84', '85 ou mais']

New columns: ['< 5', '5 - 19', '20 - 34', '35 - 54', '55 - 64', '65+']
New shape: (1540, 6)

✓ Saved transformed age distribution to C:\Users\gonca\Documents\housing-prices\FCD-housing-prices\data\interm\age_distribution_by_municipality.csv

First few rows:
      municipality  year     < 5  5 - 19  20 - 34  35 - 54  55 - 64      65+
0         Abrantes  2019  1078.0  4361.0   4136.0   9161.0   5522.0  10673.

In [14]:
# Aggregate the granular age columns of the master DataFrame into six broader
# buckets, drop the granular columns and write the result back to
# MASTER_DF_FILE.
#
# NOTE: this cell overwrites the file it reads. The guard below keeps a re-run
# from dropping the freshly created buckets: names such as '5 - 19' also
# contain a digit and a dash, which is what the drop filter keyed on before.
master_df = pd.read_csv(MASTER_DF_FILE)

# New bucket label -> original columns that are summed into it
age_bucket_mapping = {
    "< 5": ["Menos de 5"],
    "5 - 19": ["5 - 9", "10 - 14", "15 - 19"],
    "20 - 34": ["20 - 24", "25 - 29", "30 - 34"],
    "35 - 54": ["35 - 39", "40 - 44", "45 - 49", "50 - 54"],
    "55 - 64": ["55 - 59", "60 - 64"],
    "65+": ["65 - 69", "70 - 74", "75 - 79", "80 - 84", "85 ou mais"]
}
source_cols = [col for old_buckets in age_bucket_mapping.values() for col in old_buckets]

# Only granular source columns written with a dash are listed and dropped here.
# 'Menos de 5' and '85 ou mais' contain no dash, so they stay in the frame and
# are removed by the following cell.
print("Original age columns:")
age_cols = [col for col in master_df.columns if col in source_cols and '-' in col]
print(age_cols)

present_cols = [col for col in source_cols if col in master_df.columns]
missing_cols = [col for col in source_cols if col not in master_df.columns]

if not present_cols:
    # Re-run case: nothing left to aggregate, so leave the frame and file alone
    print("⚠ No granular age columns found - master DataFrame looks already aggregated; leaving it untouched")
elif missing_cols:
    # A partial match would silently produce buckets summed from incomplete data
    raise ValueError(f"Cannot aggregate age buckets, missing source columns: {missing_cols}")
else:
    # Create aggregated columns
    for new_bucket, old_buckets in age_bucket_mapping.items():
        master_df[new_bucket] = master_df[old_buckets].sum(axis=1)
        print(f"✓ Created '{new_bucket}'")

    # Drop old granular age columns
    master_df = master_df.drop(columns=age_cols)

    print("\nNew age columns:")
    print([col for col in master_df.columns if col in age_bucket_mapping.keys()])

    master_df.to_csv(MASTER_DF_FILE, index=False)
    print("\n✓ Saved transformed master DataFrame")

Original age columns:
['10 - 14', '15 - 19', '20 - 24', '25 - 29', '30 - 34', '35 - 39', '40 - 44', '45 - 49', '5 - 9', '50 - 54', '55 - 59', '60 - 64', '65 - 69', '70 - 74', '75 - 79', '80 - 84']
✓ Created '< 5'
✓ Created '5 - 19'
✓ Created '20 - 34'
✓ Created '35 - 54'
✓ Created '55 - 64'
✓ Created '65+'

New age columns:
['< 5', '5 - 19', '20 - 34', '35 - 54', '55 - 64', '65+']

✓ Saved transformed master DataFrame


In [15]:
# Remove the two open-ended granular age bands from the master DataFrame and
# write it back to MASTER_DF_FILE.
master_df = pd.read_csv(MASTER_DF_FILE)

# errors="ignore" keeps the cell re-runnable: after the first run the columns
# are gone from the file and a plain drop would raise KeyError.
master_df = master_df.drop(columns=['85 ou mais', 'Menos de 5'], errors="ignore")

master_df.to_csv(MASTER_DF_FILE, index=False)



In [17]:

# Drop the 'Unnamed: 0' column if it is present. errors="ignore" keeps the cell
# re-runnable once the file has been saved without that column.
master_df = master_df.drop(columns=['Unnamed: 0'], errors="ignore")
master_df.head()

Unnamed: 0,municipality,house_price,total_sunshine_h,mean_sunshine_h,windspeed_mean_kmh,total_precipitation_mm,mean_precipitation_mm,windy_days,rainy_days,sunny_days,...,station,theatre,university,log_price_sqm,< 5,5 - 19,20 - 34,35 - 54,55 - 64,65+
0,Arcos de Valdevez,813.0,487.521214,5.299144,9.177174,1405.1,15.272826,0,57,34,...,1.0,1.0,0.0,6.700731,583.0,2202.0,2804.0,4844.0,2838.0,7588.0
1,Paredes de Coura,723.0,472.135439,5.131907,10.283696,1237.1,13.446739,0,57,34,...,1.0,0.0,0.0,6.583409,261.0,1048.0,1172.0,2290.0,1235.0,2703.0
2,Ponte da Barca,759.0,499.030875,5.424249,8.804348,1300.5,14.13587,0,57,35,...,1.0,1.0,0.0,6.632002,344.0,1326.0,1639.0,2848.0,1634.0,3387.0
3,Ponte de Lima,1128.0,513.344253,5.579829,11.211957,1132.9,12.31413,2,58,38,...,2.0,1.0,2.0,7.028201,1445.0,5342.0,6654.0,11330.0,6017.0,10342.0
4,Valença,945.0,549.046297,5.967895,13.494565,386.7,4.203261,1,54,27,...,5.0,1.0,1.0,6.851185,505.0,1857.0,2246.0,3859.0,1918.0,3774.0


In [19]:
# Write the current in-memory master_df back to MASTER_DF_FILE, omitting the row index.
master_df.to_csv(MASTER_DF_FILE, index=False)

In [None]:
# Relabel the oldest age bucket from '65+' to '> 65', then write the frame
# back to MASTER_DF_FILE without the row index.
master_df = master_df.rename(columns={'65+': '> 65'})
master_df.to_csv(MASTER_DF_FILE, index=False)

In [21]:
# Display the first rows of the aggregated age-distribution frame.
new_age_dist_df.head()

Unnamed: 0,municipality,year,< 5,5 - 19,20 - 34,35 - 54,55 - 64,65+
0,Abrantes,2019,1078.0,4361.0,4136.0,9161.0,5522.0,10673.0
1,Abrantes,2020,1066.0,4211.0,4220.0,8859.0,5516.0,10686.0
2,Abrantes,2021,1016.0,4104.0,4245.0,8611.0,5474.0,10722.0
3,Abrantes,2022,1015.0,4016.0,4225.0,8512.0,5389.0,10701.0
4,Abrantes,2023,1003.0,3991.0,4429.0,8290.0,5360.0,10738.0


In [None]:
# Relabel the oldest age bucket from '65+' to '> 65', then write the
# age-distribution frame back to AGE_CLEAN_FILE without the row index.
new_age_dist_df = new_age_dist_df.rename({'65+': '> 65'}, axis="columns")
new_age_dist_df.to_csv(AGE_CLEAN_FILE, index=False)