### Add the province column back to filtered_final_cleaned_data.csv

In [1]:
# Load dataset
import pandas as pd

"""We need to add a "province" column to our filtered_final_cleaned_data.csv file. 
Since we only have postal codes, we'll first need to map them to their provinces. 
We'll use a dictionary mapping postal codes to provinces."""

# Load filtered_final_cleaned_data.csv; its "postal_code" column is mapped to provinces below.
df = pd.read_csv("../data/filtered_final_cleaned_data.csv")

# Define postal code ranges per province.
# Provinces whose codes are not contiguous are stored as the concatenation of
# two ranges; every value supports the `in` membership test used below.
postal_to_province = {
    "Antwerp": range(2000, 3000),
    "East-Flanders": range(9000, 10000),
    "West-Flanders": range(8000, 9000),
    "Flemish-Brabant": list(range(1500, 2000)) + list(range(3000, 3500)),
    "Brussels": range(1000, 1300),
    "Limburg": range(3500, 4000),
    "Liège": range(4000, 5000),
    "Namur": range(5000, 6000),
    "Hainaut": list(range(6000, 6600)) + list(range(7000, 8000)),
    "Luxembourg": range(6600, 7000),
    "Brabant-Wallon": range(1300, 1500)
}

# Helper function to find province for each postal code
def get_province(postal_code):
    """Return the province name for a postal code.

    Args:
        postal_code: A value convertible with ``int()``, e.g. ``2800``,
            ``2800.0`` or ``"2800"``.

    Returns:
        The key of ``postal_to_province`` whose codes contain the postal
        code, or ``"Unknown"`` when the value cannot be converted to an
        integer or is not covered by any listed range.
    """
    try:
        code = int(postal_code)
    except (TypeError, ValueError, OverflowError):
        # None and non-numeric text raise TypeError/ValueError, NaN raises
        # ValueError and +/-inf raises OverflowError. Anything else (e.g.
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return "Unknown"

    for province, codes in postal_to_province.items():
        if code in codes:
            return province
    return "Unknown"


# Derive each row's province from its postal code and store it in a new "province" column.
provinces = df["postal_code"].apply(get_province)
df["province"] = provinces

# Quick sanity check: show the first postal codes next to their mapped province.
# display(df.head())
preview_columns = ["postal_code", "province"]
print(df[preview_columns].head())

# Write the enriched dataset to a new file in the same data directory.
province_output_path = "../data/filtered_final_cleaned_data_province.csv"
df.to_csv(province_output_path, index=False)
print("CSV saved with 'province' column!")

# Load the new dataset with province column
#df = pd.read_csv("../data/raw/filtered_final_cleaned_data.csv")






   postal_code province
0         2800  Antwerp
1         2200  Antwerp
2         2840  Antwerp
3         2440  Antwerp
4         2300  Antwerp
CSV saved with 'province' column!


In [3]:
"""4) Filter by region (Wallonia vs Flanders vs Brussels)"""

# Create the region column once, starting from the CSV that already contains the "province" column.

df = pd.read_csv("../data/filtered_final_cleaned_data_province.csv")

def get_region(province):
    """Return the region that a province belongs to.

    Args:
        province: Province name as stored in the "province" column.

    Returns:
        ``'Flanders'``, ``'Wallonia'`` or ``'Brussels'`` when the province is
        listed for that region, and ``'Unknown'`` for any other value (such
        as the ``"Unknown"`` province placeholder or a missing value), so
        unmapped rows are not silently counted as Brussels.
    """
    flanders = ['Antwerp', 'Limburg', 'East-Flanders', 'West-Flanders', 'Flemish-Brabant']
    wallonia = ['Hainaut', 'Liège', 'Luxembourg', 'Namur', 'Brabant-Wallon']
    brussels = ['Brussels']

    if province in flanders:
        return 'Flanders'
    if province in wallonia:
        return 'Wallonia'
    if province in brussels:
        return 'Brussels'
    # Anything not explicitly listed above has no known region.
    return 'Unknown'

# Tag every row with its region, derived from the "province" column.
regions = df['province'].apply(get_region)
df['region'] = regions

# NOTE(review): the output filename spells "region" as "redgion"; it is kept
# unchanged here because other code may already read this exact path.
region_output_path = "../data/filtered_final_cleaned_data_province_redgion.csv"
df.to_csv(region_output_path, index=False)
