In [20]:
import pandas as pd
from rapidfuzz import fuzz, process  # For fuzzy matching

In [4]:
def initialize_csv_cbs(file_path: str):
    """Write a header-only CSV for the fuzzy-match results.

    The file at ``file_path`` is created (or overwritten) with a single
    header row and no data rows.

    Args:
        file_path: Destination path of the CSV file.
    """
    header = "regio gm_naam recs gwb_code year old_name old_recs sim_score".split()

    # An empty frame carrying only column labels serialises to just the header line.
    pd.DataFrame(columns=header).to_csv(file_path, index=False)
    print(f"Initialized CSV at: {file_path}")

In [5]:
def save_matches_to_csv(matches, file_path):
    """Append match records to a CSV file without writing a header row.

    Args:
        matches: Records to append; anything ``pd.DataFrame`` accepts
            (the callers in this notebook pass a list of dicts).
        file_path: CSV file to append to.

    Nothing is written or printed when ``matches`` yields an empty frame.
    """
    frame = pd.DataFrame(matches)
    if frame.empty:
        return

    # Values are written in the frame's own column order; no header is
    # emitted, so that order has to agree with the header already in the file.
    frame.to_csv(file_path, mode='a', index=False, header=False)
    print(f"Appended {len(matches)} matches to {file_path}")

In [53]:
def simalarity_check_gemeente(row, df_old, name_col, code_col, gm_col, typ_R, woz):
    """Find the closest, non-identical name in ``df_old`` for one new row.

    Every row of ``df_old`` whose record type is compatible with the new row
    is scored with ``fuzz.ratio`` on the lowercased names. Only scores in
    the half-open range [70, 100) are considered, and the first row reaching
    the highest such score wins.

    Args:
        row: Row of the new table; read via ``'regio'``, ``'gm_naam'`` and ``'recs'``.
        df_old: Older table to search.
        name_col: Column of ``df_old`` holding the region name.
        code_col: Unused here; kept for a uniform call signature.
        gm_col: Column of ``df_old`` holding the municipality name.
        typ_R: Column of ``df_old`` holding the record type.
        woz: Column of ``df_old`` holding the WOZ value.

    Returns:
        tuple | None: ``(old name, old municipality, old record type, old WOZ,
        score)`` for the best candidate, or ``None`` when no candidate scores
        inside the accepted range.
    """
    top_score = 0
    top_candidate = None

    for _, candidate in df_old.iterrows():
        new_type = row['recs'].lower()
        old_type = candidate[typ_R].lower()

        # Only compare equal record types, plus the one allowed cross-type
        # pairing: a new 'wijk' against an old 'gemeente'.
        if new_type != old_type and not (new_type == 'wijk' and old_type == 'gemeente'):
            continue

        score = fuzz.ratio(row['regio'].lower(), candidate[name_col].lower())

        # Perfect scores (100) are deliberately excluded; ties keep the earlier row.
        if 70 <= score < 100 and score > top_score:
            top_score = score
            top_candidate = (
                candidate[name_col],
                candidate[gm_col],
                candidate[typ_R],
                candidate[woz],
                top_score,
            )

    if not top_candidate:
        return None

    print(f"Best match for {row['regio']} ({row['gm_naam']}): {top_candidate[0]} ({top_candidate[2]}), Score: {top_score}")
    return top_candidate



In [49]:
def compare_differences(df_old, df_new, name_col, code_col, gm_col, typ_R, woz, year):
    """Match the 'gemeente' rows of ``df_new`` against an older table.

    Each row of ``df_new`` whose ``'recs'`` is 'gemeente' (case-insensitive)
    is first looked up exactly on (name, municipality, record type), all
    compared in lowercase. Rows without an exact hit are handed to
    ``simalarity_check_gemeente``; rows that get no fuzzy hit either are
    reported as missing. Rows of any other record type are ignored.

    ``df_old`` is not modified: normalisation happens on a private copy.

    Args:
        df_old: Older table.
        df_new: Newer table with columns ``'regio'``, ``'gm_naam'``,
            ``'recs'`` and ``'gwb_code'``.
        name_col: Column of ``df_old`` holding the region name.
        code_col: Column of ``df_old`` holding the region code (only passed
            through to the fuzzy matcher).
        gm_col: Column of ``df_old`` holding the municipality name.
        typ_R: Column of ``df_old`` holding the record type.
        woz: Column of ``df_old`` holding the WOZ value.
        year: Value stored under ``'year'`` in every result record.

    Returns:
        tuple[list, list, list]: ``(matches, missing, second_match)`` where
        ``matches`` are the exact hits (with ``'gem_woz'``), ``missing`` the
        rows without any hit and ``second_match`` the fuzzy hits.
    """
    matches = []
    second_match = []
    missing = []

    # Normalise a private copy so the caller's frame keeps its original values.
    old_norm = df_old.copy()
    for text_col in (name_col, gm_col, typ_R):
        old_norm[text_col] = old_norm[text_col].fillna("").str.lower()
    old_norm[woz] = old_norm[woz].fillna(0)  # missing WOZ values become 0

    # (name, municipality, record type) -> WOZ of the first old row with that
    # key. A dict gives O(1) lookups and a deterministic winner for duplicates.
    woz_by_key = {}
    for old_name, old_gm, old_type, old_woz in zip(
        old_norm[name_col], old_norm[gm_col], old_norm[typ_R], old_norm[woz]
    ):
        woz_by_key.setdefault((old_name, old_gm, old_type), old_woz)

    for index, row in df_new.iterrows():
        if row['recs'].lower() != 'gemeente':
            continue

        key = (row['regio'].lower(), row['gm_naam'].lower(), row['recs'].lower())

        if key in woz_by_key:
            matches.append({
                "regio": row['regio'],
                "gm_naam": row['gm_naam'],
                "recs": row['recs'],
                "gwb_code": row['gwb_code'],
                "year": year,
                "gem_woz": woz_by_key[key],
            })
            continue

        # No exact hit: fall back to fuzzy matching on the normalised table.
        match = simalarity_check_gemeente(row, old_norm, name_col, code_col, gm_col, typ_R, woz)
        if match:
            second_match.append({
                "regio": row['regio'],
                "gm_naam": row['gm_naam'],
                "recs": row['recs'],
                "gwb_code": row['gwb_code'],
                "year": year,
                "old_name": match[0],
                "old_recs": match[2],
                "sim_score": match[4],
            })
        else:
            missing.append({
                "index": index,
                "regio": row['regio'],
                "gm_naam": row['gm_naam'],
            })

    return matches, missing, second_match

In [42]:
def cbs(file_path, years=range(4, 5)):
    """Compare older 'kwb' tables with the 2024 table and log the results.

    ``kwb-2024.xls`` is read once. For every requested year the file
    ``kwb-20YY.xls`` is read, ``compare_differences`` is run against the
    2024 table, one of the result lists is appended to ``file_path`` via
    ``save_matches_to_csv`` and the three result counts are printed.

    Args:
        file_path: CSV file the per-year results are appended to.
        years: Two-digit year numbers to process (``4`` means 2004).
            Defaults to ``range(4, 5)``, i.e. only 2004.

    Returns:
        tuple[list, list, list]: ``(matches, missing, second_match)`` of the
        last year that was processed successfully, or three empty lists when
        no year could be processed.
    """
    df_24 = pd.read_excel('kwb-2024.xls', usecols=['regio', 'gm_naam', 'recs', 'gwb_code_8', 'gwb_code', 'a_inw'], dtype={'gwb_code_8': str, 'gwb_code': str, 'regio': str})

    # Bound up front so the final return cannot hit an unassigned name when
    # every year fails.
    matches, missing, second_match = [], [], []

    for year in years:
        year = str(year).zfill(2)  # Ensure year is two digits
        year_full = f"20{year}"
        file = f'kwb-{year_full}.xls'

        try:
            year_num = int(year_full)

            # Column names differ per period of the source files.
            if year_num < 2012:
                name = f'GWB_NAAM{year}_60pos'
                code = f'GWB_CODE{year}'
                gm = 'GM_NAAM'
                typ_R = 'RECS'
                woz = 'WOZ'
            elif year_num == 2012:
                name = f'GWB_NAAM{year}_60POS'
                code = f'GWB_CODE{year}'
                gm = 'GM_NAAM'
                typ_R = 'RECS'
                # NOTE(review): the column filter previously asked for 'woz'
                # while this name was looked up afterwards, which could only
                # end in a KeyError. The filter now uses this name; confirm
                # it is the actual header in kwb-2012.xls.
                woz = 'woningwaarde'
            else:
                name = 'regio'
                code = 'gwb_code_8'
                gm = 'gm_naam'
                typ_R = 'recs'
                woz = 'g_wozbag'

            if year_num <= 2012:
                # Exact, case-insensitive header match.
                wanted = {col.lower() for col in (name, code, gm, woz, typ_R)}
                usecols = lambda x, wanted=wanted: x.lower().strip() in wanted
            else:
                # Substring match on the header.
                fragments = ('regio', 'gm_naam', 'recs', 'gwb_code_8', 'g_wozbag')
                usecols = lambda x, fragments=fragments: any(sub in x.lower().strip() for sub in fragments)

            df_cbs = pd.read_excel(file, usecols=usecols)

            matches, missing, second_match = compare_differences(df_cbs, df_24, name, code, gm, typ_R, woz, year_full)

            # NOTE(review): years before 2012 store the fuzzy matches, later
            # years store the exact matches; kept as in the original code.
            save_matches_to_csv(second_match if year_num < 2012 else matches, file_path)

            print(year_full)
            print(f"Matches found: {len(matches)}")
            print(f"Missing entries: {len(missing)}")
            print(f"Second match: {len(second_match)}")

        except Exception as e:
            # Best effort: any failure for this year (not only opening the
            # file) is reported here and the loop moves on.
            print(f"Could not open {file}: {e}")
            continue

    return matches, missing, second_match


In [54]:
# Main script: build the fuzzy-match ("mutations") CSV.
# The names assigned here are module-level and stay available to later cells.
if __name__ == "__main__":
    # Output file for the results (absolute, machine-specific path)
    file_path_2match = '/home/wouter/Documents/Scriptie/datacbs/cbs_mutations.csv'
    # Start from a fresh CSV that contains only the header row
    initialize_csv_cbs(file_path_2match)
    
    # Run the comparison; cbs() appends its results to the CSV itself
    matches, missing, second_match = cbs(file_path_2match)
    
    # # Finalize the formatting of the CSV file
    # finalize_csv_format(file_path)

Initialized CSV at: /home/wouter/Documents/Scriptie/datacbs/cbs_mutations.csv
Best match for Hengelo (O.) (Hengelo): hengelo (gld.) (gemeente), Score: 84.61538461538461
Best match for Bergen (NH.) (Bergen (NH.)): bergen (l.) (gemeente), Score: 86.95652173913044
Best match for Laren (NH.) (Laren): bergen (nh.) (gemeente), Score: 78.26086956521739
Best match for Rijswijk (ZH.) (Rijswijk): rijswijk (gemeente), Score: 72.72727272727273
Best match for Middelburg (Z.) (Middelburg): middelburg (gemeente), Score: 80.0
Best match for Beek (L.) (Beek): bergen (l.) (gemeente), Score: 80.0
Best match for Bergen (L.) (Bergen (L.)): bergen (nh.) (gemeente), Score: 86.95652173913044
Best match for Koggenland (Koggenland): wester-koggenland (gemeente), Score: 74.07407407407408
Best match for Lansingerland (Lansingerland): smallingerland (gemeente), Score: 74.07407407407408
Best match for Berkelland (Berkelland): dinkelland (gemeente), Score: 70.0
Best match for Dantumadiel (Dantumadiel): dantumadeel (

In [None]:
    # # Convert df_old names to lowercase for case-insensitive comparison
    # current_df[name_col] = current_df[name_col].fillna("").str.lower()
    # current_df[gm_col] = current_df[gm_col].fillna("").str.lower()
    # current_df[code_col] = current_df[code_col].fillna("").str.lower()
    # current_df[woz] = current_df[woz].fillna(0)  # Fill missing WOZ values with 0 or another placeholder

    # #df_old_set_name_gem = set(zip(df_old[gm_col], df_old[typ_R]))
    # df_old_set_name_R = set(zip(df_old[name_col], df_old[gm_col], df_old[typ_R], df_old[woz]))


In [3]:
def initialize_csv_cbs(file_path: str):
    """Write a header-only CSV with the full set of output columns.

    The file at ``file_path`` is created (or overwritten) with a single
    header row of 118 column names and no data rows.

    Args:
        file_path: Destination path of the CSV file.
    """
    # One whitespace-separated block instead of a long list literal; the
    # order of the names is the column order of the CSV.
    header = """
        regio gm_naam recs gwb_code year gem_woz a_inw a_man a_vrouw
        a_00_14 a_15_24 a_25_44 a_45_64 a_65_oo a_ongeh a_gehuwd a_gesch a_verwed
        a_nl_all a_eur_al a_neu_al a_geb_nl a_geb_eu a_geb_ne a_gbl_eu a_gbl_ne
        a_geb p_geb a_ste p_ste a_hh a_1p_hh a_hh_z_k a_hh_m_k g_hhgro bev_dich
        a_woning p_1gezw p_mgezw p_bewndw p_leegsw p_koopw p_huurw p_wcorpw p_ov_hw p_bj_me10 p_bj_mi10
        g_ele g_ele_ap g_ele_tw g_ele_hw g_ele_2w g_ele_vw g_ele_hu g_ele_ko
        g_gas g_gas_ap g_gas_tw g_gas_hw g_gas_2w g_gas_vw g_gas_hu g_gas_ko
        p_stadsv a_opl_lg a_opl_md a_opl_hg p_arb_pp p_arb_wn p_arb_zs
        a_inkont g_ink_po g_ink_pi p_ink_li p_ink_hi
        g_hh_sti p_hh_li p_hh_hi p_hh_lkk p_hh_osm p_hh_110 p_hh_120 m_hh_ver a_soz_wb a_soz_ao a_soz_ww a_soz_ow
        a_jz_tn p_jz_tn a_wmo_t p_wmo_t a_bedv a_bed_a a_bed_bf a_bed_gi a_bed_hj a_bed_kl a_bed_mn a_bed_oq a_bed_ru
        a_pau a_bst_b a_bst_nb g_pau_hh g_pau_km a_m2w g_afs_hp g_afs_gs g_afs_kv g_afs_sc g_3km_sc
        a_opp_ha a_lan_ha a_wat_ha pst_mvp pst_dekp ste_mvs ste_oad
    """.split()

    # An empty frame carrying only column labels serialises to just the header line.
    pd.DataFrame(columns=header).to_csv(file_path, index=False)
    print(f"Initialized CSV at: {file_path}")


SyntaxError: unterminated string literal (detected at line 8) (3618308814.py, line 8)

In [2]:
def finalize_csv_format(file_path: str):
    """Sort the rows of a CSV file and write it back in place.

    Rows are ordered by ``gm_naam``, then by ``gwb_code`` without its first
    two characters (compared as text), then by ``year``, all ascending.
    Any error is caught and printed instead of being raised.

    Args:
        file_path: CSV file to read, sort and overwrite.
    """
    sort_key = "gwb_code_numeric"  # temporary helper column, removed before saving

    try:
        table = pd.read_csv(file_path)

        # Strip the two leading characters of the code to get the sort key.
        table[sort_key] = table["gwb_code"].str[2:]

        ordered = table.sort_values(
            by=["gm_naam", sort_key, "year"],
            ascending=[True, True, True],
        )
        ordered = ordered.drop(columns=[sort_key])

        # Overwrite the input file with the sorted rows.
        ordered.to_csv(file_path, index=False)
        print(f"CSV file '{file_path}' has been formatted and saved.")
    except Exception as e:
        print(f"Error during finalizing CSV: {e}")


In [6]:
def save_matches_to_csv(matches, file_path):
    """Append match records to a CSV file without writing a header row.

    Args:
        matches: Records to append; anything ``pd.DataFrame`` accepts
            (the callers in this notebook pass a list of dicts).
        file_path: CSV file to append to.

    Nothing is written or printed when ``matches`` yields an empty frame.
    """
    records = pd.DataFrame(matches)
    if records.empty:
        return

    # Append-only, header-less write: values land in the frame's own column
    # order under whatever header the file already has.
    records.to_csv(file_path, mode='a', index=False, header=False)
    print(f"Appended {len(matches)} matches to {file_path}")

In [7]:
#def check_with_prev_df(current_df, next_df, name_col, code_col, gm_col, typ_R, woz, wbi, year):
def check_with_prev_df(current_df, next_df, name_col, code_col, gm_col, typ_R, woz, wbi, year):
    """Attach the WOZ value of ``current_df`` to the rows of ``next_df``.

    For every row of ``next_df`` the indicator in column ``wbi`` decides how
    the counterpart in ``current_df`` is found:

    * missing indicator: the row is skipped;
    * ``1``: first row of ``current_df`` with the same ``code_col`` value;
    * ``2``: first row of ``current_df`` with the same ``gm_col`` value
      (a deliberate simplification);
    * any other value: the row is skipped.

    Args:
        current_df: Table the WOZ values are taken from.
        next_df: Table whose rows are being matched.
        name_col: Column holding the region name.
        code_col: Column holding the region code.
        gm_col: Column holding the municipality name.
        typ_R: Column holding the record type.
        woz: Column of ``current_df`` holding the WOZ value.
        wbi: Column of ``next_df`` holding the indicator.
        year: Value stored under ``'year'`` in every result record.

    Returns:
        list[dict]: One record per matched row with the keys ``regio``,
        ``gm_naam``, ``recs``, ``gwb_code``, ``year`` and ``gem_woz``.
    """
    def _first_woz_by(key_col):
        """Map each non-missing key to the WOZ value of its first row."""
        lookup = {}
        for key, value in zip(current_df[key_col], current_df[woz]):
            # Missing keys never matched under the former `==` comparison.
            if pd.notna(key):
                lookup.setdefault(key, value)
        return lookup

    # Built once instead of filtering the whole frame for every row.
    woz_by_code = _first_woz_by(code_col)
    woz_by_gm = None  # built lazily: only needed when an indicator 2 occurs

    matches = []

    for _, row in next_df.iterrows():
        if pd.isna(row[wbi]):
            continue

        flag = int(row[wbi])

        if flag == 1:
            lookup, key = woz_by_code, row[code_col]
        elif flag == 2:
            if woz_by_gm is None:
                woz_by_gm = _first_woz_by(gm_col)
            lookup, key = woz_by_gm, row[gm_col]
        else:
            continue

        if key in lookup:
            matches.append({
                "regio": row[name_col],
                "gm_naam": row[gm_col],
                "recs": row[typ_R],
                "gwb_code": row[code_col],
                "year": year,
                "gem_woz": lookup[key],
            })

    return matches
            
           

    

In [87]:
def filter_wbi(df, wbi_col):
    """Keep the rows whose indicator is 1, 2 or missing.

    The indicator column is converted with ``pd.to_numeric(...,
    errors='coerce')``, so values that cannot be parsed count as missing
    and are kept. The input frame is left unchanged; the returned frame
    carries the numeric version of the column.

    Args:
        df: Table to filter.
        wbi_col: Name of the indicator column.

    Returns:
        pd.DataFrame: A new frame with the selected rows.
    """
    wbi_numeric = pd.to_numeric(df[wbi_col], errors='coerce')
    keep = wbi_numeric.isin([1, 2]) | wbi_numeric.isna()

    # Work on a copy so the caller's column is not overwritten.
    filtered_df = df.loc[keep].copy()
    filtered_df[wbi_col] = wbi_numeric[keep].to_numpy()

    return filtered_df
    

In [121]:
def wbi_load(file_path, years=range(24, 22, -1)):
    """Walk backwards through the yearly 'kwb' files and collect WOZ matches.

    For every requested year after 2012 the file ``kwb-20YY.xls`` is read and
    filtered with ``filter_wbi``. The first year that loads successfully only
    serves as the starting point. Every later iteration runs
    ``check_with_prev_df`` with the unfiltered table of the current year and
    the filtered table of the previously loaded year, then appends the
    result to ``file_path``.

    Args:
        file_path: CSV file the per-year matches are appended to.
        years: Two-digit year numbers in processing order, newest first.
            Defaults to ``range(24, 22, -1)``, i.e. 2024 then 2023.

    Returns:
        list[dict]: All match records of all processed years.
    """
    matches = []

    # Filtered table of the year handled in the previous iteration.
    # NOTE(review): when a year in the middle fails to load, the next year is
    # compared against a table that is more than one year newer.
    df_next_year = None

    for year in years:
        year = str(year).zfill(2)  # Ensure year is two digits
        year_full = f"20{year}"
        file = f'kwb-{year_full}.xls'

        try:
            if int(year_full) > 2012:
                name_col = 'regio'
                code_col = 'gwb_code'
                gm_col = 'gm_naam'
                typ_R = 'recs'
                wbi = 'ind_wbi'

                # The WOZ column was renamed after 2019.
                if int(year_full) <= 2019:
                    woz = 'g_woz'
                else:
                    woz = 'g_wozbag'

                df_cur_year = pd.read_excel(
                    file,
                    usecols=lambda x: any(sub in x.lower().strip() for sub in ['regio', 'gm_naam', 'recs', 'gwb_code', woz, 'ind_wbi']),
                    dtype={'gwb_code': str, 'regio': str}
                )

                df_cur_year_filt = filter_wbi(df_cur_year, wbi)

                # Without a newer table there is nothing to compare against,
                # so the first loaded year is only stored.
                if df_next_year is not None:
                    print(f'Processing year: {year_full}')
                    year_matches = check_with_prev_df(
                        df_cur_year,
                        df_next_year,
                        name_col,
                        code_col,
                        gm_col,
                        typ_R,
                        woz,
                        wbi,
                        year_full
                    )

                    matches.extend(year_matches)
                    save_matches_to_csv(year_matches, file_path)

                df_next_year = df_cur_year_filt

        except Exception as e:
            # Best effort: any failure for this year is reported and skipped.
            print(f"Could not open {file}: {e}")
            continue

    return matches


In [124]:
# Main script: build the final CSV from the year-over-year matches.
# The names assigned here are module-level and stay available to later cells.
if __name__ == "__main__":
    # Output file for the results (absolute, machine-specific path)
    file_path = '/home/wouter/Documents/Scriptie/datacbs/cbs_final.csv'
    
    # Start from a fresh header-only CSV; wbi_load() appends to it per year
    initialize_csv_cbs(file_path)
    matches = wbi_load(file_path)
    
    # Sort the rows of the finished CSV in place
    finalize_csv_format(file_path)

Initialized CSV at: /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2023
Appended 17351 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2022
Appended 17422 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2021
Appended 16851 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2020
Appended 16001 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2019
Appended 16196 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2018
Appended 14955 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2017
Appended 15368 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2016
Appended 15137 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processing year: 2015
Appended 14018 matches to /home/wouter/Documents/Scriptie/datacbs/cbs_final.csv
Processi

In [7]:
import pandas as pd

def count_wbi_3(df, year, wbi_column, regio_column, woz_column):
    """Collect the region name and WOZ value of every row whose indicator is 3.

    Args:
        df (pd.DataFrame): Table of one year.
        year (str): Year label copied into every result record.
        wbi_column (str): Column holding the indicator.
        regio_column (str): Column holding the region name.
        woz_column (str): Column holding the WOZ value.

    Returns:
        list: One dict per selected row with the keys ``'year'``,
        ``'regio'`` and ``'woz'``, in table order.
    """
    is_three = df[wbi_column] == 3

    return [
        {
            'year': year,
            'regio': record[regio_column],
            'woz': record[woz_column],
        }
        for _, record in df[is_three].iterrows()
    ]

In [9]:
def wbi_check_on_3():
    """Collect all rows with indicator 3 from the 2024 and 2023 'kwb' files.

    Each file ``kwb-20YY.xls`` is read with a reduced set of columns and
    passed to ``count_wbi_3``. A year that cannot be processed is reported
    on stdout and skipped.

    Returns:
        list: The concatenated records returned by ``count_wbi_3`` for
        every processed year.
    """
    collected = []

    for short_year in range(24, 22, -1):  # Adjust range as needed for other years
        year = str(short_year).zfill(2)  # always two digits
        year_full = f"20{year}"
        file = f'kwb-{year_full}.xls'

        try:
            if int(year_full) > 2012:
                # The WOZ column has a different name up to and including 2019.
                woz = 'g_woz' if int(year_full) <= 2019 else 'g_wozbag'
                wanted = ['regio', 'gm_naam', 'recs', 'gwb_code', woz, 'ind_wbi']

                df_cur_year = pd.read_excel(
                    file,
                    # keep every column whose header contains one of the fragments
                    usecols=lambda x: any(sub in x.lower().strip() for sub in wanted),
                    dtype={'gwb_code': str, 'regio': str},
                )

                print(f'Processing year: {year_full}')
                collected.extend(count_wbi_3(df_cur_year, year_full, 'ind_wbi', 'regio', woz))

        except Exception as e:
            print(f"Could not open {file}: {e}")
            continue

    return collected


In [10]:
# Collect every indicator-3 record; `list3` is reused by the cells below.
if __name__ == "__main__":
  
    list3 = wbi_check_on_3()
    # Prints the whole list on one line, which can be very long.
    print(list3)

Processing year: 2024
Processing year: 2023
[{'year': '2024', 'regio': 'Dokkumer Ie e.o.', 'woz': '.'}, {'year': '2024', 'regio': 'Jelsum', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Jelsum', 'woz': '.'}, {'year': '2024', 'regio': 'Koarnjum', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Koarnjum', 'woz': '.'}, {'year': '2024', 'regio': 'Britsum', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Britsum', 'woz': '.'}, {'year': '2024', 'regio': 'Stiens e.o.', 'woz': '.'}, {'year': '2024', 'regio': 'Stiens', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Stiens', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Hempens', 'woz': '.'}, {'year': '2024', 'regio': 'Zuiderburen', 'woz': '.'}, {'year': '2024', 'regio': 'Middelsee', 'woz': '.'}, {'year': '2024', 'regio': 'Swichum', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Wirdum', 'woz': '.'}, {'year': '2024', 'regio': 'Mantgum', 'woz': '.'}, {'year': '2024', 'regio': 'Buitengebied Mantgum', 'woz': '.'}

In [11]:
# Print every collected record on its own line (needs `list3` from the cell above).
for row in list3:
    print(row)

{'year': '2024', 'regio': 'Dokkumer Ie e.o.', 'woz': '.'}
{'year': '2024', 'regio': 'Jelsum', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Jelsum', 'woz': '.'}
{'year': '2024', 'regio': 'Koarnjum', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Koarnjum', 'woz': '.'}
{'year': '2024', 'regio': 'Britsum', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Britsum', 'woz': '.'}
{'year': '2024', 'regio': 'Stiens e.o.', 'woz': '.'}
{'year': '2024', 'regio': 'Stiens', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Stiens', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Hempens', 'woz': '.'}
{'year': '2024', 'regio': 'Zuiderburen', 'woz': '.'}
{'year': '2024', 'regio': 'Middelsee', 'woz': '.'}
{'year': '2024', 'regio': 'Swichum', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Wirdum', 'woz': '.'}
{'year': '2024', 'regio': 'Mantgum', 'woz': '.'}
{'year': '2024', 'regio': 'Buitengebied Mantgum', 'woz': '.'}
{'year': '2024', 'regio': 'Weidum', 'woz': '.'}
{'year': '20

In [13]:
# This expression is not the last line of the cell, so its value is not displayed.
len(list3)
# Load the 2024 table with all of its columns into the module-level `df_24`.
df_24 = pd.read_excel('kwb-2024.xls')

In [14]:
# Number of rows in the 2024 table loaded in the previous cell.
print(len(df_24))

18310
