In [155]:
import pandas as pd
import math
import numpy as np

In [189]:
class CBSDataProcessor():
    """Combine the yearly CBS ``kwb`` spreadsheets (2012-2022) into one CSV.

    The 2012 file uses different column names and lacks the ``recs`` and
    ``gwb_code_10`` columns, so it is harmonised first. The 2013-2022 files
    are expected to already contain every column in ``columns_of_interest``
    (selecting them raises ``KeyError`` otherwise).

    Parameters
    ----------
    raw_dir : str
        Directory holding the raw ``kwb-<year>.xls`` / ``.xlsx`` files.
    output_path : str
        Destination of the combined CSV written by :meth:`save_data`.
    nrows : int or None
        Number of rows read from each 2013-2022 file. Defaults to 5, the
        sample size this notebook was developed with; pass ``None`` to read
        the complete sheets. The 2012 file is always read in full.
    """

    # RECS letter in the 2012 file -> prefix used when building ``gwb_code_10``.
    RECS_CODES = {'B': 'BU', 'W': 'WK', 'G': 'GM', 'N': 'NL'}
    # RECS letter in the 2012 file -> label stored in the ``recs`` column.
    RECS_NAMES = {'B': 'Buurt', 'W': 'Wijk', 'G': 'Gemeente', 'N': 'Land'}

    def __init__(self, raw_dir='../data/raw/cbs',
                 output_path='../data/processed/cbs/kwb-all.csv', nrows=5):
        self.raw_dir = raw_dir
        self.output_path = output_path
        self.nrows = nrows
        self.columns_of_interest = [
            'gwb_code_10', 'regio', 'gm_naam', 'recs',
            'a_inw', 'a_hh', 'a_woning', 'g_ele', 'g_gas', 'p_stadsv'
        ]

    def run(self):
        """Execute the full pipeline: read, harmonise, combine, format, save."""
        self.read_data_2012()
        self.change_column_names_2012()
        self.add_missing_columns_2012()
        self.add_woz_columns_pre2020()
        self.combine_data()
        self.format_data(columns_list=['g_ele', 'g_gas', 'p_stadsv'])
        self.save_data()

    def read_data_2012(self):
        """Load the 2012 spreadsheet into ``self.data_2012``.

        The code columns are passed through ``str`` while reading
        (presumably to preserve leading zeros - confirm against the file).
        """
        dtype_dict = {
            'WK_CODE': str, 'BU_CODE': str,
            'Code_10_pos12': str, 'GWB_CODE12': str,
            'GM_CODE': str
        }
        self.data_2012 = pd.read_excel(
            f'{self.raw_dir}/kwb-2012.xls', converters=dtype_dict
        )

    def change_column_names_2012(self):
        """Rename the 2012 columns to the names listed in ``columns_of_interest``."""
        column_conversion_dict = {
            'GWB_NAAM12_60POS': 'regio',
            'GEM_NAAM': 'gm_naam',
            'AANT_INW': 'a_inw',
            'AANTAL_HH': 'a_hh',
            'WONINGEN': 'a_woning',
            'P_ELEK_TOT': 'g_ele',
            'P_GAS_TOT': 'g_gas',
            'P_STADVERW': 'p_stadsv'
        }
        # Reassign instead of inplace=True so the method never mutates a
        # frame that something else may still reference.
        self.data_2012 = self.data_2012.rename(columns=column_conversion_dict)

    def add_missing_columns_2012(self):
        """Derive the ``recs``/``recs_code`` and ``gwb_code_10`` columns for 2012."""
        self._add_recs()
        self._add_gwb_code_10()

    def _add_recs(self):
        """Translate the one-letter ``RECS`` column into ``recs_code`` and ``recs``.

        Raises
        ------
        KeyError
            If ``RECS`` contains a value that is not one of B, W, G or N.
        """
        recs = self.data_2012['RECS']
        # Series.map would silently yield NaN for unknown letters; keep the
        # loud failure of a plain dict lookup instead.
        unknown = set(recs.unique()) - set(self.RECS_CODES)
        if unknown:
            raise KeyError(f'Unknown RECS value(s): {unknown}')
        self.data_2012['recs_code'] = recs.map(self.RECS_CODES)
        self.data_2012['recs'] = recs.map(self.RECS_NAMES)

    def _add_gwb_code_10(self):
        """Build ``gwb_code_10`` as recs_code + GM_CODE + WK_CODE + BU_CODE.

        Missing ``WK_CODE`` / ``BU_CODE`` values contribute an empty string.
        Requires ``recs_code`` to exist, i.e. :meth:`_add_recs` must run first.
        """
        def _make_code(row):
            # Use locals rather than assigning into ``row``: the row handed to
            # ``apply`` should not be modified.
            wk_code = '' if pd.isna(row.WK_CODE) else row.WK_CODE
            bu_code = '' if pd.isna(row.BU_CODE) else row.BU_CODE
            return f'{row.recs_code}{row.GM_CODE}{wk_code}{bu_code}'
        self.data_2012['gwb_code_10'] = self.data_2012.apply(_make_code, axis=1)

    def add_woz_columns_pre2020(self):
        """Placeholder: currently does nothing.

        TODO: add missing woz columns for pre-2020 data.
        """

    def _raw_path(self, year):
        """Return the path of the raw file for ``year`` (.xlsx after 2018, else .xls)."""
        extension = 'xlsx' if year > 2018 else 'xls'
        return f'{self.raw_dir}/kwb-{year}.{extension}'

    def combine_data(self):
        """Stack the harmonised 2012 data and the 2013-2022 files into ``self.data_all``.

        Every frame is reduced to ``columns_of_interest`` plus a ``year``
        column. ``self.data_2012`` is replaced by its reduced version.
        """
        # .assign returns a new frame, so adding ``year`` never writes into a
        # slice of the original (avoids SettingWithCopyWarning).
        self.data_2012 = self.data_2012[self.columns_of_interest].assign(year=2012)
        print('Appending year 2012...')
        df_list = [self.data_2012]

        for year in range(2013, 2023):
            print(f'Appending year {year}...')
            df = pd.read_excel(self._raw_path(year), nrows=self.nrows)
            df_list.append(df[self.columns_of_interest].assign(year=year))

        # ignore_index gives the combined frame unique row labels; without it
        # every yearly frame would contribute its own 0..n-1 labels.
        self.data_all = pd.concat(df_list, ignore_index=True)

    @staticmethod
    def _normalise_decimal(value):
        """Strip whitespace and swap a decimal comma for a point in strings only."""
        if isinstance(value, str):
            return value.strip().replace(',', '.')
        return value

    def format_data(self, columns_list):
        """Convert the given columns of ``self.data_all`` to numeric dtype.

        String cells are stripped and their comma replaced by a point before
        conversion; cells that are already numeric are kept as they are.
        Anything that still cannot be parsed becomes NaN.

        Parameters
        ----------
        columns_list : list of str
            Names of the columns to convert.
        """
        df = self.data_all
        for column in columns_list:
            # The .str accessor returns NaN for non-string elements (and
            # raises on a numeric column), which would discard values that
            # were already read as numbers - so normalise element-wise.
            cleaned = df[column].map(self._normalise_decimal)
            df[column] = pd.to_numeric(cleaned, errors='coerce')
        self.data_all = df

    def save_data(self):
        """Write ``self.data_all`` to ``self.output_path`` as CSV without the index."""
        file_path = self.output_path
        self.data_all.to_csv(file_path, index=False)
        print(f'Saved data to {file_path}')



In [217]:
# Build the processor and execute the whole pipeline; run() prints one
# "Appending year ..." line per year and the output path once the CSV is saved.
data_processor = CBSDataProcessor()
data_processor.run()

Appending year 2012...
Appending year 2013...
Appending year 2014...
Appending year 2015...
Appending year 2016...
Appending year 2017...
Appending year 2018...
Appending year 2019...
Appending year 2020...
Appending year 2021...
Appending year 2022...
Saved data to ../data/processed/cbs/kwb-all.csv
