In [3]:
import pandas as pd
import geopandas as gpd
import shapely
from scipy.stats import pearsonr
import numpy as np
from libpysal import weights
import time

from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification

import os
from itertools import zip_longest



In [4]:
def grouper(iterable_obj, count, fillvalue=None):
    args = [iter(iterable_obj)] * count
    return zip_longest(*args, fillvalue=fillvalue)

In [5]:
nadf = pd.DataFrame(columns = ["data2015", "residential2015","data2016", "residential2016","data2017", 
                     "residential2017","data2018", "residential2018","data2019", "residential2019"])

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [65]:
def clean_it(PATH):
    for filename in grouper(os.listdir(PATH), 5, np.NaN):
        print(filename)
        
        data2015 = pd.read_csv(PATH + filename[0])
        data2016 = pd.read_csv(PATH + filename[1])
        data2017 = pd.read_csv(PATH + filename[2])
        data2018 = pd.read_csv(PATH + filename[3])
        data2019 = pd.read_csv(PATH + filename[4])

        data2015.rename(columns = {"p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
        data2016.rename(columns = {"p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
        data2017.rename(columns = {"p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
        data2018.rename(columns = {"p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
        data2019.rename(columns = {"p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)

        data2015 = data2015.replace({-99997 : np.NaN, np.inf : np.NaN})
        data2016 = data2016.replace({-99997 : np.NaN, np.inf : np.NaN})
        data2017 = data2017.replace({-99997 : np.NaN, np.inf : np.NaN})
        data2018 = data2018.replace({-99997 : np.NaN, np.inf : np.NaN})
        data2019 = data2019.replace({-99997 : np.NaN, np.inf : np.NaN})

        b15 = data2015[data2015.woonfunctiesum>0]
        b16 = data2016[data2016.woonfunctiesum>0]
        b17 = data2017[data2017.woonfunctiesum>0]
        b18 = data2018[data2018.woonfunctiesum>0]
        b19 = data2019[data2019.woonfunctiesum>0]

        # set percentage lower than 0 to na
        b15[b15["p_benefits_g"] < 0] = np.NaN 
        b16[b16["p_benefits_g"] < 0] = np.NaN 
        b17[b17["p_benefits_g"] < 0] = np.NaN 
        b18[b18["p_benefits_g"] < 0] = np.NaN 
        b19[b19["p_benefits_g"] < 0] = np.NaN 

        def grid_filler(data, grid_column, buurt_column):
            print(grid_column + " missing before: {}".format(data[grid_column].isna().sum()))
            missing_grid_data = data[grid_column].isna().index.values
            data.loc[missing_grid_data, grid_column] = data.loc[missing_grid_data, buurt_column]
            print(grid_column + " missing after: {}".format(data[grid_column].isna().sum()))
            return data[grid_column]

        grid_variables = []
        neighborhood_variables = []
        for var in data2015.columns:
            if var.endswith("_g"):
                grid_variables.append(var)
            elif var.endswith("_b"):
                neighborhood_variables.append(var)

        grid_variables.sort(key=str.lower)
        neighborhood_variables.sort(key=str.lower)

        ls = list()
        for i, j in zip(grid_variables, neighborhood_variables):
            ls.append((i,j))
        print(ls)

        for i in ls:
            data2015[i[0]] = grid_filler(data2015, i[0], i[1])
            data2016[i[0]] = grid_filler(data2016, i[0], i[1])
            data2017[i[0]] = grid_filler(data2017, i[0], i[1])
            data2018[i[0]] = grid_filler(data2018, i[0], i[1])
            data2019[i[0]] = grid_filler(data2019, i[0], i[1])

        var = ['geometry', 'bijeenkomstfunctiesum',
    #        'celfunctie', 
               'gezondheidszorgfunctiesum', 'industriefunctiesum',
           'kantoorfunctiesum', 'logiesfunctiesum', 'onderwijsfunctiesum', 'sportfunctiesum',
           'winkelfunctiesum', 'woonfunctiesum', 'oppervlakteVerblijfsobjectmean', 'BU_CODE',
           'y', 'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
           'inw_4564_g', 'inw_65_g', 'p_NL', 'p_western_g', 'p_nonWestern_g',
           'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
           'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
           'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
           'afs_transit_g', 'mean_WOZ', 
            'PLaagste40Inkomen', 'PHoogste20Inkomen','bouwjaarmean', 'bouwjaarmin', 'bouwjaarmax']
        t15 = data2015[var]
        t16 = data2016[var]
        t17 = data2017[var]
        t18 = data2018[var]
        t19 = data2019[var]


        t15["n_missing"] = t15.isna().sum(axis=1)
        t16["n_missing"] = t16.isna().sum(axis=1)
        t17["n_missing"] = t17.isna().sum(axis=1)
        t18["n_missing"] = t18.isna().sum(axis=1)
        t19["n_missing"] = t19.isna().sum(axis=1)

        t15b = t15[(t15.woonfunctiesum > 0) & (t15.n_missing < 15)]
        t16b = t16[(t16.woonfunctiesum > 0) & (t16.n_missing < 15)]
        t17b = t17[(t17.woonfunctiesum > 0) & (t17.n_missing < 15)]
        t18b = t18[(t18.woonfunctiesum > 0) & (t18.n_missing < 15)]
        t19b = t19[(t19.woonfunctiesum > 0) & (t19.n_missing < 15)]

        t15b = gpd.GeoDataFrame(t15b)
        t15b.geometry = t15b.geometry.apply(shapely.wkt.loads)

        t16b = gpd.GeoDataFrame(t16b)
        t16b.geometry = t16b.geometry.apply(shapely.wkt.loads)

        t17b = gpd.GeoDataFrame(t17b)
        t17b.geometry = t17b.geometry.apply(shapely.wkt.loads)

        t18b = gpd.GeoDataFrame(t18b)
        t18b.geometry = t18b.geometry.apply(shapely.wkt.loads)

        t19b = gpd.GeoDataFrame(t19b)
        t19b.geometry = t19b.geometry.apply(shapely.wkt.loads)

        var2 = ['bijeenkomstfunctiesum', 'geometry',
    #        'celfunctie', 
                'gezondheidszorgfunctiesum', 'industriefunctiesum',
           'kantoorfunctiesum', 'logiesfunctiesum', 'onderwijsfunctiesum', 'sportfunctiesum',
           'winkelfunctiesum', 'woonfunctiesum', 'oppervlakteVerblijfsobjectmean', 
            'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
           'inw_4564_g', 'inw_65_g', 'p_western_g', 'p_nonWestern_g',
           'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
           'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
           'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
           'afs_transit_g', 'mean_WOZ', 
            'PLaagste40Inkomen', 'PHoogste20Inkomen', 'bouwjaarmean', 'bouwjaarmin', 'bouwjaarmax']

        t15b = t15b[var2]
        t16b = t16b[var2]
        t17b = t17b[var2]
        t18b = t18b[var2]
        t19b = t19b[var2]

        def fill_final_na(data):
            t0 = time.time()
            total_na = data.isna().sum().sum()
            all_geom = data[["geometry", "sportfunctiesum"]]

            data = data.drop(["geometry"], axis = 1)
            incomplete = data[data.isnull().any(axis=1)]
            complete = gpd.GeoDataFrame(data[~data.isnull().any(axis=1)]) # store the rows that have no na values



            # create list of combinatoins of missing columns
            inc_list = list()
            for _, i in incomplete.iterrows():
                if not tuple(i.index[i.isna()]) in inc_list:
                    inc_list.append(tuple(i.index[i.isna()]))

        #   Create weightsmatrix and transform to row_standardized            
        #     w_all = weights.distance.KNN.from_dataframe(all_geom, k = 4, silence_warnings = True)
        #     w_all.transform = 'r'


            columns = data.columns

            # loop over the missing column combinations
            def train_creater(inp, w, train = "train", test = False):
                """
                returns: new dataframe including the neighbors data
                """
                # Get neigbors per index
                neighbors = [ w.neighbors[k] for k in inp.index.values.tolist()]
                if test:

                    df = pd.concat([inp, train])

                    rows = [ np.concatenate(df.loc[j].values) for j in neighbors] 
                    distances = [list(all_geom.loc[neigh].distance(all_geom.loc[df.iloc[i].name].geometry).values) for i, neigh in enumerate(neighbors)]
                else:
                    # get rows of the neigbors, tranform to np and concatenate them
                    rows = [ np.concatenate(inp.loc[j].values) for j in neighbors] 
                    distances = [list(all_geom.loc[neigh].distance(all_geom.loc[inp.iloc[i].name].geometry).values) for i, neigh in enumerate(neighbors)]


                return pd.concat([pd.DataFrame(rows).set_index(inp.index), inp, pd.DataFrame(distances).set_index(inp.index) ], axis=1)

            for i, cols in enumerate(inc_list):
                cols = list(cols)
                x_train = complete.drop(cols, axis = 1) # only keep the cols to trained upon
                y_train = complete[cols] # y has the na values to be predicted

                w_train  = weights.distance.KNN.from_dataframe(all_geom.loc[x_train.index], k = 4, silence_warnings = True)
                w_train.transform = 'r'

                X_train = train_creater(x_train, w_train)

                # Voor x predict only keep the rows where there are only NA's in cols
                pred = incomplete[(incomplete[cols].isna().all(axis = 1) & (incomplete[columns.drop(cols)].notna().all(axis = 1)))]
                X_pred = pred.drop(cols, axis = 1)

                # create prediction weights of missing rows as well as known rows
                w_pred =  weights.distance.KNN.from_dataframe(all_geom.loc[X_train.index.append(X_pred.index)], k = 4, silence_warnings = True)
                w_pred.transform = 'r'
                X_pred = train_creater(X_pred, w_pred, x_train, True)

                # loop over each missing variable in cols and create model per 
                for col in cols:

                    clf = RandomForestRegressor(max_depth=6, random_state=0)
                    clf.fit(X_train, y_train[col].values.ravel())    
                    y_pred = clf.predict(X_pred)
                    incomplete.loc[X_pred.index, col] = y_pred

                print("{} na's left of original {}, {:.2f}% of cols done, time spend: {:.0f}".format(incomplete.isna().sum().sum(),  total_na, i / len(inc_list) * 100, time.time() - t0))
            print("total time: {:.0f}".format(time.time() - t0))
            return pd.concat([incomplete, complete])

        t15b = fill_final_na(t15b)
        t16b = fill_final_na(t16b)
        t17b = fill_final_na(t17b)
        t18b = fill_final_na(t18b)
        t19b = fill_final_na(t19b)

        t15b = t15b.join(data2015[["C28992R100", "geometry", "y", "BU_CODE"]])
        t16b = t16b.join(data2016[["C28992R100", "geometry", "y", "BU_CODE"]])
        t17b = t17b.join(data2017[["C28992R100", "geometry", "y", "BU_CODE"]])
        t18b = t18b.join(data2018[["C28992R100", "geometry", "y", "BU_CODE"]])
        t19b = t19b.join(data2019[["C28992R100", "geometry", "y", "BU_CODE"]])

        t15b.to_csv("../Data/filled/2015/" + filename[0] + "_filled_.csv")
        t16b.to_csv("../Data/filled/2016/" + filename[1] + "_filled_.csv")
        t17b.to_csv("../Data/filled/2017/" + filename[2] + "_filled_.csv")
        t18b.to_csv("../Data/filled/2018/" + filename[3] + "_filled_.csv")
        t19b.to_csv("../Data/filled/2019/" + filename[4] + "_filled_.csv")


In [66]:
a= clean_it("../Data/snapshot/")

('Heerenveen15.csv', 'Heerenveen16.csv', 'Heerenveen17.csv', 'Heerenveen18.csv', 'Heerenveen19.csv')
[('afs_haprak_g', 'afs_huisarts_b'), ('afs_oprit_g', 'afs_oprit_b'), ('afs_superm_g', 'afs_supermarkt_b'), ('afs_train_g', 'afs_train_b'), ('afs_transit_g', 'afs_transit_b'), ('afs_vo_g', 'afs_vo_b'), ('afs_ziek_g', 'afs_ziekenhuis_b'), ('avg_electricity_g', 'avg_electicity_b'), ('avg_gas_g', 'avg_gas_b'), ('geboorte_g', 'birth_b'), ('inw_014_g', 'inw14_b'), ('inw_1524_g', 'inw24_b'), ('inw_2544_g', 'inw44_b'), ('inw_4564_g', 'inw64_b'), ('inw_65_g', 'inw65_b'), ('inwoner_g', 'Inwoners_b'), ('medianIncome_g', 'medianInkomen_b'), ('p_benefits_g', 'p_benefits_b'), ('p_buildAfter2000_g', 'p_buildAfter2000_b'), ('p_buildBefore2000_g', 'p_buildBefore2000_b'), ('p_buyhouses_g', 'p_buyHouses_b'), ('p_nonWestern_g', 'p_nonWestern_b'), ('p_rentals_g', 'p_rental_houses_b'), ('p_socialHousing_g', 'p_socialHousing_b'), ('p_western_g', 'western_b')]
afs_haprak_g missing before: 19718
afs_haprak_g mi

127 na's left of original 235, 0.00% of cols done, time spend: 6
124 na's left of original 235, 16.67% of cols done, time spend: 8
120 na's left of original 235, 33.33% of cols done, time spend: 14
57 na's left of original 235, 50.00% of cols done, time spend: 18
12 na's left of original 235, 66.67% of cols done, time spend: 28
0 na's left of original 235, 83.33% of cols done, time spend: 38
total time: 38
937 na's left of original 981, 0.00% of cols done, time spend: 5
892 na's left of original 981, 11.11% of cols done, time spend: 8
267 na's left of original 981, 22.22% of cols done, time spend: 15
105 na's left of original 981, 33.33% of cols done, time spend: 23
49 na's left of original 981, 44.44% of cols done, time spend: 30
38 na's left of original 981, 55.56% of cols done, time spend: 38
14 na's left of original 981, 66.67% of cols done, time spend: 47
6 na's left of original 981, 77.78% of cols done, time spend: 54
0 na's left of original 981, 88.89% of cols done, time spend: 

inwoner_g missing after: 8013
inwoner_g missing before: 9355
inwoner_g missing after: 8013
inwoner_g missing before: 9351
inwoner_g missing after: 8013
inwoner_g missing before: 9322
inwoner_g missing after: 8013
medianIncome_g missing before: 9373
medianIncome_g missing after: 8877
medianIncome_g missing before: 9360
medianIncome_g missing after: 8870
medianIncome_g missing before: 9339
medianIncome_g missing after: 8853
medianIncome_g missing before: 9335
medianIncome_g missing after: 8875
medianIncome_g missing before: 11160
medianIncome_g missing after: 10473
p_benefits_g missing before: 9373
p_benefits_g missing after: 8535
p_benefits_g missing before: 9360
p_benefits_g missing after: 8549
p_benefits_g missing before: 9339
p_benefits_g missing after: 8489
p_benefits_g missing before: 9335
p_benefits_g missing after: 8517
p_benefits_g missing before: 9308
p_benefits_g missing after: 8505
p_buildAfter2000_g missing before: 9373
p_buildAfter2000_g missing after: 8542
p_buildAfter2000

864 na's left of original 5172, 50.00% of cols done, time spend: 42
840 na's left of original 5172, 54.55% of cols done, time spend: 44
824 na's left of original 5172, 59.09% of cols done, time spend: 48
814 na's left of original 5172, 63.64% of cols done, time spend: 53
750 na's left of original 5172, 68.18% of cols done, time spend: 57
80 na's left of original 5172, 72.73% of cols done, time spend: 61
72 na's left of original 5172, 77.27% of cols done, time spend: 65
42 na's left of original 5172, 81.82% of cols done, time spend: 67
22 na's left of original 5172, 86.36% of cols done, time spend: 70
10 na's left of original 5172, 90.91% of cols done, time spend: 76
0 na's left of original 5172, 95.45% of cols done, time spend: 81
total time: 81
('Maastricht15.csv', 'Maastricht16.csv', 'Maastricht17.csv', 'Maastricht18.csv', 'Maastricht19.csv')
[('afs_haprak_g', 'afs_huisarts_b'), ('afs_oprit_g', 'afs_oprit_b'), ('afs_superm_g', 'afs_supermarkt_b'), ('afs_train_g', 'afs_train_b'), ('af

990 na's left of original 1089, 0.00% of cols done, time spend: 46
840 na's left of original 1089, 8.33% of cols done, time spend: 66
635 na's left of original 1089, 16.67% of cols done, time spend: 99
523 na's left of original 1089, 25.00% of cols done, time spend: 138
239 na's left of original 1089, 33.33% of cols done, time spend: 163
228 na's left of original 1089, 41.67% of cols done, time spend: 204
179 na's left of original 1089, 50.00% of cols done, time spend: 237
69 na's left of original 1089, 58.33% of cols done, time spend: 283
60 na's left of original 1089, 66.67% of cols done, time spend: 320
48 na's left of original 1089, 75.00% of cols done, time spend: 352
30 na's left of original 1089, 83.33% of cols done, time spend: 381
0 na's left of original 1089, 91.67% of cols done, time spend: 400
total time: 400
1025 na's left of original 1124, 0.00% of cols done, time spend: 42
866 na's left of original 1124, 7.69% of cols done, time spend: 61
758 na's left of original 1124, 

inw_2544_g missing before: 9863
inw_2544_g missing after: 6900
inw_2544_g missing before: 9864
inw_2544_g missing after: 7119
inw_2544_g missing before: 9831
inw_2544_g missing after: 7119
inw_2544_g missing before: 9761
inw_2544_g missing after: 7119
inw_2544_g missing before: 9760
inw_2544_g missing after: 7122
inw_4564_g missing before: 10325
inw_4564_g missing after: 6900
inw_4564_g missing before: 10292
inw_4564_g missing after: 7119
inw_4564_g missing before: 10221
inw_4564_g missing after: 7119
inw_4564_g missing before: 10133
inw_4564_g missing after: 7119
inw_4564_g missing before: 10072
inw_4564_g missing after: 7122
inw_65_g missing before: 9177
inw_65_g missing after: 6900
inw_65_g missing before: 9164
inw_65_g missing after: 7119
inw_65_g missing before: 9134
inw_65_g missing after: 7119
inw_65_g missing before: 9076
inw_65_g missing after: 7119
inw_65_g missing before: 9062
inw_65_g missing after: 7122
inwoner_g missing before: 8793
inwoner_g missing after: 6861
inwoner_g

982 na's left of original 1364, 21.43% of cols done, time spend: 125
923 na's left of original 1364, 28.57% of cols done, time spend: 138
908 na's left of original 1364, 35.71% of cols done, time spend: 172
608 na's left of original 1364, 42.86% of cols done, time spend: 229
298 na's left of original 1364, 50.00% of cols done, time spend: 264
266 na's left of original 1364, 57.14% of cols done, time spend: 277
138 na's left of original 1364, 64.29% of cols done, time spend: 320
57 na's left of original 1364, 71.43% of cols done, time spend: 373
44 na's left of original 1364, 78.57% of cols done, time spend: 429
40 na's left of original 1364, 85.71% of cols done, time spend: 457
0 na's left of original 1364, 92.86% of cols done, time spend: 505
total time: 505
1439 na's left of original 2444, 0.00% of cols done, time spend: 9
1367 na's left of original 2444, 5.56% of cols done, time spend: 27
1353 na's left of original 2444, 11.11% of cols done, time spend: 50
1197 na's left of original

p_socialHousing_g missing after: 8811
p_socialHousing_g missing before: 12141
p_socialHousing_g missing after: 8773
p_socialHousing_g missing before: 12139
p_socialHousing_g missing after: 8697
p_western_g missing before: 11518
p_western_g missing after: 7732
p_western_g missing before: 11499
p_western_g missing after: 7732
p_western_g missing before: 11454
p_western_g missing after: 8130
p_western_g missing before: 11417
p_western_g missing after: 8146
p_western_g missing before: 11382
p_western_g missing after: 8067
1686 na's left of original 1770, 0.00% of cols done, time spend: 18
1685 na's left of original 1770, 7.14% of cols done, time spend: 27
1608 na's left of original 1770, 14.29% of cols done, time spend: 60
1276 na's left of original 1770, 21.43% of cols done, time spend: 82
1233 na's left of original 1770, 28.57% of cols done, time spend: 92
501 na's left of original 1770, 35.71% of cols done, time spend: 102
494 na's left of original 1770, 42.86% of cols done, time spend:

inw_014_g missing after: 8911
inw_014_g missing before: 12090
inw_014_g missing after: 8739
inw_1524_g missing before: 12725
inw_1524_g missing after: 8198
inw_1524_g missing before: 12684
inw_1524_g missing after: 8198
inw_1524_g missing before: 12644
inw_1524_g missing after: 8198
inw_1524_g missing before: 12571
inw_1524_g missing after: 8911
inw_1524_g missing before: 12495
inw_1524_g missing after: 8739
inw_2544_g missing before: 11811
inw_2544_g missing after: 8198
inw_2544_g missing before: 11761
inw_2544_g missing after: 8198
inw_2544_g missing before: 11700
inw_2544_g missing after: 8198
inw_2544_g missing before: 11640
inw_2544_g missing after: 8911
inw_2544_g missing before: 11615
inw_2544_g missing after: 8739
inw_4564_g missing before: 12836
inw_4564_g missing after: 8198
inw_4564_g missing before: 12773
inw_4564_g missing after: 8198
inw_4564_g missing before: 12705
inw_4564_g missing after: 8198
inw_4564_g missing before: 12637
inw_4564_g missing after: 8911
inw_4564_g m

1344 na's left of original 4526, 17.65% of cols done, time spend: 246
1267 na's left of original 4526, 23.53% of cols done, time spend: 322
1183 na's left of original 4526, 29.41% of cols done, time spend: 362
822 na's left of original 4526, 35.29% of cols done, time spend: 382
774 na's left of original 4526, 41.18% of cols done, time spend: 431
742 na's left of original 4526, 47.06% of cols done, time spend: 513
728 na's left of original 4526, 52.94% of cols done, time spend: 624
655 na's left of original 4526, 58.82% of cols done, time spend: 644
295 na's left of original 4526, 64.71% of cols done, time spend: 732
279 na's left of original 4526, 70.59% of cols done, time spend: 781
267 na's left of original 4526, 76.47% of cols done, time spend: 877
17 na's left of original 4526, 82.35% of cols done, time spend: 936
10 na's left of original 4526, 88.24% of cols done, time spend: 1007
0 na's left of original 4526, 94.12% of cols done, time spend: 1102
total time: 1102
4313 na's left o

814 na's left of original 1278, 0.00% of cols done, time spend: 19
744 na's left of original 1278, 8.33% of cols done, time spend: 46
657 na's left of original 1278, 16.67% of cols done, time spend: 62
603 na's left of original 1278, 25.00% of cols done, time spend: 94
433 na's left of original 1278, 33.33% of cols done, time spend: 129
193 na's left of original 1278, 41.67% of cols done, time spend: 160
146 na's left of original 1278, 50.00% of cols done, time spend: 167
142 na's left of original 1278, 58.33% of cols done, time spend: 186
54 na's left of original 1278, 66.67% of cols done, time spend: 224
43 na's left of original 1278, 75.00% of cols done, time spend: 258
3 na's left of original 1278, 83.33% of cols done, time spend: 280
0 na's left of original 1278, 91.67% of cols done, time spend: 288
total time: 288
1024 na's left of original 1099, 0.00% of cols done, time spend: 8
964 na's left of original 1099, 11.11% of cols done, time spend: 23
948 na's left of original 1099, 2

p_buildBefore2000_g missing before: 20513
p_buildBefore2000_g missing after: 18387
p_buildBefore2000_g missing before: 20500
p_buildBefore2000_g missing after: 18376
p_buildBefore2000_g missing before: 20468
p_buildBefore2000_g missing after: 18379
p_buildBefore2000_g missing before: 20463
p_buildBefore2000_g missing after: 18355
p_buildBefore2000_g missing before: 20441
p_buildBefore2000_g missing after: 18296
p_buyhouses_g missing before: 21873
p_buyhouses_g missing after: 18387
p_buyhouses_g missing before: 21857
p_buyhouses_g missing after: 18376
p_buyhouses_g missing before: 21840
p_buyhouses_g missing after: 18379
p_buyhouses_g missing before: 21828
p_buyhouses_g missing after: 18355
p_buyhouses_g missing before: 21799
p_buyhouses_g missing after: 18296
p_nonWestern_g missing before: 23066
p_nonWestern_g missing after: 17952
p_nonWestern_g missing before: 23066
p_nonWestern_g missing after: 17952
p_nonWestern_g missing before: 23067
p_nonWestern_g missing after: 17917
p_nonWester

1383 na's left of original 5304, 47.62% of cols done, time spend: 94
1343 na's left of original 5304, 52.38% of cols done, time spend: 102
1334 na's left of original 5304, 57.14% of cols done, time spend: 109
1320 na's left of original 5304, 61.90% of cols done, time spend: 118
1302 na's left of original 5304, 66.67% of cols done, time spend: 129
1252 na's left of original 5304, 71.43% of cols done, time spend: 142
172 na's left of original 5304, 76.19% of cols done, time spend: 153
160 na's left of original 5304, 80.95% of cols done, time spend: 166
80 na's left of original 5304, 85.71% of cols done, time spend: 174
8 na's left of original 5304, 90.48% of cols done, time spend: 185
0 na's left of original 5304, 95.24% of cols done, time spend: 194
total time: 194


In [64]:
(a.isna().sum(axis=1) == 0).any()

AttributeError: 'NoneType' object has no attribute 'isna'

In [58]:
a.isna().sum(axis = 0)
a.mean_WOZ_b.isna().sum()

9816

In [53]:
a.shape

(13524, 74)

In [21]:
a

Unnamed: 0,bijeenkomstfunctiesum,geometry,gezondheidszorgfunctiesum,industriefunctiesum,kantoorfunctiesum,logiesfunctiesum,onderwijsfunctiesum,sportfunctiesum,winkelfunctiesum,woonfunctiesum,...,afs_vo_g,afs_oprit_g,afs_train_g,afs_transit_g,mean_WOZ,PLaagste40Inkomen,PHoogste20Inkomen,bouwjaarmean,bouwjaarmin,bouwjaarmax
85,0.0,"POLYGON ((235300.000 587200.000, 235400.000 58...",0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,...,3.4,1.0,4.8,7.0,214.0,35.8,19.3,1945.250000,1928.0,1981.0
86,1.0,"POLYGON ((235400.000 587200.000, 235500.000 58...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,...,3.4,1.0,4.8,7.0,214.0,35.8,19.3,1970.543478,1932.0,2003.0
88,0.0,"POLYGON ((235600.000 587200.000, 235700.000 58...",0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,...,3.4,1.0,4.8,7.0,214.0,35.8,19.3,1942.272727,1833.0,2000.0
100,0.0,"POLYGON ((236800.000 587200.000, 236900.000 58...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,4.2,2.3,5.6,7.7,,,,1961.000000,1915.0,2007.0
118,0.0,"POLYGON ((238600.000 587200.000, 238700.000 58...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,7.3,2.7,8.7,10.8,,34.9,21.3,1955.000000,1955.0,1955.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,0.0,"POLYGON ((237700.000 577500.000, 237800.000 57...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,...,2.4,0.9,1.3,7.4,,37.9,22.4,1977.437500,1907.0,2002.0
13496,0.0,"POLYGON ((237800.000 577500.000, 237900.000 57...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,...,2.4,0.9,1.3,7.4,,37.9,22.4,1984.733333,1948.0,1997.0
13497,0.0,"POLYGON ((237900.000 577500.000, 238000.000 57...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.0,...,2.4,0.9,1.3,7.4,,37.9,22.4,1980.666667,1915.0,2000.0
13498,0.0,"POLYGON ((238000.000 577500.000, 238100.000 57...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,15.0,...,2.4,0.9,1.3,7.4,,37.9,22.4,1976.611111,1913.0,1996.0


In [73]:
.01a.isna().sum(\)

bijeenkomstfunctiesum             0
gezondheidszorgfunctiesum         0
industriefunctiesum               0
kantoorfunctiesum                 0
logiesfunctiesum                  0
onderwijsfunctiesum               0
sportfunctiesum                   0
winkelfunctiesum                  0
woonfunctiesum                    0
oppervlakteVerblijfsobjectmean    0
inwoner_g                         0
geboorte_g                        0
inw_014_g                         0
inw_1524_g                        0
inw_2544_g                        0
inw_4564_g                        0
inw_65_g                          0
p_western_g                       0
p_nonWestern_g                    0
p_buyhouses_g                     0
p_rentals_g                       0
p_socialHousing_g                 0
medianIncome_g                    0
avg_electricity_g                 0
avg_gas_g                         0
p_benefits_g                      0
afs_haprak_g                      0
afs_ziek_g                  

In [47]:
for j,i in a.max().items():
    print(j,i)

Unnamed: 0 12562.0
Unnamed: 0.1 12562.0
Unnamed: 0.1.1 2911096.0
bijeenkomstfunctiesum 12.0
gezondheidszorgfunctiesum 31.0
industriefunctiesum 20.0
kantoorfunctiesum 17.0
logiesfunctiesum 3.0
onderwijsfunctiesum 4.0
sportfunctiesum 1.0
winkelfunctiesum 38.0
woonfunctiesum 201.0
oppervlakteVerblijfsobjectmean 34964.5
bouwjaarmean 2014.0
bouwjaarmin 2014.0
bouwjaarmax 2014.0
y 1.0
inwoner_g 270.0
geboorte_g 10.0
inw_014_g 55.0
inw_1524_g 215.0
inw_2544_g 170.0
inw_4564_g 175.0
inw_65_g 100.0
p_NL 60.0
p_western_g 70.0
p_nonWestern_g 100.0
p_buyhouses_g 100.0
p_rentals_g 100.0
p_socialHousing_g 190.0
avg_electricity_g 5320.0
avg_gas_g 2530.0
p_benefits_g 1.0
afs_haprak_g 3.5
afs_ziek_g 10.0
afs_superm_g 3.3
afs_vo_g 8.6
afs_oprit_g 4.6
afs_train_g 9.6
afs_transit_g 11.5
p_buildBefore2000_g 1.0002450686192137
p_buildAfter2000_g 0.5001917459216476
afs_huisarts_b 5.2
afs_ziekenhuis_b 10.9
afs_supermarkt_b 4.9
afs_vo_b 10.1
afs_oprit_b 4.5
afs_train_b 11.1
afs_transit_b 13.9
Inwoners_b 8055.0

  """Entry point for launching an IPython kernel.


In [27]:
a.min()

  """Entry point for launching an IPython kernel.


Unnamed: 0                                                           5
Unnamed: 0.1                                                         5
C28992R100                                                  E1425N4094
Unnamed: 0.1.1                                                 2746653
geometry             POLYGON ((142500 409500, 142600 409500, 142600...
                                           ...                        
avg_gas_b                                                        280.0
medianInkomen_b                                                   19.3
PLaagste40Inkomen                                                 16.5
PHoogste20Inkomen                                                  2.2
p_benefits_b                                                       0.0
Length: 73, dtype: object

In [46]:
(a.p_benefits_g <0).sum()

0

In [37]:
len(a)

3558

In [12]:
b.isna().sum()

Unnamed: 0                0
Unnamed: 0.1              0
C28992R100                0
Unnamed: 0.1.1            0
geometry                  0
                       ... 
avg_gas_b              3558
medianInkomen_b        3558
PLaagste40Inkomen_b    3558
PHoogste20Inkomen_b    3558
pBenefits_b            3558
Length: 74, dtype: int64

In [3]:
data2015 = pd.read_csv("BRgrid2015.csv")
data2016 = pd.read_csv("BRgrid2016.csv")
data2017 = pd.read_csv("BRgrid2017.csv")
data2018 = pd.read_csv("BRgrid2018.csv")
data2019 = pd.read_csv("BRgrid2019.csv")

In [4]:
data2015.rename(columns = {"p_buildBefore2000": "p_buildBefore2000_g", "p_buildAfter2000": "p_buildAfter2000_g", "p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
data2016.rename(columns = {"p_buildBefore2000": "p_buildBefore2000_g", "p_buildAfter2000": "p_buildAfter2000_g", "p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
data2017.rename(columns = {"p_buildBefore2000": "p_buildBefore2000_g", "p_buildAfter2000": "p_buildAfter2000_g", "p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
data2018.rename(columns = {"p_buildBefore2000": "p_buildBefore2000_g", "p_buildAfter2000": "p_buildAfter2000_g", "p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)
data2019.rename(columns = {"p_buildBefore2000": "p_buildBefore2000_g", "p_buildAfter2000": "p_buildAfter2000_g", "p_wester_g" : "p_western_g", "afs_school_b" : "afs_vo_b", "afs_treins_g" : "afs_train_g", "nonWestern_b" : "p_nonWestern_b", "mean_WOZ_b" : "mean_WOZ", "pBenefits_b" : "p_benefits_b", "p_NL_g" : "p_NL", "PHoogste20Inkomen_b" : "PHoogste20Inkomen", "PLaagste40Inkomen_b" : "PLaagste40Inkomen" }, inplace = True)

In [5]:
data2015 = data2015.replace({-99997 : np.NaN, np.inf : np.NaN})
data2016 = data2016.replace({-99997 : np.NaN, np.inf : np.NaN})
data2017 = data2017.replace({-99997 : np.NaN, np.inf : np.NaN})
data2018 = data2018.replace({-99997 : np.NaN, np.inf : np.NaN})
data2019 = data2019.replace({-99997 : np.NaN, np.inf : np.NaN})


In [6]:
b15 = data2015[data2015.woonfunctie>0]
b16 = data2016[data2016.woonfunctie>0]
b17 = data2017[data2017.woonfunctie>0]
b18 = data2018[data2018.woonfunctie>0]
b19 = data2019[data2019.woonfunctie>0]

In [7]:
nadf["data2015"] = data2015.isna().sum() / len(data2015)
nadf["residential2015"] = b15.isna().sum() / len(b15)

nadf["data2016"] = data2016.isna().sum() / len(data2015)
nadf["residential2016"] = b16.isna().sum() / len(b15)

nadf["data2017"] = data2017.isna().sum() / len(data2015)
nadf["residential2017"] = b17.isna().sum() / len(b15)

nadf["data2018"] = data2018.isna().sum() / len(data2015)
nadf["residential2018"] = b18.isna().sum() / len(b15)

nadf["data2019"] = data2019.isna().sum() / len(data2015)
nadf["residential2019"] = b19.isna().sum() / len(b15)

In [8]:
def grid_filler(data, grid_column, buurt_column):
    print(grid_column + " missing before: {}".format(data[grid_column].isna().sum()))
    missing_grid_data = data[grid_column].isna().index.values
    data.loc[missing_grid_data, grid_column] = data.loc[missing_grid_data, buurt_column]
    print(grid_column + " missing after: {}".format(data[grid_column].isna().sum()))
    return data[grid_column]

In [9]:
grid_variables = []
neighborhood_variables = []
for var in data2015.columns:
    if var.endswith("_g"):
        grid_variables.append(var)
    elif var.endswith("_b"):
        neighborhood_variables.append(var)
        
grid_variables.sort(key=str.lower)
neighborhood_variables.sort(key=str.lower)

In [10]:
ls = list()
for i, j in zip(grid_variables, neighborhood_variables):
    ls.append((i,j))
print(ls)

[('afs_haprak_g', 'afs_huisarts_b'), ('afs_oprit_g', 'afs_oprit_b'), ('afs_superm_g', 'afs_supermarkt_b'), ('afs_train_g', 'afs_train_b'), ('afs_transit_g', 'afs_transit_b'), ('afs_vo_g', 'afs_vo_b'), ('afs_ziek_g', 'afs_ziekenhuis_b'), ('avg_electricity_g', 'avg_electicity_b'), ('avg_gas_g', 'avg_gas_b'), ('geboorte_g', 'birth_b'), ('inw_014_g', 'inw14_b'), ('inw_1524_g', 'inw24_b'), ('inw_2544_g', 'inw44_b'), ('inw_4564_g', 'inw64_b'), ('inw_65_g', 'inw65_b'), ('inwoner_g', 'Inwoners_b'), ('medianIncome_g', 'medianInkomen_b'), ('p_benefits_g', 'p_benefits_b'), ('p_buildAfter2000_g', 'p_buildAfter2000_b'), ('p_buildBefore2000_g', 'p_buildBefore2000_b'), ('p_buyhouses_g', 'p_buyHouses_b'), ('p_nonWestern_g', 'p_nonWestern_b'), ('p_rentals_g', 'p_rental_houses_b'), ('p_socialHousing_g', 'p_socialHousing_b'), ('p_western_g', 'western_b')]


In [11]:
for i in ls:
    data2015[i[0]] = grid_filler(data2015, i[0], i[1])
    data2016[i[0]] = grid_filler(data2016, i[0], i[1])
    data2017[i[0]] = grid_filler(data2017, i[0], i[1])
    data2018[i[0]] = grid_filler(data2018, i[0], i[1])
    data2019[i[0]] = grid_filler(data2019, i[0], i[1])
    

afs_haprak_g missing before: 15560
afs_haprak_g missing after: 12807
afs_haprak_g missing before: 15546
afs_haprak_g missing after: 12807
afs_haprak_g missing before: 15518
afs_haprak_g missing after: 12807
afs_haprak_g missing before: 15510
afs_haprak_g missing after: 12807
afs_haprak_g missing before: 19324
afs_haprak_g missing after: 12807
afs_oprit_g missing before: 15555
afs_oprit_g missing after: 12807
afs_oprit_g missing before: 15537
afs_oprit_g missing after: 12807
afs_oprit_g missing before: 15518
afs_oprit_g missing after: 12807
afs_oprit_g missing before: 15510
afs_oprit_g missing after: 12807
afs_oprit_g missing before: 19324
afs_oprit_g missing after: 12807
afs_superm_g missing before: 15555
afs_superm_g missing after: 12807
afs_superm_g missing before: 15537
afs_superm_g missing after: 12807
afs_superm_g missing before: 15518
afs_superm_g missing after: 12807
afs_superm_g missing before: 15510
afs_superm_g missing after: 12807
afs_superm_g missing before: 19324
afs_super

In [12]:
def visualize_NANs(data):
    data["n_missing"] = data.isna().sum(axis=1)
    data.plot(figsize = (40,40), alpha=0.5, edgecolor = "b", column='n_missing',legend=True)

In [13]:
var = ['geometry', 'bijeenkomstfunctie',
       'celfunctie', 'gezondheidszorgfunctie', 'industriefunctie',
       'kantoorfunctie', 'logiesfunctie', 'onderwijsfunctie', 'sportfunctie',
       'winkelfunctie', 'woonfunctie', 'oppervlakteVerblijfsobject', 'BU_CODE',
       'y', 'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
       'inw_4564_g', 'inw_65_g', 'p_NL', 'p_western_g', 'p_nonWestern_g',
       'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'p_buildBefore2000_g', 'p_buildAfter2000_g', 'mean_WOZ', 
        'PLaagste40Inkomen', 'PHoogste20Inkomen']

In [14]:
t15 = data2015[var]
t16 = data2016[var]
t17 = data2017[var]
t18 = data2018[var]
t19 = data2019[var]


In [15]:
t15["n_missing"] = t15.isna().sum(axis=1)
t16["n_missing"] = t16.isna().sum(axis=1)
t17["n_missing"] = t17.isna().sum(axis=1)
t18["n_missing"] = t18.isna().sum(axis=1)
t19["n_missing"] = t19.isna().sum(axis=1)

t15b = t15[(t15.woonfunctie > 0) & (t15.n_missing < 15)]
t16b = t16[(t16.woonfunctie > 0) & (t16.n_missing < 15)]
t17b = t17[(t17.woonfunctie > 0) & (t17.n_missing < 15)]
t18b = t18[(t18.woonfunctie > 0) & (t18.n_missing < 15)]
t19b = t19[(t19.woonfunctie > 0) & (t19.n_missing < 15)]

t15b = gpd.GeoDataFrame(t15b)
t15b.geometry = t15b.geometry.apply(shapely.wkt.loads)

t16b = gpd.GeoDataFrame(t16b)
t16b.geometry = t16b.geometry.apply(shapely.wkt.loads)

t17b = gpd.GeoDataFrame(t17b)
t17b.geometry = t17b.geometry.apply(shapely.wkt.loads)

t18b = gpd.GeoDataFrame(t18b)
t18b.geometry = t18b.geometry.apply(shapely.wkt.loads)

t19b = gpd.GeoDataFrame(t19b)
t19b.geometry = t19b.geometry.apply(shapely.wkt.loads)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [16]:
data2015.columns

Index(['C28992R100', 'Unnamed: 0', 'geometry', 'bijeenkomstfunctie',
       'celfunctie', 'gezondheidszorgfunctie', 'industriefunctie',
       'kantoorfunctie', 'logiesfunctie', 'onderwijsfunctie', 'sportfunctie',
       'winkelfunctie', 'woonfunctie', 'oppervlakteVerblijfsobject', 'BU_CODE',
       'y', 'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
       'inw_4564_g', 'inw_65_g', 'p_NL', 'p_western_g', 'p_nonWestern_g',
       'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'p_buildBefore2000_g', 'p_buildAfter2000_g',
       'afs_huisarts_b', 'afs_ziekenhuis_b', 'afs_supermarkt_b', 'afs_vo_b',
       'afs_oprit_b', 'afs_train_b', 'afs_transit_b', 'Inwoners_b', 'inw14_b',
       'inw24_b', 'inw44_b', 'inw64_b', 'inw65_b', 'western_b',
       'p_nonWestern_b', 'birth_b

In [17]:
var2 = ['bijeenkomstfunctie', 'geometry',
       'celfunctie', 'gezondheidszorgfunctie', 'industriefunctie',
       'kantoorfunctie', 'logiesfunctie', 'onderwijsfunctie', 'sportfunctie',
       'winkelfunctie', 'woonfunctie', 'oppervlakteVerblijfsobject', 
        'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
       'inw_4564_g', 'inw_65_g', 'p_western_g', 'p_nonWestern_g',
       'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'p_buildBefore2000_g', 'p_buildAfter2000_g', 'mean_WOZ', 
        'PLaagste40Inkomen', 'PHoogste20Inkomen']

In [18]:
t15b = t15b[var2]
t15b = gpd.GeoDataFrame(t15b)
t16b = t16b[var2]
t16b = gpd.GeoDataFrame(t16b)
t17b = t17b[var2]
t17b = gpd.GeoDataFrame(t17b)
t18b = t18b[var2]
t18b = gpd.GeoDataFrame(t18b)
t19b = t19b[var2]
t19b = gpd.GeoDataFrame(t19b)



In [19]:
def fill_final_na(data):
    t0 = time.time()
    total_na = data.isna().sum().sum()
    all_geom = data[["geometry", "sportfunctie"]]

    data = data.drop(["geometry"], axis = 1)
    incomplete = data[data.isnull().any(axis=1)]
    complete = gpd.GeoDataFrame(data[~data.isnull().any(axis=1)]) # store the rows that have no na values
    
    
    
    # create list of combinatoins of missing columns
    inc_list = list()
    for _, i in incomplete.iterrows():
        if not tuple(i.index[i.isna()]) in inc_list:
            inc_list.append(tuple(i.index[i.isna()]))
            
#   Create weightsmatrix and transform to row_standardized            
#     w_all = weights.distance.KNN.from_dataframe(all_geom, k = 4, silence_warnings = True)
#     w_all.transform = 'r'
    
    
    columns = data.columns
    
    # loop over the missing column combinations
    def train_creater(inp, w, train = "train", test = False):
        """
        returns: new dataframe including the neighbors data
        """
        # Get neigbors per index
        neighbors = [ w.neighbors[k] for k in inp.index.values.tolist()]
        if test:
            
            df = pd.concat([inp, train])
            
            rows = [ np.concatenate(df.loc[j].values) for j in neighbors] 
            distances = [list(all_geom.loc[neigh].distance(all_geom.loc[df.iloc[i].name].geometry).values) for i, neigh in enumerate(neighbors)]
        else:
            # get rows of the neigbors, tranform to np and concatenate them
            rows = [ np.concatenate(inp.loc[j].values) for j in neighbors] 
            distances = [list(all_geom.loc[neigh].distance(all_geom.loc[inp.iloc[i].name].geometry).values) for i, neigh in enumerate(neighbors)]
        
        
        return pd.concat([pd.DataFrame(rows).set_index(inp.index), inp, pd.DataFrame(distances).set_index(inp.index) ], axis=1)
        
    for i, cols in enumerate(inc_list):
        cols = list(cols)
        x_train = complete.drop(cols, axis = 1) # only keep the cols to trained upon
        y_train = complete[cols] # y has the na values to be predicted
       
        w_train  = weights.distance.KNN.from_dataframe(all_geom.loc[x_train.index], k = 4, silence_warnings = True)
        w_train.transform = 'r'
        
        X_train = train_creater(x_train, w_train)
        
        # Voor x predict only keep the rows where there are only NA's in cols
        pred = incomplete[(incomplete[cols].isna().all(axis = 1) & (incomplete[columns.drop(cols)].notna().all(axis = 1)))]
        X_pred = pred.drop(cols, axis = 1)

        # create prediction weights of missing rows as well as known rows
        w_pred =  weights.distance.KNN.from_dataframe(all_geom.loc[X_train.index.append(X_pred.index)], k = 4, silence_warnings = True)
        w_pred.transform = 'r'
        X_pred = train_creater(X_pred, w_pred, x_train, True)
        
        # loop over each missing variable in cols and create model per 
        for col in cols:
            
            clf = RandomForestRegressor(max_depth=6, random_state=0)
            clf.fit(X_train, y_train[col].values.ravel())    
            y_pred = clf.predict(X_pred)
            incomplete.loc[X_pred.index, col] = y_pred
            
        print("{} na's left of original {}, {:.2f}% of cols done, time spend: {:.0f}".format(incomplete.isna().sum().sum(),  total_na, i / len(inc_list) * 100, time.time() - t0))
    print("total time: {:.0f}".format(time.time() - t0))
    return pd.concat([incomplete, complete])
            


In [20]:

t15b = fill_final_na(t15b)
t16b = fill_final_na(t16b)
t17b = fill_final_na(t17b)
t18b = fill_final_na(t18b)
t19b = fill_final_na(t19b)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


788 na's left of original 885, 0.00% of cols done, time spend: 12


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

592 na's left of original 885, 14.29% of cols done, time spend: 38


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


340 na's left of original 885, 28.57% of cols done, time spend: 50


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

335 na's left of original 885, 42.86% of cols done, time spend: 84


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

333 na's left of original 885, 57.14% of cols done, time spend: 102


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

180 na's left of original 885, 71.43% of cols done, time spend: 148


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

0 na's left of original 885, 85.71% of cols done, time spend: 197
total time: 197


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


802 na's left of original 860, 0.00% of cols done, time spend: 14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

602 na's left of original 860, 14.29% of cols done, time spend: 41


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


350 na's left of original 860, 28.57% of cols done, time spend: 54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

345 na's left of original 860, 42.86% of cols done, time spend: 86


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

343 na's left of original 860, 57.14% of cols done, time spend: 104


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

190 na's left of original 860, 71.43% of cols done, time spend: 152


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

0 na's left of original 860, 85.71% of cols done, time spend: 202
total time: 202


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


834 na's left of original 894, 0.00% of cols done, time spend: 13


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

630 na's left of original 894, 16.67% of cols done, time spend: 43


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


377 na's left of original 894, 33.33% of cols done, time spend: 56


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

372 na's left of original 894, 50.00% of cols done, time spend: 90


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

370 na's left of original 894, 66.67% of cols done, time spend: 108


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

0 na's left of original 894, 83.33% of cols done, time spend: 159
total time: 159


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


727 na's left of original 786, 0.00% of cols done, time spend: 13


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

515 na's left of original 786, 16.67% of cols done, time spend: 42


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


262 na's left of original 786, 33.33% of cols done, time spend: 55


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

257 na's left of original 786, 50.00% of cols done, time spend: 87


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

95 na's left of original 786, 66.67% of cols done, time spend: 135


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

0 na's left of original 786, 83.33% of cols done, time spend: 167
total time: 167


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


979 na's left of original 2409, 0.00% of cols done, time spend: 9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

863 na's left of original 2409, 12.50% of cols done, time spend: 19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

563 na's left of original 2409, 25.00% of cols done, time spend: 34


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

319 na's left of original 2409, 37.50% of cols done, time spend: 44


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


188 na's left of original 2409, 50.00% of cols done, time spend: 51


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

183 na's left of original 2409, 62.50% of cols done, time spend: 70


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

180 na's left of original 2409, 75.00% of cols done, time spend: 84


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

0 na's left of original 2409, 87.50% of cols done, time spend: 109
total time: 109


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [117]:
t15b

Unnamed: 0,bijeenkomstfunctie,celfunctie,gezondheidszorgfunctie,industriefunctie,kantoorfunctie,logiesfunctie,onderwijsfunctie,sportfunctie,winkelfunctie,woonfunctie,...,afs_superm_g,afs_vo_g,afs_oprit_g,afs_train_g,afs_transit_g,p_buildBefore2000_g,p_buildAfter2000_g,mean_WOZ,PLaagste40Inkomen,PHoogste20Inkomen
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,3.3,4.3,2.1,5.7,7.9,94.0,6.0,391.764651,34.453318,37.313453
345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.3,4.3,2.1,5.7,7.9,94.0,6.0,393.048448,34.618565,38.903016
349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,3.3,4.3,2.1,5.7,7.9,94.0,6.0,390.136723,34.509750,38.094073
611,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.2,3.5,1.4,4.4,6.5,88.0,12.0,283.960709,35.672029,35.528626
625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,3.2,3.5,1.4,4.4,6.5,88.0,12.0,284.419384,35.615597,37.537007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,...,0.7,1.4,0.9,1.3,7.3,92.0,8.0,189.000000,37.100000,23.800000
13496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,...,0.7,1.4,0.9,1.3,7.3,92.0,8.0,189.000000,37.100000,23.800000
13497,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,17.0,...,0.7,1.4,0.9,1.3,7.3,92.0,8.0,189.000000,37.100000,23.800000
13498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,...,0.7,1.4,0.9,1.3,7.3,92.0,8.0,189.000000,37.100000,23.800000


In [28]:
t15b = t15b.join(data2015[["C28992R100", "geometry", "y", "BU_CODE"]])
t16b = t16b.join(data2016[["C28992R100", "geometry", "y", "BU_CODE"]])
t17b = t17b.join(data2017[["C28992R100", "geometry", "y", "BU_CODE"]])
t18b = t18b.join(data2018[["C28992R100", "geometry", "y", "BU_CODE"]])
t19b = t19b.join(data2019[["C28992R100", "geometry", "y", "BU_CODE"]])

In [29]:
t15b.to_csv("BRfilled_data15.csv")
t16b.to_csv("BRfilled_data16.csv")
t17b.to_csv("BRfilled_data17.csv")
t18b.to_csv("BRfilled_data18.csv")
t19b.to_csv("BRfilled_data19.csv")

In [26]:
t15b.shape

(4404, 39)