# Create NumPy grids for training models such as CNNs

In [3]:
import geopandas as gpd
import pandas as pd
import shapely
import numpy as np
import pickle
import os

In [2]:
# Load the cbs_id_koppel lookup (pickled further down in this notebook from
# cbs_vierkant_dict: CBS C28992R100 grid code -> integer id).
# NOTE: pickle.load executes arbitrary code; only open trusted files.
with open('cbs_id_koppel.pickle', 'rb') as koppel_file:
    b = pickle.load(koppel_file)

In [3]:
# Read one filled city file (Maastricht, 2015) with the first CSV column as
# index, parse the WKT strings into shapely geometries and make "geometry"
# the active geometry column.
br = pd.read_csv(
    "../Data/filled/2015/Maastricht15_filled_.csv",
    index_col=0,
)
br["geometry"] = br["geometry"].apply(shapely.wkt.loads)
br = br.set_geometry("geometry")

In [4]:
# Show the column names of the loaded frame.
br.columns

Index(['bijeenkomstfunctiesum', 'gezondheidszorgfunctiesum',
       'industriefunctiesum', 'kantoorfunctiesum', 'logiesfunctiesum',
       'onderwijsfunctiesum', 'sportfunctiesum', 'winkelfunctiesum',
       'woonfunctiesum', 'oppervlakteVerblijfsobjectmean', 'inwoner_g',
       'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g', 'inw_4564_g',
       'inw_65_g', 'p_western_g', 'p_nonWestern_g', 'p_buyhouses_g',
       'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'mean_WOZ', 'PLaagste40Inkomen', 'PHoogste20Inkomen',
       'bouwjaarmean', 'bouwjaarmin', 'bouwjaarmax', 'C28992R100', 'geometry',
       'y', 'BU_CODE'],
      dtype='object')

In [5]:
# Replace each C28992R100 code by its id from the lookup `b`.
br["C28992R100"] = br["C28992R100"].map(b)

In [6]:
# Remove the BU_CODE column.
br = br.drop(columns=["BU_CODE"])

In [7]:
def to_np_grid(year, grid_size = 100):
    """
    Turn every filled city CSV of one year into a dense numpy grid and save it.

    For each file in ``../Data/filled/<year>`` the WKT geometries are parsed,
    the ``C28992R100`` codes are replaced through the notebook-level lookup
    ``b``, ``BU_CODE`` is dropped and the layer is translated so that its
    lower-left corner lies at (0, 0). Every row is then written to grid cell
    ``(int(minx / grid_size), int(miny / grid_size))`` of a zero-filled array
    whose last axis holds all non-geometry columns. The array is stored as
    ``../Data/filled/grids/<year>/<name>.npy``, where ``<name>`` is the file
    name without its ``_filled_.csv`` suffix.

    args:
        year: name of the sub-directory of ``../Data/filled`` to process.
        grid_size: edge length of one grid cell in coordinate units
            (default 100). NOTE(review): presumably metres — confirm.
    """
    suffix = "_filled_.csv"
    in_dir = "../Data/filled/" + str(year)
    out_dir = "../Data/filled/grids/{}".format(year)

    # np.save does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    for filename in os.listdir(in_dir):
        csv_path = "{}/{}".format(in_dir, filename)
        # Skip sub-directories (e.g. notebook checkpoints); read_csv cannot open them.
        if not os.path.isfile(csv_path):
            continue

        # str.strip(chars) removes a *set of characters* from both ends, not a
        # suffix, so it also ate leading letters such as "e"/"i" (a name came
        # out as "ndhoven15"). Cut the literal suffix instead. Grids that were
        # saved under a mangled name will now get the full name.
        if filename.endswith(suffix):
            name = filename[:-len(suffix)]
        else:
            name = os.path.splitext(filename)[0]
        print(name, year)

        df = pd.read_csv(csv_path, index_col = 0)
        df.geometry = df.geometry.apply(shapely.wkt.loads)
        df = df.set_geometry("geometry")
        df.C28992R100 = df.C28992R100.map(b)
        df = df.drop(["BU_CODE"], axis = 1)

        # translate geometry so that left lower corner is 0,0
        minx, miny = df.geometry.total_bounds[:2]
        df["geometry"] = df["geometry"].translate(-minx, -miny)

        # Get x,y,z shape of the df for the numpy zeros grid
        maxx, maxy = df.geometry.total_bounds[2:]
        x = int(maxx / grid_size)
        y = int(maxy / grid_size)
        z = df.shape[1] - 1 # n columns (- geometry column)
        grid = np.zeros((x, y, z))

        # loop over items in gpd dataframe and add data to the numpy grid;
        # a cell is addressed by the lower-left corner of the row's geometry
        for _, row in df.iterrows():
            cell_minx, cell_miny = row.geometry.bounds[:2]
            grid[int(cell_minx / grid_size), int(cell_miny / grid_size)] = row.drop("geometry")

        np.save("{}/{}".format(out_dir, name), grid)
        

In [8]:
# Build and save the grids for every year from 2015 through 2019.
for year in range(2015, 2020):
    to_np_grid(year)


's-Hertogenbosch15 2015
Alkmaar15 2015
Amersfoort15 2015
Apeldoorn15 2015
Breda15 2015
Deventer15 2015
ndhoven15 2015
Enschede15 2015
Heerenveen15 2015
Leeuwarden15 2015
Maastricht15 2015
Nijmegen15 2015
Tilburg15 2015
Utrecht15 2015
Venlovbo.csv15 2015
Zwolle15 2015
's-Hertogenbosch16 2016
Alkmaarvbo.csv16 2016
Amersfoortvbo.csv16 2016
Apeldoorn16 2016
Breda16 2016
Deventer16 2016
ndhoven16 2016
Enschede16 2016
Heerenveen16 2016
Leeuwardenvbo.csv16 2016
Maastricht16 2016
Nijmegen16 2016
Tilburg16 2016
Utrecht16 2016
Venlovbo.csv16 2016
Zwolle16 2016
's-Hertogenbosch17 2017
Alkmaarvbo.csv17 2017
Amersfoortvbo.csv17 2017
Apeldoorn17 2017
Breda17 2017
Deventer17 2017
ndhoven17 2017
Enschede17 2017
Heerenveen17 2017
Leeuwardenvbo.csv17 2017
Maastricht17 2017
Nijmegen17 2017
Tilburg17 2017
Utrecht17 2017
Venlovbo.csv17 2017
Zwolle17 2017
's-Hertogenbosch18 2018
Alkmaarvbo.csv18 2018
Amersfoortvbo.csv18 2018
Apeldoorn18 2018
Breda18 2018
Deventer18 2018
ndhoven18 2018
Enschede18 2018
Heeren

In [51]:
# Count the non-zero entries of `grid`.
# NOTE(review): in the cells shown here `grid` is only assigned inside
# to_np_grid(), so on a fresh kernel a grid has to be loaded first (e.g. with
# np.load on one of the saved .npy files) — confirm which city was inspected.
(grid!=0).sum()

114311

In [52]:
# Dimensions of `grid`.
# NOTE(review): `grid` is not assigned at notebook level in the cells shown
# here (only inside to_np_grid()); it must be loaded before this cell can run.
grid.shape

(122, 103, 41)

# #############################################

Create the CBS C28992R100-to-id lookup file

In [232]:
# Read the NL_vierkant100m shapefile (presumably the national 100 m grid).
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, and
# the id array printed further down runs to 3723287, so expect a slow load.
vierkant = gpd.read_file("../Data/NLvierkant/NL_vierkant100m.shp")

KeyboardInterrupt: 

In [174]:
# Preview the ids: the row index of `vierkant`, exposed as a column named "id".
vierkant.reset_index().rename(columns = {"index" : "id"}).id.values

array([      0,       1,       2, ..., 3723285, 3723286, 3723287],
      dtype=int64)

In [193]:
# Pair every C28992R100 code with the row index of `vierkant` (the id).
# The index is read directly: copying the whole frame twice through
# reset_index().rename() only to get the index back as a column yields the
# same values at a much higher cost.
zip_iterator = zip(vierkant.C28992R100.values, vierkant.index.values)

In [194]:
# Materialise the (code, id) pairs as a lookup dict. A zip object can be
# consumed only once, so re-running this cell without rebuilding
# `zip_iterator` first produces an empty dict.
cbs_vierkant_dict = {code: grid_id for code, grid_id in zip_iterator}

In [200]:
# Replace the C28992R100 codes in `br` by their ids.
# NOTE(review): the earlier cell `br.C28992R100.map(b)` performs the same
# replacement; if it has already run, the column no longer holds codes and
# nothing would match this code-keyed dict — presumably this was run on a
# freshly loaded `br`. Confirm the intended cell order.
br["C28992R100"] = br["C28992R100"].map(cbs_vierkant_dict)

In [198]:
# Inspect the first value of the C28992R100 column.
br["C28992R100"].iloc[0]

'E1824N3269'

In [204]:
# Persist the code -> id lookup with the highest available pickle protocol.
with open('cbs_id_koppel.pickle', 'wb') as koppel_file:
    pickle.dump(cbs_vierkant_dict, koppel_file, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
def load_data(year):
    """
    Load all filled CSV files of one year into a single GeoDataFrame.

    Every entry in ``../Data/filled/<year>/`` is read with ``pd.read_csv``;
    the name of any file with missing values in its ``geometry`` column is
    printed. The frames are concatenated, the WKT strings in ``geometry`` are
    parsed into shapely objects and the ``Unnamed: 0`` column is dropped.

    args:
        year: name of the sub-directory of ``../Data/filled`` to read.
    returns:
        GeoDataFrame with a fresh 0..n-1 index and ``geometry`` as the active
        geometry column.
    raises:
        ValueError: if the directory contains no files.
    """
    path = "../Data/filled/" + str(year) + "/"

    frames = []
    for filename in os.listdir(path):
        city = pd.read_csv(path + filename)
        if city["geometry"].isna().any():
            print(filename)
        frames.append(city)

    if not frames:
        raise ValueError("no files found in " + path)

    # Concatenate once instead of growing the frame file by file (which copies
    # all earlier rows on every iteration); ignore_index gives the same fresh
    # 0..n-1 index the reset_index()/drop("index") pair used to produce.
    df = pd.concat(frames, ignore_index=True)

    # Parse the WKT before building the GeoDataFrame so that the active
    # geometry column is set explicitly rather than read from a frame that
    # only holds strings.
    df["geometry"] = df["geometry"].apply(shapely.wkt.loads)
    df = gpd.GeoDataFrame(df, geometry="geometry")

    df = df.drop(["Unnamed: 0"], axis = 1)
    return df

In [7]:
import libpysal

In [4]:
# One GeoDataFrame per year, 2015 through 2019.
df15, df16, df17, df18, df19 = (load_data(year) for year in range(2015, 2020))

In [5]:
# Load the pickled C28992R100 code -> id lookup.
# NOTE: pickle.load executes arbitrary code; only open trusted files.
with open('cbs_id_koppel.pickle', 'rb') as koppel_file:
    b = pickle.load(koppel_file)

In [8]:
# Index every yearly frame by the id that the lookup `b` gives for its
# C28992R100 code.
df15, df16, df17, df18, df19 = (
    frame.assign(id=frame.C28992R100.map(b)).set_index("id")
    for frame in (df15, df16, df17, df18, df19)
)

# Binary distance-band spatial weights per year (threshold 150).
w15, w16, w17, w18, w19 = (
    libpysal.weights.DistanceBand.from_dataframe(
        frame,
        threshold=150,
        binary=True,
        silence_warnings=True,
    )
    for frame in (df15, df16, df17, df18, df19)
)

In [9]:
# Write each yearly weights object to its own pickle file, using the highest
# protocol available.
for pickle_name, weights in (
    ('w15.pickle', w15),
    ('w16.pickle', w16),
    ('w17.pickle', w17),
    ('w18.pickle', w18),
    ('w19.pickle', w19),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(weights, f, pickle.HIGHEST_PROTOCOL)