In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Extract recurrent mobility matrix

First, the raw spreadsheet `data/raw/census_2011/Pop_LPW_NL_25FEB15.xlsx` was modified in MS Excel. The format of the data is as follows:

- rows: municipality of residence
- columns: municipality of work

The dataset contained, for each Belgian province, a column of 'unknowns': individuals whose place of residence is known but whose place of work is not. These 10 columns were removed manually. Furthermore, the column `Werkt in Belgie` was renamed to `Belgie` to make name-based row and column matching easier. The resulting dataset was placed in `data/raw/census_2011/Pop_LPW_NL_25FEB15_remove_unknowns.xlsx`. The recurrent mobility matrix extracted from the resulting spreadsheet thus rests on two assumptions:

- People working abroad are ignored. These 92.000 individuals account for 2% of the working population.
- People whose workplace is unknown are ignored. An alternative assumption would be that these individuals work in their place of residence.

In [None]:
# Load sheet "Tabel1_2011" of the edited census workbook into `df`.
# The path is resolved against the current working directory, so the notebook
# must be run from a directory that is a sibling of `data/`.
# NOTE(review): the markdown above states the edited file was placed at
# `data/raw/census_2011/Pop_LPW_NL_25FEB15_remove_unknowns.xlsx`, whereas this
# cell reads `data/interim/census_2011/Pop_LPW_NL_25FEB15_delete_unknown.xlsx`
# — confirm which location/filename is the correct one.
abs_dir = os.getcwd()
rel_dir = os.path.join(abs_dir, '../data/interim/census_2011/Pop_LPW_NL_25FEB15_delete_unknown.xlsx')
df = pd.read_excel(rel_dir, sheet_name="Tabel1_2011")

In [None]:
# Column of the raw sheet that holds the code of every spatial unit.
code_col = '00.24 - Werkende bevolking volgens geslacht, verblijfplaats en plaats van tewerkstelling'

# Non-empty entries of that column between row labels 5 and 1942 (inclusive).
# Selected once and reused for both the code values and their row positions.
code_series = df[code_col].loc[5:1942].dropna()
codes = code_series.values
codes_int = [int(code) for code in codes]
names = df.iloc[5:, 1].dropna().values

# For every code row, take the row two positions below it, dropping the first
# four columns and the last one.
# Assumes `df` has the default 0..n-1 index, so row labels double as positions.
# NOTE(review): the +2 offset presumably selects the 'total' row of a
# per-sex block of three rows — confirm against the spreadsheet.
rows = [df.iloc[i + 2, 4:-1].values for i in code_series.index]

# Stack into a float matrix. The reshape enforces a square
# (n_codes x n_codes) layout and raises ValueError if the number of data
# columns does not match the number of codes.
matrix = np.array(rows, dtype=float).reshape(len(rows), len(rows))

In [None]:
# Label both axes of the matrix with the integer codes. Per the format
# description in this notebook, rows are the place of residence and columns
# the place of work.
mobility_df=pd.DataFrame(matrix,index=codes_int,columns=codes_int)
mobility_df.head()

In [None]:
# Select the codes to keep (presumably the arrondissement level, going by the
# variable name). A code is kept when its string form
#   - ends in '000' but not in '0000',
#   - is not exactly four characters long, and
#   - does not start with '0'.
idx_arrondisement = [
    idx
    for idx, code in zip(mobility_df.index, map(str, mobility_df.index))
    if code.endswith('000')
    and not code.endswith('0000')
    and len(code) != 4
    and not code.startswith('0')
]
print(idx_arrondisement)

In [None]:
# Display the mobility matrix restricted to the selected codes on both axes.
mobility_df.loc[idx_arrondisement,idx_arrondisement]

In [None]:
# Write the restricted matrix to CSV, keeping the codes as the index column.
# The output path is relative to the current working directory.
mobility_df.loc[idx_arrondisement,idx_arrondisement].to_csv('../data/interim/census_2011/recurrent_mobility.csv', index=True)

## Extract population size (initN)

In [None]:
# Load sheet "Tabel3_2011" of the census demography workbook.
# This rebinds `df` (and `abs_dir`/`rel_dir`), so the mobility sheet loaded
# earlier in the notebook is no longer reachable through `df` afterwards.
abs_dir = os.getcwd()
rel_dir = os.path.join(abs_dir, '../data/raw/census_2011/census_demo_nl_04nov14.xlsx')
df = pd.read_excel(rel_dir, sheet_name="Tabel3_2011")

In [None]:
# Codes of the spatial units: non-empty entries of the first sheet column
# between row labels 5 and 1943 (inclusive).
codes=df['00.04 - Bevolking van Belgische en vreemde nationaliteit naar geslacht en leeftijdsklasse'].loc[5:1943].dropna().values
# Unit names: non-empty entries of the second sheet column from row 5 onward.
names = df.iloc[5:,1].dropna().values
# Output layout: the total population followed by nine age bins.
columns = ['total','[0,10[','[10,20[','[20,30[','[30,40[','[40,50[','[50,60[','[60,70[','[70,80[','[80,inf[']
# The population table itself (`initN_df`) is built once the age classes have
# been aggregated; no zero-filled placeholder frame is created here.

In [None]:
# Row positions of the code entries (same selection that produced `codes`).
# Assumes `df` has the default 0..n-1 index, so labels double as positions.
code_rows = df['00.04 - Bevolking van Belgische en vreemde nationaliteit naar geslacht en leeftijdsklasse'].loc[5:1943].dropna().index

# Number of age bins in the output; `columns` is 'total' followed by the bins.
n_bins = len(columns) - 1

rows = np.zeros([len(codes), len(columns)])
for k, i in enumerate(code_rows):
    # The data row sits two positions below the code row; its last column
    # holds the total.
    rows[k, 0] = int(df.iloc[i + 2, -1]) # total
    # Age-class counts: columns 45 up to (but excluding) the last one.
    # NOTE(review): presumably 5-year age classes — confirm against the sheet.
    vals_raw = df.iloc[i + 2, 45:-1].values
    if len(vals_raw) <= 2 * (n_bins - 1):
        raise ValueError(
            f"expected more than {2 * (n_bins - 1)} age-class columns, got {len(vals_raw)}"
        )
    # Every bin except the last merges two consecutive raw classes ...
    for j in range(n_bins - 1):
        rows[k, j + 1] = int(vals_raw[2 * j] + vals_raw[2 * j + 1])
    # ... and the last, open-ended bin collects all remaining classes.
    rows[k, n_bins] = int(sum(vals_raw[2 * (n_bins - 1):]))

initN_df = pd.DataFrame(rows, index=codes, columns=columns)
# Put the unit name in front of the numeric columns.
initN_df.insert(0, 'name', names)
initN_df.index.name = 'NIS-code'
initN_df.head()

In [None]:
# Select the codes to keep (presumably the arrondissement level, going by the
# variable name). A code is kept when its string form
#   - ends in '000' but not in '0000',
#   - is not exactly four characters long, and
#   - does not start with '0'.
# NOTE(review): unlike the mobility section, the index labels here are not
# cast to int; the test assumes they render without a decimal part
# (e.g. '11000', not '11000.0') — confirm.
idx_arrondisement = [
    idx
    for idx, code in zip(initN_df.index, map(str, initN_df.index))
    if code.endswith('000')
    and not code.endswith('0000')
    and len(code) != 4
    and not code.startswith('0')
]
print(idx_arrondisement)

In [None]:
# Display the population table restricted to the selected codes (all columns).
initN_df.loc[idx_arrondisement,:]

In [None]:
# Write the restricted population table to CSV, keeping the 'NIS-code' index
# as the first column. The output path is relative to the working directory.
initN_df.loc[idx_arrondisement,:].to_csv('../data/interim/census_2011/initN.csv', index=True)

## NIS - name list

In [None]:
# Lookup table pairing every code in `initN_df` (the full index, not only the
# filtered selection) with its name, on a plain 0..n-1 integer index.
name_df = pd.DataFrame(
    {'NIS': initN_df.index.values, 'name': names},
    index=np.arange(len(initN_df.index)),
)

In [None]:
# Preview the first rows of the code-to-name table.
name_df.head()

In [None]:
# Write the code-to-name table to CSV without the integer row index.
# The output path is relative to the current working directory.
name_df.to_csv('../data/interim/census_2011/NIS_name.csv', index=False)