In [None]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

import cbsodata
import pandas as pd

This notebook shows how to analyse a Statline dataset using the fast.ai library (https://github.com/fastai/fastai).
An old version (0.7.2) of this library is used to be compatible with the fast.ai Machine Learning course (see https://github.com/fastai/fastai/tree/master/courses/ml1). This notebook is an adaptation of the lesson1-rf notebook.

To get started, download the Statline dataset "Kerncijfers wijken en buurten 2018" (https://opendata.cbs.nl/statline/#/CBS/nl/dataset/84286NED/table).

In [None]:
# Download every record of the Statline table and wrap the result in a DataFrame.
table_id = '84286NED'
data = cbsodata.get_data(table_id)
df = pd.DataFrame(data=data)

The dataframe contains data for different levels of regions. The level we are interested in is the neighbourhood ("Buurt" in Dutch).

In [None]:
# Preview the first neighbourhood rows, transposed so the many columns read top to bottom.
# The SoortRegio_2 values carry trailing spaces; strip them before comparing so the
# filter does not depend on the exact padding width.
df[df['SoortRegio_2'].str.strip() == 'Buurt'].head().T

In [None]:
# Keep only the neighbourhood-level rows. The SoortRegio_2 values carry trailing
# spaces; strip them before comparing so the filter does not depend on the padding width.
buurt = df[df['SoortRegio_2'].str.strip() == 'Buurt']

In [None]:
# Summary statistics per column, one column per output row.
buurt.describe().transpose()

In [None]:
# Show the dtype of every column; the next step keeps only the numeric ones.
buurt.dtypes

For simplicity, select only the numeric columns.

In [None]:
# Keep every numeric column. select_dtypes('number') also covers numeric widths
# other than float64/int64 (e.g. int32, float32), which a hard-coded dtype list would drop.
cols = buurt.select_dtypes(include='number').columns.tolist()
buurt = buurt[cols]

In [None]:
# First rows of the numeric-only frame, transposed for readability.
buurt.head().transpose()

In [None]:
# Persist the numeric neighbourhood data as a semicolon-separated file without the index.
buurt.to_csv(path_or_buf='buurt.csv', sep=';', index=False)

As an illustration we try to find correlations between the percentage of private dwellings in a neighbourhood ('Koopwoningen_40') and other variables available in the dataset.

First, prepare the dataframe so that a random forest can be trained on it.
Use the function proc_df to split the dataset into features and target and to replace missing values with the median.
In our case the target variable contains missing values, and those are not replaced, so remove these records from the dataset. A second 'problem' with this dataset is that some columns are heavily correlated with Koopwoningen_40.
In most cases the percentages of rented dwellings and private dwellings add up to 100%. Delete the remaining columns concerning ownership from the dataset.

In [None]:
# Rows without a target value cannot be used for training, so drop them first.
buurt = buurt[buurt['Koopwoningen_40'].notnull()].copy()

# 'ID' is only a row identifier; the other columns describe dwelling ownership and
# would leak the target into the features.
to_remove = ['ID', 'HuurwoningenTotaal_41', 'InBezitWoningcorporatie_42',
             'InBezitOverigeVerhuurders_43', 'EigendomOnbekend_44']
# Non-inplace drop: the cell reads as "new frame from old frame" rather than mutating state.
buurt = buurt.drop(to_remove, axis=1)

# Split features from the target and fill missing feature values. The cells below
# rely on df_trn / y_trn, which were previously never defined.
# NOTE(review): assumes the fastai 0.7 proc_df signature returning (df, y, na_dict) — confirm
# against the installed version.
df_trn, y_trn, nas = proc_df(buurt, 'Koopwoningen_40')

In [None]:
def split_vals(a, n):
    """Cut `a` at position `n` and return the pair (a[:n], a[n:])."""
    head, tail = a[:n], a[n:]
    return head, tail
# Hold out the last n_valid rows for validation; everything before them is training data.
n_valid = 3000
n_trn = len(df_trn) - n_valid
# Features, target and the unprocessed frame are all cut at the same row.
(X_train, X_valid), (y_train, y_valid), (raw_train, raw_valid) = (
    split_vals(part, n_trn) for part in (df_trn, y_trn, buurt)
)

In [None]:
def rmse(x, y):
    """Return the root of the mean squared difference between `x` and `y`."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print [train RMSE, validation RMSE, train score, validation score] for model `m`.

    Reads the module-level X_train / y_train / X_valid / y_valid. If `m` exposes
    `oob_score_`, that value is appended as a fifth entry.
    """
    rmse_train = rmse(m.predict(X_train), y_train)
    rmse_valid = rmse(m.predict(X_valid), y_valid)
    res = [
        rmse_train,
        rmse_valid,
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
# Train on all features. A fixed random_state makes the scores and the feature
# importances derived from this model reproducible across runs.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
                          n_jobs=-1, oob_score=True, random_state=42)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Feature importances of the model trained on all columns; show the first ten rows.
fi = rf_feat_importance(m, df_trn)
fi.iloc[:10]

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of column 'imp' against column 'cols' and return the plot."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)

In [None]:
# Plot the first 30 rows of the importance table.
plot_fi(fi.iloc[:30]);

In [None]:
# Names of the columns whose importance exceeds the 0.005 threshold, and how many there are.
to_keep = fi.loc[fi['imp'] > 0.005, 'cols']
len(to_keep)

In [None]:
# Restrict the features to the selected columns and redo the train/validation cut
# at the same row as before.
df_keep = df_trn[to_keep.tolist()].copy()
X_train, X_valid = df_keep[:n_trn], df_keep[n_trn:]

In [None]:
# Retrain on the reduced feature set. A fixed random_state keeps the comparison with
# the earlier scores from being blurred by run-to-run randomness.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
                          n_jobs=-1, oob_score=True, random_state=42)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Recompute the importances for the model trained on the reduced feature set
# and plot all of them.
fi = rf_feat_importance(m, df_keep)
plot_fi(fi);

In [None]:
from scipy.cluster import hierarchy as hc
# Import the helpers from their public modules instead of reaching them through
# `hc.distance` / an implicitly loaded `scipy.stats`.
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

# Pairwise Spearman rank correlation between the kept feature columns.
corr = np.round(spearmanr(df_keep).correlation, 4)
# 1 - corr turns correlation into a distance; squareform converts the square matrix
# into the condensed vector passed to linkage.
corr_condensed = squareform(1 - corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16, 10))
# Pass the labels as a plain list rather than a pandas Index.
dendrogram = hc.dendrogram(z, labels=df_keep.columns.tolist(), orientation='left', leaf_font_size=16)
plt.show()