In [3]:
from sklearn.ensemble import RandomForestClassifier


In [4]:
import pandas as pd
import geopandas as gpd
import numpy as np
import shapely
import libpysal
from sklearn.model_selection import train_test_split, GridSearchCV
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score



In [7]:
# df = pd.read_csv("EHfilled_data15.csv", index_col = 0)
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [8]:
df15 = load_data(2015)
df16 = load_data(2016)
df17 = load_data(2017)
df18 = load_data(2018)
df19 = load_data(2019)

In [11]:
df15.shape

(30974, 43)

In [12]:
(df15.y==1).sum()

2214

In [14]:
class CA:
    
    def __init__(self, df, normalize = True):
        self.X = df[df.columns[:-4]]
        self.y = df.y
        self.w = libpysal.weights.DistanceBand.from_dataframe(df,threshold=150, binary = True, silence_warnings = True)
#         self.X_train, self. X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=2)
        self.X_train, self. X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=2)
        
        if normalize:
            self.transformer = Normalizer().fit(self.X_train)
            self.X_norm = self.transformer.transform(self.X) # normalize full data   
            
    def transition(self, model):
        self.trans_model = model # set the transition model
        self.trans_model.fit(self.transformer.transform(self.X_train), self.y_train) # fit the model on normalized training data        
        self.full_p = pd.DataFrame(self.trans_model.predict_proba(self.X_norm)).set_index(self.X.index) # predict probablities for full data
        
    def neighbor_function(self, train):
        if train:
            # get the neighbors of training set
            neighbors = [self.w.neighbors[x] for x in self.X_train.index]          
            # Keep the transition_probabilities of the training set
            ps = self.full_p.loc[self.X_train.index] 
            
        else:
            # get the neighbors of the test set
            neighbors = [self.w.neighbors[x] for x in self.X_test.index]
            # keep the transition probabilites of the test set
            ps = self.full_p.loc[self.X_test.index] 
            
        # calculate total probability of neighbors and put in dataframe    
        p = pd.DataFrame([self.full_p.loc[x].sum() for x in neighbors])
        p.columns = ["a", "b"]

        # reset and drop indices to ensure proper concatenation
        ps.reset_index(drop = True, inplace = True)
        p.reset_index(drop = True, inplace = True)

        self.ps = pd.concat([p, ps], axis = 1)
            
    def final_bagger(self, model):
        # initializes and trains the bagger that decides the final 0/1 prediction
        self.bag_model = model
        self.bag_model.fit(self.ps, self.y_train)
        
    def train(self, transition_model, bagger_model):
        # trains the CA
        self.transition(transition_model)
        self.neighbor_function(train = True)
        self.final_bagger(bagger_model)
        
    def test(self):
        # tests the CA
        ca.neighbor_function(train = False)
        self.y_pred = self.bag_model.predict(self.ps)
        self.scores = {"AUC": roc_auc_score(self.y_test, self.y_pred), "recall" : recall_score(self.y_test, self.y_pred),
                      "precision" : precision_score(self.y_test, self.y_pred)}
        
    def optimize(self, params, scoring = "roc_auc"):
        # for now only transition function
        self.clf = GridSearchCV(self.trans_model, params, cv = 5, scoring = scoring,
                               verbose = 3)
        self.clf.fit(self.transformer.transform(self.X_train), self.y_train)
        
     

In [15]:
ca = CA(df15)

In [16]:
clf = RandomForestClassifier()
# clf = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes = (200,100,50))
clf_bagger = RandomForestClassifier(max_depth = 4, random_state = 0)
ca.train(clf, clf_bagger)

In [17]:
params = {"max_depth": [2,4,6,8], "oob_score" : [True, False]}
ca.optimize(params)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=2, oob_score=True;, score=0.665 total time=   1.4s
[CV 2/5] END .......max_depth=2, oob_score=True;, score=0.690 total time=   1.4s
[CV 3/5] END .......max_depth=2, oob_score=True;, score=0.656 total time=   1.4s
[CV 4/5] END .......max_depth=2, oob_score=True;, score=0.682 total time=   1.4s
[CV 5/5] END .......max_depth=2, oob_score=True;, score=0.659 total time=   1.4s
[CV 1/5] END ......max_depth=2, oob_score=False;, score=0.665 total time=   1.2s
[CV 2/5] END ......max_depth=2, oob_score=False;, score=0.687 total time=   1.2s
[CV 3/5] END ......max_depth=2, oob_score=False;, score=0.655 total time=   1.2s
[CV 4/5] END ......max_depth=2, oob_score=False;, score=0.676 total time=   1.2s
[CV 5/5] END ......max_depth=2, oob_score=False;, score=0.657 total time=   1.2s
[CV 1/5] END .......max_depth=4, oob_score=True;, score=0.675 total time=   2.3s
[CV 2/5] END .......max_depth=4, oob_score=True;,

In [20]:
ca.train(ca.clf.best_estimator_, clf_bagger)

In [21]:
ca.test()

In [22]:
ca.scores

{'AUC': 0.5430734635765458,
 'recall': 0.10452418096723869,
 'precision': 0.29646017699115046}

array([False, False, False, ..., False, False, False])

In [587]:
ca.final_bagger(RandomForestClassifier(max_depth = 4, random_state = 0))

In [588]:
ca.neighbor_function(False)

In [591]:
precision_score(ca.y_test, ca.bag_model.predict(ca.ps))

0.5517241379310345

In [579]:
average_precision_score(ca.bag_model.predict(ca.ps), ca.y_test)

0.21812896105108287

In [525]:
(ca.bag_model.predict(ca.ps) == 1).sum()

0

In [528]:
ca.ps
(ca.y_train == 1).sum()

2807

In [None]:
bag_forest = RandomForestClassifier(max_depth = 4, random_state = 0)


In [287]:
ps.loc[ca.X_train.index].add(np.array(p))

Unnamed: 0,0,1
35095,4.95,0.05
38770,4.96,6.04
26006,6.49,0.51
3005,6.94,0.06
35776,3.00,0.00
...,...,...
31019,8.12,0.88
30280,8.87,0.13
6637,13.16,1.84
35343,7.00,0.00


In [256]:
a

[0    3.91
 1    0.09
 dtype: float64,
 0    4.82
 1    5.18
 dtype: float64,
 0    5.54
 1    0.46
 dtype: float64,
 0    6.0
 1    0.0
 dtype: float64,
 0    2.0
 1    0.0
 dtype: float64,
 0    7.0
 1    0.0
 dtype: float64,
 0    6.9
 1    1.1
 dtype: float64,
 0    6.84
 1    0.16
 dtype: float64,
 0    6.0
 1    0.0
 dtype: float64,
 0    11.48
 1     0.52
 dtype: float64,
 0    3.76
 1    0.24
 dtype: float64,
 0    14.18
 1     1.82
 dtype: float64,
 0    6.02
 1    1.98
 dtype: float64,
 0    5.25
 1    2.75
 dtype: float64,
 0    13.24
 1     2.76
 dtype: float64,
 0    4.31
 1    0.69
 dtype: float64,
 0    7.15
 1    0.85
 dtype: float64,
 0    11.94
 1     2.06
 dtype: float64,
 0    4.46
 1    2.54
 dtype: float64,
 0    4.0
 1    0.0
 dtype: float64,
 0    5.9
 1    0.1
 dtype: float64,
 0    6.14
 1    0.86
 dtype: float64,
 0    14.0
 1     0.0
 dtype: float64,
 0    3.82
 1    0.18
 dtype: float64,
 0    4.15
 1    0.85
 dtype: float64,
 0    7.82
 1    0.18
 dtype: f

In [217]:
neighbors = [ca.w.neighbors[x] for x in ca.X_train.index]

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier

In [97]:
df.columns

Index(['bijeenkomstfunctie', 'gezondheidszorgfunctie', 'industriefunctie',
       'kantoorfunctie', 'logiesfunctie', 'onderwijsfunctie', 'sportfunctie',
       'winkelfunctie', 'woonfunctie', 'oppervlakteVerblijfsobject',
       'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
       'inw_4564_g', 'inw_65_g', 'p_western_g', 'p_nonWestern_g',
       'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'p_buildBefore2000_g', 'p_buildAfter2000_g',
       'mean_WOZ', 'PLaagste40Inkomen', 'PHoogste20Inkomen', 'C28992R100',
       'geometry', 'y', 'BU_CODE'],
      dtype='object')

In [98]:
X = df[df.columns[:-4]]

In [99]:
Y = df.y

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2)

In [101]:
# clf = RandomForestClassifier(max_depth=4, random_state=0, n_estimators = 1000, min_samples_split = 3)
clf = DecisionTreeClassifier(max_depth=4, random_state=0)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, random_state=0)

In [102]:
X_pred = clf.predict_proba(X)

In [103]:
X_pred = pd.DataFrame(X_pred[:,1])
X_t_pred = X_pred.loc[X_test.index]

In [104]:
def calc_score

SyntaxError: invalid syntax (Temp/ipykernel_6656/3580027668.py, line 1)

In [105]:
def calc_p(test, full):
    neighbors = [w.neighbors[x] for x in test.index]
    
    p = [full.loc[x].sum() for x in neighbors]
    return pd.DataFrame(test.values + p)

In [112]:
p = calc_p(X_pred, X_pred).set_index(X_pred.index)

In [113]:
bag_forest = RandomForestClassifier(max_depth = 4, random_state = 0)
bag_forest.fit(p, Y)

RandomForestClassifier(max_depth=4, random_state=0)

In [107]:
p.loc[p.nlargest(1254, 0).index] = 1
p[(p != 1).values] = 0

In [108]:
recall_score(p, y_test)

0.24401913875598086

In [109]:
precision_score(p, y_test)

0.24401913875598086

In [51]:
import xgboost
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [53]:
model = xgboost.XGBClassifier()
kfold = StratifiedKFold(n_splits=10)
params = {
    'n_estimators': [25,50,75,100],
    'max_depth': [1,3,5,7,9,10],
    'learning_rate':[0.001,0.01,0.1,0.25]
}
grid_search = GridSearchCV(model,param_grid=params,cv=kfold,n_jobs=-1)
grid_search.fit(X,Y)





GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample

In [54]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.001, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=25, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [60]:
grid_search.best_params_

{'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 25}

In [63]:
model = xgboost.XGBClassifier(learning_rate = 0.001, max_depth = 1, n_estimators = 25)

In [64]:
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.001, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=25, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [68]:
X_pred = model.predict_proba(X)

In [297]:
a = model.predict(X_test)

In [298]:
accuracy_score(a, y_test)

0.904891304347826

In [299]:
precision_score(a, y_test)

0.05223880597014925

In [300]:
recall_score(a, y_test)

0.12727272727272726

In [302]:
df.columns

Index(['bijeenkomstfunctie', 'gezondheidszorgfunctie', 'industriefunctie',
       'kantoorfunctie', 'logiesfunctie', 'onderwijsfunctie', 'sportfunctie',
       'winkelfunctie', 'woonfunctie', 'oppervlakteVerblijfsobject',
       'inwoner_g', 'geboorte_g', 'inw_014_g', 'inw_1524_g', 'inw_2544_g',
       'inw_4564_g', 'inw_65_g', 'p_western_g', 'p_nonWestern_g',
       'p_buyhouses_g', 'p_rentals_g', 'p_socialHousing_g', 'medianIncome_g',
       'avg_electricity_g', 'avg_gas_g', 'p_benefits_g', 'afs_haprak_g',
       'afs_ziek_g', 'afs_superm_g', 'afs_vo_g', 'afs_oprit_g', 'afs_train_g',
       'afs_transit_g', 'p_buildBefore2000_g', 'p_buildAfter2000_g',
       'mean_WOZ', 'PLaagste40Inkomen', 'PHoogste20Inkomen', 'C28992R100',
       'geometry', 'y', 'BU_CODE'],
      dtype='object')

In [75]:
corr = df.corr()

corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,bijeenkomstfunctie,gezondheidszorgfunctie,industriefunctie,kantoorfunctie,logiesfunctie,onderwijsfunctie,sportfunctie,winkelfunctie,woonfunctie,oppervlakteVerblijfsobject,inwoner_g,geboorte_g,inw_014_g,inw_1524_g,inw_2544_g,inw_4564_g,inw_65_g,p_western_g,p_nonWestern_g,p_buyhouses_g,p_rentals_g,p_socialHousing_g,medianIncome_g,avg_electricity_g,avg_gas_g,p_benefits_g,afs_haprak_g,afs_ziek_g,afs_superm_g,afs_vo_g,afs_oprit_g,afs_train_g,afs_transit_g,p_buildBefore2000_g,p_buildAfter2000_g,mean_WOZ,PLaagste40Inkomen,PHoogste20Inkomen,y
bijeenkomstfunctie,1.0,0.018667,0.084886,0.170065,0.03075,0.04132,0.088471,0.355058,0.210158,-0.004505,0.032687,0.001333,-0.147869,0.165605,0.105504,-0.132501,-0.031304,0.129497,0.0182,-0.148754,0.130502,0.035454,-0.047776,-0.12409,-0.073056,0.023217,-0.082737,-0.069443,-0.086455,-0.077178,-0.001754,-0.105612,-0.146331,0.016941,-0.016861,-0.072281,0.079919,-0.051176,0.078694
gezondheidszorgfunctie,0.018667,1.0,0.005533,0.012338,-0.000443,0.012094,0.007794,0.012534,0.040262,-0.00181,0.019493,0.015743,-0.018791,0.003421,0.005357,-0.026192,0.027179,0.004598,0.012782,-0.030489,0.024669,0.022754,-0.019778,-0.030376,-0.030404,0.017375,-0.029967,-0.021149,-0.028107,-0.023014,0.006176,-0.020768,-0.023455,-0.005432,0.005481,-0.027739,0.01471,-0.019988,0.023385
industriefunctie,0.084886,0.005533,1.0,0.221601,0.008217,0.043031,0.062289,0.19301,0.074298,-0.002395,-0.042845,-0.015206,-0.118743,0.119727,0.171318,-0.101024,-0.10483,0.035485,0.046381,-0.057949,0.050426,-0.029605,-0.008372,-0.039494,-0.0514,-0.025792,-0.019526,-0.062912,-0.03633,-0.04674,-0.006958,-0.066119,-0.103891,-0.002071,0.000987,0.01206,0.023311,0.015227,0.065633
kantoorfunctie,0.170065,0.012338,0.221601,1.0,0.029574,0.070721,0.057845,0.198983,0.116521,-0.003219,-0.008473,-0.003334,-0.150494,0.136134,0.168934,-0.160503,-0.076503,0.102601,0.034267,-0.128101,0.116301,0.013939,0.002557,-0.123249,-0.113427,-0.008016,-0.087969,-0.111695,-0.096831,-0.113579,-0.022777,-0.127823,-0.171245,-0.013121,0.012951,-0.027379,0.020465,0.01799,0.073575
logiesfunctie,0.03075,-0.000443,0.008217,0.029574,1.0,-0.000845,0.009179,0.024552,-0.006489,-0.001462,-0.034493,-0.03443,-0.027487,0.008,-0.007309,0.018659,0.016145,0.02917,-0.02389,-0.007019,0.005483,-0.025063,0.005542,0.021245,0.030673,0.000271,0.031581,0.024891,0.030213,0.013577,0.006804,0.004732,0.025647,0.01519,-0.015144,0.022656,0.005271,-0.001429,0.015763
onderwijsfunctie,0.04132,0.012094,0.043031,0.070721,-0.000845,1.0,0.057621,0.038148,0.039107,-0.001379,0.024229,0.025967,-0.046754,0.10974,0.058271,-0.086206,-0.049899,0.067121,0.050776,-0.077012,0.074607,0.043882,-0.037115,-0.074991,-0.082429,0.008207,-0.048559,-0.062042,-0.050024,-0.052603,-0.010736,-0.045643,-0.063922,-0.009656,0.009721,-0.043142,0.058352,-0.025539,0.032397
sportfunctie,0.088471,0.007794,0.062289,0.057845,0.009179,0.057621,1.0,0.058452,0.043644,-0.001729,0.018109,0.022599,-0.023282,0.033896,0.054103,-0.05408,-0.021541,0.025482,0.026103,-0.040391,0.039829,0.012924,-0.003486,-0.051602,-0.039009,0.005658,-0.044164,-0.030299,-0.045548,-0.045656,-0.000824,-0.039991,-0.051991,0.000704,-0.000643,-0.025998,0.00525,-0.002106,0.033912
winkelfunctie,0.355058,0.012534,0.19301,0.198983,0.024552,0.038148,0.058452,1.0,0.193093,-0.004832,0.026562,0.011417,-0.173642,0.184831,0.163308,-0.162686,-0.059277,0.143086,0.038316,-0.16479,0.149766,0.02124,-0.044007,-0.145877,-0.110602,0.007012,-0.099141,-0.09008,-0.115623,-0.094587,-0.007363,-0.113114,-0.156872,0.022066,-0.022124,-0.077738,0.058136,-0.036813,0.116416
woonfunctie,0.210158,0.040262,0.074298,0.116521,-0.006489,0.039107,0.043644,0.193093,1.0,-0.019634,0.275953,0.279053,-0.085333,0.185879,0.2969,-0.305936,-0.136751,0.225324,0.294212,-0.388616,0.386322,0.331561,-0.222997,-0.449416,-0.386306,0.22488,-0.327168,-0.278945,-0.338438,-0.299444,0.063157,-0.259977,-0.311996,0.031484,-0.030883,-0.365382,0.17049,-0.194561,0.123996
oppervlakteVerblijfsobject,-0.004505,-0.00181,-0.002395,-0.003219,-0.001462,-0.001379,-0.001729,-0.004832,-0.019634,1.0,-0.024553,-0.020457,-0.014398,0.034072,-0.015219,0.001474,0.010687,0.011191,-0.005791,0.000705,-0.001964,-0.02106,0.023295,0.019733,0.027551,-0.012458,0.017258,0.008805,0.015966,0.021536,-0.018213,0.01553,0.01089,0.001481,-0.001495,0.016117,0.007785,0.008127,-0.007138


In [78]:
corr["woonfunctie"]

bijeenkomstfunctie            0.210158
gezondheidszorgfunctie        0.040262
industriefunctie              0.074298
kantoorfunctie                0.116521
logiesfunctie                -0.006489
onderwijsfunctie              0.039107
sportfunctie                  0.043644
winkelfunctie                 0.193093
woonfunctie                   1.000000
oppervlakteVerblijfsobject   -0.019634
inwoner_g                     0.275953
geboorte_g                    0.279053
inw_014_g                    -0.085333
inw_1524_g                    0.185879
inw_2544_g                    0.296900
inw_4564_g                   -0.305936
inw_65_g                     -0.136751
p_western_g                   0.225324
p_nonWestern_g                0.294212
p_buyhouses_g                -0.388616
p_rentals_g                   0.386322
p_socialHousing_g             0.331561
medianIncome_g               -0.222997
avg_electricity_g            -0.449416
avg_gas_g                    -0.386306
p_benefits_g             

In [7]:
df15.shape

(39148, 42)