In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

Device: cpu


In [2]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score


In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [4]:
# X, Y, ID = get_data("../Data/filled/grids/", [2015,2016,2017,2018,2019])

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [6]:
with open('w15.pickle', 'rb') as handle:
    wt = pickle.load(handle)
    
with open('w16.pickle', 'rb') as handle:
    wv = pickle.load(handle)

In [7]:
def create_CNN_samples(grid, block, dims = 39):
    
    nonzero = np.transpose(grid[:,:,-2].nonzero()) # Get indices of nonzero componetns

    size = nonzero.shape[0]
    width = block * 2 + 1 # calculate widht and height. Needed later on
    
    X = np.zeros((size, width, width, dims))
    Y = np.zeros(size)
    ID = np.zeros(size)
    
    for idx, i in enumerate(nonzero):
        x, ID[idx], Y[idx] = get_neighbor_grid(grid, i, block)
        X[idx] = x.reshape(width,width, 39)
        
    X = np.moveaxis(X, -1, 1) # order the indices correctly to make sure it works in CNN
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()
    
    return X,ID,Y

In [8]:
def get_neighbor_grid(full, hw, block = 1):
    
    # get the nonzero (built) blocks by checking if they have a ID

    h = hw[0]
    w = hw[1]
    
    y = full[h,w,-1]
    ID = full[h,w,-2]
    
    hu = h - block
    hd = h + block
    hshort, hextra, wshort, wextra = 0,0,0,0
    if hu < 0:
        hshort = 0 - hu
        hu = 0
    if hd >= full.shape[0]:
        hextra = (hd - full.shape[0]) + 1
        hd = full.shape[0]

    wr = w + block
    wl = w - block

    if wr >= full.shape[1]:
        wextra = (wr - full.shape[1]) + 1
        wr = full.shape[1]
    if wl < 0:
        wshort = 0 - wl
        wl = 0

    nb = full[hu : hd + 1, wl : wr + 1, :]
    nb = np.pad(nb, ((hshort, hextra), (wshort, wextra), (0,0)), mode = "constant", constant_values = 0)
    return nb[:,:,:-2], ID, y


In [9]:
X_train = []
Y_train = []
ID_train = []
for filename in os.listdir("../Data/filled/grids/2015/"):
    n = np.load("../Data/filled/grids/2015/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_train.append(X)
    Y_train.append(Y)
    ID_train.append(ID)
    
Y_train = np.concatenate(Y_train)
ID_train = np.concatenate(ID_train)
X_train = np.concatenate(X_train)

X_train = np.moveaxis(X_train, 1, -1)
X_train = X_train.reshape(-1, 39)

X_train = ss.fit_transform(X_train)
X_train = X_train.reshape(-1, 11, 11, 39)
X_train = X_train.reshape(X_train.shape[0], -1)

oversampler = SMOTE()
X_train_resamp, Y_train_resamp = oversampler.fit_resample(X_train, Y_train)
X_train = X_train.reshape(-1, 11, 11, 39)
X_train_resamp = X_train_resamp.reshape(-1, 11, 11, 39)

X_train = np.moveaxis(X_train, -1, 1)
X_train_resamp = np.moveaxis(X_train_resamp, -1, 1) 




X_val = []
Y_val = []
ID_val = []

for filename in os.listdir("../Data/filled/grids/2016/"):
    n = np.load("../Data/filled/grids/2016/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_val.append(X)
    Y_val.append(Y)
    ID_val.append(ID)
    
X_val = np.concatenate(X_val)
X_val = np.moveaxis(X_val, 1, -1)
X_val = X_val.reshape(-1, 39)

X_val = ss.transform(X_val)


X_val = X_val.reshape(-1, 11, 11, 39)
X_val = np.moveaxis(X_val, -1, 1)
Y_val = np.concatenate(Y_val)
ID_val = np.concatenate(ID_val)


In [19]:
import shapely
import libpysal
import geopandas as gpd

def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

df15 = load_data(2015)
df16 = load_data(2016)

scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [10]:
def get_batch(X, Y, batch_size = 32):
    
    idxs = rng.integers(X.shape[0], size = batch_size)
    
    X = torch.from_numpy(X[idxs]).float()
    Y = torch.from_numpy(Y[idxs]).float()
    

    return X, Y


In [11]:
class Model1(nn.Module):
    def __init__(self, name):
        super(Model1, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 32, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64, 8),
            nn.ReLU())
        
        self.final = nn.Sequential(
            nn.Linear(8,1),
            nn.Sigmoid())
            
        self.name = name
        
        
    def forward(self, x):
        out = self.net(x)
        out = self.final(out)
        return out
    
    def get_no_activation(self, x):
        out = self.net(x)
        return out

In [12]:
class Model2(nn.Module):
    def __init__(self, name):
        super(Model2, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 64),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [13]:
class Model3(nn.Module):
    def __init__(self, name):
        super(Model3, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [14]:
class Model4(nn.Module):
    def __init__(self, name):
        super(Model4, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(1152, 512),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [15]:
model1 = Model1("een")
model2 = Model2("twee")
model3 = Model3("drie")
model4 = Model4("vier")
models = [model1, model2, model3, model4]

In [16]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

In [17]:

# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, X_val, Y_val, num_epochs, batch_per_e = 500):
    optimizer = optim.RMSprop(model.parameters(), lr=0.0001) 
    BCEloss = nn.BCELoss()
    train_loss = []
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []
    cmc_best = 0
    
    

    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        for batch in range(batch_per_e):
            
            x_train_, y_train_ = get_batch(X_train, Y_train)
            model.train()

            optimizer.zero_grad()
            out = model(x_train_)

            loss = BCEloss(out.squeeze(), y_train_)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                train_loss.append(loss)
                
        model.eval()
        
        x_val_, y_val_ = get_batch(X_val, Y_val, batch_size = 1000)
        predictions = model(x_val_).detach().numpy()
        predictions = (predictions > 0.5).astype(int)
        
        acc = accuracy_score(y_val_, predictions)
        ROC = roc_auc_score(y_val_, predictions)
        f1 = f1_score(y_val_, predictions)
        train_loss = (np.sum(train_loss) / (batch_per_e/10)).detach().item()

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)
        
        
        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(train_loss, acc, ROC , f1))
        train_loss = []


    return acc_history, ROC_history, train_loss_history, f1_score_history


        

In [51]:
def neighbor_part(models, X, idxs, w, labels):
    tot_preds = pd.DataFrame()
    for i in labels15.index.unique():
        x = X[labels.loc[i]["X"]]
        IDs = idxs[labels.loc[i]["X"]]
        preds = models[i](torch.from_numpy(x).float())
        preds = pd.DataFrame(np.array(preds.detach())).set_index(IDs)
        tot_preds = pd.concat([tot_preds, preds])
    
    
    neighbors = [w.neighbors[x] for x in idxs]
    transitions = [tot_preds.loc[x].values for x in neighbors]


    n_function = np.zeros((len(tot_preds), w.max_neighbors + 1))
    for i, (t, idx) in enumerate(zip(transitions, idxs)):
        n_function[i, 1:len(t) + 1] = t.squeeze()
        n_function[i, 0] = tot_preds.loc[idx]
        
    return n_function

In [52]:
models = [Model1, Model2, Model3, Model4]

In [None]:
from sklearn.cluster import KMeans
for clusters in [3, 5, 7, 9, 11]:
    for i, mod in enumerate(models):
        
        cluster_model = KMeans(n_clusters=clusters, random_state=0)
        labels15 = pd.DataFrame(cluster_model.fit_predict(X15))
        labels16 = pd.DataFrame(cluster_model.predict(X16))

        labels15["X"] = X15.index
        labels16["X"] = X16.index


        labels15 = labels15.set_index(0)
        labels16 = labels16.set_index(0)
        
        mods = {}
        for i in labels15.index.unique():
            model = mod(str(i))
            x = X_train[labels15.loc[i]["X"]]
            y = Y_train[labels15.loc[i]["X"]]
            
            x_val = X_val[labels16.loc[i]["X"]]
            y_val = Y_val[labels16.loc[i]["X"]]
            print(x.shape)
            oversample = SMOTE()
            x, y = oversample.fit_resample(x.reshape(x.shape[0], -1), y)
            x = x.reshape(-1, 39, 11, 11)

            
            hists = train_model(model, x, y, x_val, y_val, 250)
            mods[i] = model
        print(mods)
            
        n_function = neighbor_part(mods, X_train, ID_train, wt, labels15)

        oversample = SMOTE()
        x, y = oversample.fit_resample(n_function, Y_train)

        clf_bagger = RandomForestClassifier(max_depth = 12, oob_score = True)
        clf_bagger.fit(x,y)

        n_function = neighbor_part(mods, X_val, ID_val, wv, labels16)
        preds = clf_bagger.predict(n_function)
        totacc = accuracy_score(Y_val, preds)
        totf1 = f1_score(Y_val, preds)
        totROC = roc_auc_score(Y_val, preds)


        with open("../results/CNN/model" + str(i) +"_"+ str(clusters) + "cluster.csv", "a+") as f:
            f.write("loss;acc;ROC;f1_score\n")
            f.write(str(hists[2]) + ";" + str(hists[0]) + ";" + str(hists[1]) + ";" + str(hists[3]) + "\n")
            f.write("--;" + str(totacc) + ";" + str(totROC) + ";" + str(totf1))

    
    

(10784, 39, 11, 11)
epoch: 0 of 250
training_loss: 0.6071, acc: 0.637, ROC: 0.598, f1: 0.188
epoch: 1 of 250
training_loss: 0.5594, acc: 0.664, ROC: 0.491, f1: 0.116
epoch: 2 of 250
training_loss: 0.5088, acc: 0.746, ROC: 0.573, f1: 0.164
epoch: 3 of 250
training_loss: 0.4563, acc: 0.783, ROC: 0.496, f1: 0.100
epoch: 4 of 250
training_loss: 0.4515, acc: 0.773, ROC: 0.613, f1: 0.215
epoch: 5 of 250
training_loss: 0.4082, acc: 0.837, ROC: 0.548, f1: 0.155
epoch: 6 of 250
training_loss: 0.3781, acc: 0.787, ROC: 0.565, f1: 0.151
epoch: 7 of 250
training_loss: 0.3574, acc: 0.790, ROC: 0.579, f1: 0.198
epoch: 8 of 250
training_loss: 0.3500, acc: 0.827, ROC: 0.550, f1: 0.164
epoch: 9 of 250
training_loss: 0.3204, acc: 0.828, ROC: 0.531, f1: 0.149
epoch: 10 of 250
training_loss: 0.2779, acc: 0.835, ROC: 0.549, f1: 0.154
epoch: 11 of 250
training_loss: 0.2806, acc: 0.796, ROC: 0.579, f1: 0.177
epoch: 12 of 250
training_loss: 0.2487, acc: 0.884, ROC: 0.540, f1: 0.147
epoch: 13 of 250
training_lo

training_loss: 0.0327, acc: 0.881, ROC: 0.557, f1: 0.190
epoch: 112 of 250
training_loss: 0.0296, acc: 0.883, ROC: 0.537, f1: 0.146
epoch: 113 of 250
training_loss: 0.0329, acc: 0.905, ROC: 0.513, f1: 0.078
epoch: 114 of 250
training_loss: 0.0236, acc: 0.879, ROC: 0.539, f1: 0.154
epoch: 115 of 250
training_loss: 0.0280, acc: 0.866, ROC: 0.529, f1: 0.130
epoch: 116 of 250
training_loss: 0.0306, acc: 0.876, ROC: 0.521, f1: 0.114
epoch: 117 of 250
training_loss: 0.0412, acc: 0.875, ROC: 0.545, f1: 0.150
epoch: 118 of 250
training_loss: 0.0287, acc: 0.875, ROC: 0.559, f1: 0.183
epoch: 119 of 250
training_loss: 0.0295, acc: 0.889, ROC: 0.505, f1: 0.067
epoch: 120 of 250
training_loss: 0.0194, acc: 0.849, ROC: 0.515, f1: 0.107
epoch: 121 of 250
training_loss: 0.0300, acc: 0.863, ROC: 0.575, f1: 0.208
epoch: 122 of 250
training_loss: 0.0271, acc: 0.864, ROC: 0.534, f1: 0.150
epoch: 123 of 250
training_loss: 0.0242, acc: 0.900, ROC: 0.524, f1: 0.107
epoch: 124 of 250
training_loss: 0.0190, ac

training_loss: 0.0200, acc: 0.881, ROC: 0.563, f1: 0.179
epoch: 222 of 250
training_loss: 0.0212, acc: 0.884, ROC: 0.515, f1: 0.094
epoch: 223 of 250
training_loss: 0.0111, acc: 0.890, ROC: 0.531, f1: 0.127
epoch: 224 of 250
training_loss: 0.0108, acc: 0.900, ROC: 0.545, f1: 0.153
epoch: 225 of 250
training_loss: 0.0093, acc: 0.885, ROC: 0.516, f1: 0.094
epoch: 226 of 250
training_loss: 0.0114, acc: 0.901, ROC: 0.530, f1: 0.124
epoch: 227 of 250
training_loss: 0.0098, acc: 0.865, ROC: 0.561, f1: 0.192
epoch: 228 of 250
training_loss: 0.0089, acc: 0.872, ROC: 0.535, f1: 0.147
epoch: 229 of 250
training_loss: 0.0164, acc: 0.859, ROC: 0.553, f1: 0.194
epoch: 230 of 250
training_loss: 0.0115, acc: 0.890, ROC: 0.524, f1: 0.113
epoch: 231 of 250
training_loss: 0.0174, acc: 0.878, ROC: 0.503, f1: 0.062
epoch: 232 of 250
training_loss: 0.0118, acc: 0.903, ROC: 0.578, f1: 0.224
epoch: 233 of 250
training_loss: 0.0240, acc: 0.896, ROC: 0.560, f1: 0.188
epoch: 234 of 250
training_loss: 0.0121, ac

training_loss: 0.1614, acc: 0.892, ROC: 0.519, f1: 0.100
epoch: 83 of 250
training_loss: 0.1653, acc: 0.869, ROC: 0.549, f1: 0.166
epoch: 84 of 250
training_loss: 0.1630, acc: 0.878, ROC: 0.581, f1: 0.218
epoch: 85 of 250
training_loss: 0.1858, acc: 0.847, ROC: 0.553, f1: 0.155
epoch: 86 of 250
training_loss: 0.1778, acc: 0.872, ROC: 0.547, f1: 0.169
epoch: 87 of 250
training_loss: 0.1611, acc: 0.886, ROC: 0.517, f1: 0.095
epoch: 88 of 250
training_loss: 0.1635, acc: 0.739, ROC: 0.555, f1: 0.166
epoch: 89 of 250
training_loss: 0.1863, acc: 0.812, ROC: 0.618, f1: 0.260
epoch: 90 of 250
training_loss: 0.1759, acc: 0.844, ROC: 0.524, f1: 0.133
epoch: 91 of 250
training_loss: 0.1676, acc: 0.824, ROC: 0.531, f1: 0.146
epoch: 92 of 250
training_loss: 0.1424, acc: 0.880, ROC: 0.550, f1: 0.167
epoch: 93 of 250
training_loss: 0.1679, acc: 0.903, ROC: 0.525, f1: 0.110
epoch: 94 of 250
training_loss: 0.1808, acc: 0.860, ROC: 0.502, f1: 0.079
epoch: 95 of 250
training_loss: 0.1789, acc: 0.899, ROC

training_loss: 0.0907, acc: 0.866, ROC: 0.555, f1: 0.173
epoch: 193 of 250
training_loss: 0.1018, acc: 0.859, ROC: 0.575, f1: 0.185
epoch: 194 of 250
training_loss: 0.0875, acc: 0.866, ROC: 0.568, f1: 0.183
epoch: 195 of 250
training_loss: 0.1082, acc: 0.869, ROC: 0.581, f1: 0.225
epoch: 196 of 250
training_loss: 0.1001, acc: 0.852, ROC: 0.564, f1: 0.169
epoch: 197 of 250
training_loss: 0.1031, acc: 0.903, ROC: 0.553, f1: 0.171
epoch: 198 of 250
training_loss: 0.1122, acc: 0.855, ROC: 0.561, f1: 0.208
epoch: 199 of 250
training_loss: 0.0949, acc: 0.845, ROC: 0.601, f1: 0.236
epoch: 200 of 250
training_loss: 0.1079, acc: 0.896, ROC: 0.546, f1: 0.161
epoch: 201 of 250
training_loss: 0.1098, acc: 0.861, ROC: 0.502, f1: 0.079
epoch: 202 of 250
training_loss: 0.1150, acc: 0.844, ROC: 0.587, f1: 0.204
epoch: 203 of 250
training_loss: 0.1061, acc: 0.883, ROC: 0.538, f1: 0.146
epoch: 204 of 250
training_loss: 0.1238, acc: 0.874, ROC: 0.503, f1: 0.074
epoch: 205 of 250
training_loss: 0.1150, ac

training_loss: 0.2104, acc: 0.810, ROC: 0.554, f1: 0.159
epoch: 53 of 250
training_loss: 0.1983, acc: 0.824, ROC: 0.609, f1: 0.279
epoch: 54 of 250
training_loss: 0.1972, acc: 0.851, ROC: 0.492, f1: 0.063
epoch: 55 of 250
training_loss: 0.1893, acc: 0.856, ROC: 0.539, f1: 0.153
epoch: 56 of 250
training_loss: 0.2136, acc: 0.825, ROC: 0.592, f1: 0.236
epoch: 57 of 250
training_loss: 0.1877, acc: 0.818, ROC: 0.542, f1: 0.165
epoch: 58 of 250
training_loss: 0.1934, acc: 0.812, ROC: 0.576, f1: 0.203
epoch: 59 of 250
training_loss: 0.2185, acc: 0.749, ROC: 0.597, f1: 0.223
epoch: 60 of 250
training_loss: 0.1891, acc: 0.853, ROC: 0.588, f1: 0.222
epoch: 61 of 250
training_loss: 0.1834, acc: 0.743, ROC: 0.546, f1: 0.189
epoch: 62 of 250
training_loss: 0.1876, acc: 0.856, ROC: 0.525, f1: 0.122
epoch: 63 of 250
training_loss: 0.2011, acc: 0.826, ROC: 0.550, f1: 0.179
epoch: 64 of 250
training_loss: 0.1850, acc: 0.808, ROC: 0.592, f1: 0.220
epoch: 65 of 250
training_loss: 0.1853, acc: 0.865, ROC

training_loss: 0.0942, acc: 0.819, ROC: 0.588, f1: 0.210
epoch: 163 of 250
training_loss: 0.0943, acc: 0.872, ROC: 0.525, f1: 0.123
epoch: 164 of 250
training_loss: 0.1265, acc: 0.855, ROC: 0.571, f1: 0.225
epoch: 165 of 250
training_loss: 0.0910, acc: 0.860, ROC: 0.533, f1: 0.146
epoch: 166 of 250
training_loss: 0.1059, acc: 0.883, ROC: 0.568, f1: 0.215
epoch: 167 of 250
training_loss: 0.1207, acc: 0.836, ROC: 0.553, f1: 0.188
epoch: 168 of 250
training_loss: 0.1089, acc: 0.824, ROC: 0.554, f1: 0.193
epoch: 169 of 250
training_loss: 0.0955, acc: 0.789, ROC: 0.554, f1: 0.185
epoch: 170 of 250
training_loss: 0.0917, acc: 0.846, ROC: 0.548, f1: 0.181
epoch: 171 of 250
training_loss: 0.0817, acc: 0.878, ROC: 0.530, f1: 0.129
epoch: 172 of 250
training_loss: 0.0883, acc: 0.843, ROC: 0.519, f1: 0.123
epoch: 173 of 250
training_loss: 0.1142, acc: 0.873, ROC: 0.509, f1: 0.086
epoch: 174 of 250
training_loss: 0.0919, acc: 0.863, ROC: 0.556, f1: 0.199
epoch: 175 of 250
training_loss: 0.0918, ac

(10784, 39, 11, 11)
epoch: 0 of 250
training_loss: 0.5819, acc: 0.829, ROC: 0.527, f1: 0.132
epoch: 1 of 250
training_loss: 0.4639, acc: 0.763, ROC: 0.547, f1: 0.163
epoch: 2 of 250
training_loss: 0.4162, acc: 0.876, ROC: 0.528, f1: 0.114
epoch: 3 of 250
training_loss: 0.3278, acc: 0.858, ROC: 0.518, f1: 0.101
epoch: 4 of 250
training_loss: 0.2822, acc: 0.886, ROC: 0.496, f1: 0.050
epoch: 5 of 250
training_loss: 0.2155, acc: 0.922, ROC: 0.517, f1: 0.071
epoch: 6 of 250
training_loss: 0.2093, acc: 0.823, ROC: 0.523, f1: 0.128
epoch: 7 of 250
training_loss: 0.1494, acc: 0.915, ROC: 0.546, f1: 0.158
epoch: 8 of 250
training_loss: 0.1423, acc: 0.866, ROC: 0.547, f1: 0.163
epoch: 9 of 250
training_loss: 0.1110, acc: 0.892, ROC: 0.538, f1: 0.143
epoch: 10 of 250
training_loss: 0.0874, acc: 0.868, ROC: 0.535, f1: 0.143
epoch: 11 of 250
training_loss: 0.0600, acc: 0.873, ROC: 0.527, f1: 0.124
epoch: 12 of 250
training_loss: 0.0512, acc: 0.896, ROC: 0.583, f1: 0.212
epoch: 13 of 250
training_lo

training_loss: 0.0033, acc: 0.903, ROC: 0.525, f1: 0.110
epoch: 112 of 250
training_loss: 0.0026, acc: 0.877, ROC: 0.501, f1: 0.061
epoch: 113 of 250
training_loss: 0.0005, acc: 0.894, ROC: 0.539, f1: 0.145
epoch: 114 of 250
training_loss: 0.0001, acc: 0.895, ROC: 0.538, f1: 0.146
epoch: 115 of 250
training_loss: 0.0049, acc: 0.872, ROC: 0.524, f1: 0.123
epoch: 116 of 250
training_loss: 0.0034, acc: 0.902, ROC: 0.525, f1: 0.109
epoch: 117 of 250
training_loss: 0.0018, acc: 0.891, ROC: 0.564, f1: 0.216
epoch: 118 of 250
training_loss: 0.0003, acc: 0.888, ROC: 0.554, f1: 0.188
epoch: 119 of 250
training_loss: 0.0022, acc: 0.902, ROC: 0.549, f1: 0.169
epoch: 120 of 250
training_loss: 0.0029, acc: 0.904, ROC: 0.564, f1: 0.186
epoch: 121 of 250
training_loss: 0.0005, acc: 0.875, ROC: 0.531, f1: 0.138
epoch: 122 of 250
training_loss: 0.0067, acc: 0.891, ROC: 0.542, f1: 0.155
epoch: 123 of 250
training_loss: 0.0024, acc: 0.913, ROC: 0.565, f1: 0.202
epoch: 124 of 250
training_loss: 0.0001, ac

training_loss: 0.0007, acc: 0.872, ROC: 0.502, f1: 0.072
epoch: 222 of 250
training_loss: 0.0002, acc: 0.896, ROC: 0.546, f1: 0.161
epoch: 223 of 250
training_loss: 0.0017, acc: 0.884, ROC: 0.558, f1: 0.183
epoch: 224 of 250
training_loss: 0.0002, acc: 0.892, ROC: 0.525, f1: 0.115
epoch: 225 of 250
training_loss: 0.0001, acc: 0.873, ROC: 0.520, f1: 0.112
epoch: 226 of 250
training_loss: 0.0001, acc: 0.881, ROC: 0.507, f1: 0.078
epoch: 227 of 250
training_loss: 0.0041, acc: 0.887, ROC: 0.510, f1: 0.081
epoch: 228 of 250
training_loss: 0.0002, acc: 0.886, ROC: 0.527, f1: 0.123
epoch: 229 of 250
training_loss: 0.0002, acc: 0.899, ROC: 0.535, f1: 0.137
epoch: 230 of 250
training_loss: 0.0003, acc: 0.868, ROC: 0.522, f1: 0.120
epoch: 231 of 250
training_loss: 0.0003, acc: 0.904, ROC: 0.529, f1: 0.111
epoch: 232 of 250
training_loss: 0.0075, acc: 0.893, ROC: 0.543, f1: 0.157
epoch: 233 of 250
training_loss: 0.0007, acc: 0.898, ROC: 0.522, f1: 0.105
epoch: 234 of 250
training_loss: 0.0021, ac

training_loss: 0.0073, acc: 0.881, ROC: 0.529, f1: 0.131
epoch: 83 of 250
training_loss: 0.0109, acc: 0.857, ROC: 0.526, f1: 0.123
epoch: 84 of 250
training_loss: 0.0184, acc: 0.892, ROC: 0.552, f1: 0.169
epoch: 85 of 250
training_loss: 0.0106, acc: 0.884, ROC: 0.526, f1: 0.121
epoch: 86 of 250
training_loss: 0.0106, acc: 0.841, ROC: 0.531, f1: 0.141
epoch: 87 of 250
training_loss: 0.0101, acc: 0.863, ROC: 0.521, f1: 0.116
epoch: 88 of 250
training_loss: 0.0167, acc: 0.863, ROC: 0.540, f1: 0.160
epoch: 89 of 250
training_loss: 0.0052, acc: 0.851, ROC: 0.555, f1: 0.186
epoch: 90 of 250
training_loss: 0.0195, acc: 0.881, ROC: 0.568, f1: 0.190
epoch: 91 of 250
training_loss: 0.0048, acc: 0.855, ROC: 0.546, f1: 0.162
epoch: 92 of 250
training_loss: 0.0172, acc: 0.871, ROC: 0.514, f1: 0.098
epoch: 93 of 250
training_loss: 0.0061, acc: 0.868, ROC: 0.543, f1: 0.154
epoch: 94 of 250
training_loss: 0.0134, acc: 0.841, ROC: 0.514, f1: 0.112
epoch: 95 of 250
training_loss: 0.0051, acc: 0.874, ROC

training_loss: 0.0061, acc: 0.863, ROC: 0.548, f1: 0.180
epoch: 193 of 250
training_loss: 0.0082, acc: 0.879, ROC: 0.526, f1: 0.117
epoch: 194 of 250
training_loss: 0.0024, acc: 0.895, ROC: 0.534, f1: 0.132
epoch: 195 of 250
training_loss: 0.0047, acc: 0.884, ROC: 0.533, f1: 0.134
epoch: 196 of 250
training_loss: 0.0044, acc: 0.869, ROC: 0.540, f1: 0.155
epoch: 197 of 250
training_loss: 0.0054, acc: 0.875, ROC: 0.522, f1: 0.113
epoch: 198 of 250
training_loss: 0.0011, acc: 0.883, ROC: 0.527, f1: 0.120
epoch: 199 of 250
training_loss: 0.0127, acc: 0.889, ROC: 0.545, f1: 0.153
epoch: 200 of 250
training_loss: 0.0108, acc: 0.869, ROC: 0.534, f1: 0.144
epoch: 201 of 250
training_loss: 0.0148, acc: 0.872, ROC: 0.554, f1: 0.179
epoch: 202 of 250
training_loss: 0.0029, acc: 0.837, ROC: 0.552, f1: 0.189
epoch: 203 of 250
training_loss: 0.0086, acc: 0.869, ROC: 0.551, f1: 0.166
epoch: 204 of 250
training_loss: 0.0074, acc: 0.894, ROC: 0.559, f1: 0.185
epoch: 205 of 250
training_loss: 0.0066, ac

training_loss: 0.0158, acc: 0.864, ROC: 0.584, f1: 0.253
epoch: 53 of 250
training_loss: 0.0117, acc: 0.859, ROC: 0.549, f1: 0.185
epoch: 54 of 250
training_loss: 0.0156, acc: 0.866, ROC: 0.531, f1: 0.141
epoch: 55 of 250
training_loss: 0.0107, acc: 0.849, ROC: 0.527, f1: 0.137
epoch: 56 of 250
training_loss: 0.0129, acc: 0.868, ROC: 0.543, f1: 0.165
epoch: 57 of 250
training_loss: 0.0332, acc: 0.793, ROC: 0.600, f1: 0.225
epoch: 58 of 250
training_loss: 0.0142, acc: 0.858, ROC: 0.562, f1: 0.211
epoch: 59 of 250
training_loss: 0.0168, acc: 0.864, ROC: 0.552, f1: 0.181
epoch: 60 of 250
training_loss: 0.0058, acc: 0.857, ROC: 0.536, f1: 0.154
epoch: 61 of 250
training_loss: 0.0095, acc: 0.857, ROC: 0.551, f1: 0.183
epoch: 62 of 250
training_loss: 0.0099, acc: 0.859, ROC: 0.558, f1: 0.203
epoch: 63 of 250
training_loss: 0.0161, acc: 0.854, ROC: 0.527, f1: 0.131
epoch: 64 of 250
training_loss: 0.0072, acc: 0.859, ROC: 0.553, f1: 0.185
epoch: 65 of 250
training_loss: 0.0105, acc: 0.859, ROC

In [59]:
print(mods[1])

Model1(
  (net): Sequential(
    (0): Conv2d(39, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Flatten()
    (8): Linear(in_features=64, out_features=8, bias=True)
    (9): ReLU()
  )
  (final): Sequential(
    (0): Linear(in_features=8, out_features=1, bias=True)
    (1): Sigmoid()
  )
)


# Neighbor function

In [34]:
import shapely
import libpysal

def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

df15 = load_data(2015)
df16 = load_data(2016)

scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [35]:
df15 = load_data(2015)
df16 = load_data(2016)
wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

In [36]:

scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [None]:
cluster_model = KMeans(n_clusters=5, random_state=0)
labels15 = pd.DataFrame(cluster_model.fit_predict(X15))
labels16 = pd.DataFrame(cluster_model.predict(X16))

labels15["X"] = X15.index
labels16["X"] = X16.index


labels15 = labels15.set_index(0)
labels16 = labels16.set_index(0)

In [42]:
with open('cbs_id_koppel.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [43]:
# br.C28992R100 = br.C28992R100.map(b) # change C28992code for id
labels15.C28 = labels15.C28.map(b)

In [48]:
labels15

Unnamed: 0_level_0,X,C28
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2748208
0,1,2748211
0,2,2748212
0,3,2748245
0,4,2748263
...,...,...
0,56362,1413137
0,56363,1413181
0,56364,1414800
0,56365,1414808


(56367, 39, 11, 11)

In [50]:
ID_train

array([2893058., 2870455., 2868833., ..., 1311910., 1176428., 1314791.])

In [84]:

# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
neighbors = [w15.neighbors[x] for x in ID_train]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_train)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [85]:
params = {"max_depth": [6, 8, 10, 12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=6, oob_score=True;, score=0.641 total time=   4.8s
[CV 2/5] END .......max_depth=6, oob_score=True;, score=0.649 total time=   4.6s
[CV 3/5] END .......max_depth=6, oob_score=True;, score=0.663 total time=   4.7s
[CV 4/5] END .......max_depth=6, oob_score=True;, score=0.619 total time=   4.8s
[CV 5/5] END .......max_depth=6, oob_score=True;, score=0.681 total time=   4.7s
[CV 1/5] END ......max_depth=6, oob_score=False;, score=0.639 total time=   4.1s
[CV 2/5] END ......max_depth=6, oob_score=False;, score=0.652 total time=   4.3s
[CV 3/5] END ......max_depth=6, oob_score=False;, score=0.663 total time=   4.0s
[CV 4/5] END ......max_depth=6, oob_score=False;, score=0.619 total time=   3.9s
[CV 5/5] END ......max_depth=6, oob_score=False;, score=0.682 total time=   4.4s
[CV 1/5] END .......max_depth=8, oob_score=True;, score=0.647 total time=   5.4s
[CV 2/5] END .......max_depth=8, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [88]:
clf_bagger.best_params_

{'max_depth': 12, 'oob_score': False}

In [None]:
np.array(preds.detach())

In [90]:
X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [91]:
with open('w16.pickle', 'rb') as handle:
    w16 = pickle.load(handle)

In [92]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)
neighbors = [w16.neighbors[x] for x in ID_val]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_val)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [93]:
preds = clf_bagger.predict(n_function)

In [94]:
len(neighbors)

57050

In [95]:
clf_bagger.score(n_function, Y_val)

0.5448808332705778

In [96]:
recall_score(Y_val, preds)

0.22256568778979907

In [97]:
precision_score(Y_val, preds)

0.1262683201803833

In [246]:
(Y_val == 1).sum()

4529

# No bagger on the end

In [277]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [279]:
preds["y"] = Y_val

In [280]:
preds = preds.sort_values(by=[0], ascending = False)

In [281]:
preds

Unnamed: 0,0,y
3699708.0,9.999919e-01,0.0
2206980.0,9.999806e-01,0.0
2206983.0,9.999349e-01,0.0
2206982.0,9.999343e-01,0.0
2210563.0,9.999300e-01,0.0
...,...,...
2654130.0,3.897061e-21,0.0
2113362.0,1.054448e-21,0.0
3705586.0,2.995088e-22,0.0
3703981.0,6.478963e-23,0.0


In [282]:
preds[0].iloc[0:4529] = 1
preds[0].iloc[4529:] = 0

In [283]:
recall_score(preds["y"], preds[0])


0.17244424817840584

In [284]:
precision_score(preds["y"], preds[0])

0.17244424817840584