In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

Device: cpu


In [2]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score


In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [4]:
# X, Y, ID = get_data("../Data/filled/grids/", [2015,2016,2017,2018,2019])

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [6]:
with open('w15.pickle', 'rb') as handle:
    wt = pickle.load(handle)
    
with open('w16.pickle', 'rb') as handle:
    wv = pickle.load(handle)

In [7]:
def create_CNN_samples(grid, block, dims = 39):
    
    nonzero = np.transpose(grid[:,:,-2].nonzero()) # Get indices of nonzero componetns

    size = nonzero.shape[0]
    width = block * 2 + 1 # calculate widht and height. Needed later on
    
    X = np.zeros((size, width, width, dims))
    Y = np.zeros(size)
    ID = np.zeros(size)
    
    for idx, i in enumerate(nonzero):
        x, ID[idx], Y[idx] = get_neighbor_grid(grid, i, block)
        X[idx] = x.reshape(width,width, 39)
        
    X = np.moveaxis(X, -1, 1) # order the indices correctly to make sure it works in CNN
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()
    
    return X,ID,Y

In [8]:
def get_neighbor_grid(full, hw, block = 1):
    
    # get the nonzero (built) blocks by checking if they have a ID

    h = hw[0]
    w = hw[1]
    
    y = full[h,w,-1]
    ID = full[h,w,-2]
    
    hu = h - block
    hd = h + block
    hshort, hextra, wshort, wextra = 0,0,0,0
    if hu < 0:
        hshort = 0 - hu
        hu = 0
    if hd >= full.shape[0]:
        hextra = (hd - full.shape[0]) + 1
        hd = full.shape[0]

    wr = w + block
    wl = w - block

    if wr >= full.shape[1]:
        wextra = (wr - full.shape[1]) + 1
        wr = full.shape[1]
    if wl < 0:
        wshort = 0 - wl
        wl = 0

    nb = full[hu : hd + 1, wl : wr + 1, :]
    nb = np.pad(nb, ((hshort, hextra), (wshort, wextra), (0,0)), mode = "constant", constant_values = 0)
    return nb[:,:,:-2], ID, y


In [9]:
X_train = []
Y_train = []
ID_train = []
for filename in os.listdir("../Data/filled/grids/2015/"):
    n = np.load("../Data/filled/grids/2015/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_train.append(X)
    Y_train.append(Y)
    ID_train.append(ID)
    
Y_train = np.concatenate(Y_train)
ID_train = np.concatenate(ID_train)
X_train = np.concatenate(X_train)

X_train = np.moveaxis(X_train, 1, -1)
X_train = X_train.reshape(-1, 39)

X_train = ss.fit_transform(X_train)
X_train = X_train.reshape(-1, 11, 11, 39)
X_train = X_train.reshape(X_train.shape[0], -1)

oversampler = SMOTE()
X_train_resamp, Y_train_resamp = oversampler.fit_resample(X_train, Y_train)
X_train = X_train.reshape(-1, 11, 11, 39)
X_train_resamp = X_train_resamp.reshape(-1, 11, 11, 39)

X_train = np.moveaxis(X_train, -1, 1)
X_train_resamp = np.moveaxis(X_train_resamp, -1, 1) 




X_val = []
Y_val = []
ID_val = []

for filename in os.listdir("../Data/filled/grids/2016/"):
    n = np.load("../Data/filled/grids/2016/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_val.append(X)
    Y_val.append(Y)
    ID_val.append(ID)
    
X_val = np.concatenate(X_val)
X_val = np.moveaxis(X_val, 1, -1)
X_val = X_val.reshape(-1, 39)

X_val = ss.transform(X_val)


X_val = X_val.reshape(-1, 11, 11, 39)
X_val = np.moveaxis(X_val, -1, 1)
Y_val = np.concatenate(Y_val)
ID_val = np.concatenate(ID_val)


In [43]:
def get_batch(X, Y, batch_size = 32):
    
    idxs = rng.integers(X.shape[0], size = batch_size)
    
    X = torch.from_numpy(X[idxs]).float()
    Y = torch.from_numpy(Y[idxs]).float()
    

    return X, Y


In [11]:
class Model1(nn.Module):
    def __init__(self, name):
        super(Model1, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 32, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64, 8),
            nn.ReLU())
        
        self.final = nn.Sequential(
            nn.Linear(8,1),
            nn.Sigmoid())
            
        self.name = name
        
        
    def forward(self, x):
        out = self.net(x)
        out = self.final(out)
        return out
    
    def get_no_activation(self, x):
        out = self.net(x)
        return out

In [12]:
class Model2(nn.Module):
    def __init__(self, name):
        super(Model2, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 64),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [13]:
class Model3(nn.Module):
    def __init__(self, name):
        super(Model3, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [14]:
class Model4(nn.Module):
    def __init__(self, name):
        super(Model4, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(1152, 512),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [15]:
model1 = Model1("een")
model2 = Model2("twee")
model3 = Model3("drie")
model4 = Model4("vier")
models = [model1, model2, model3, model4]

In [16]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

In [47]:
import time
# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, X_val, Y_val, num_epochs, batch_per_e = 500):
    optimizer = optim.RMSprop(model.parameters(), lr=0.0001) 
    BCEloss = nn.BCELoss()
    train_loss = []
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []
    cmc_best = 0
    
    

    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        for batch in range(batch_per_e):
            
            x_train_, y_train_ = get_batch(X_train, Y_train)
            model.train()
            
            optimizer.zero_grad()
            t0 = time.time()
            out = model(X_train)
            print(t0-time.time())
            loss = BCEloss(out.squeeze(), y_train_)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                train_loss.append(loss)
                
        model.eval()
        
        x_val_, y_val_ = get_batch(X_val, Y_val, batch_size = 1000)
        predictions = model(x_val_).detach().numpy()
        predictions = (predictions > 0.5).astype(int)
        
        acc = accuracy_score(y_val_, predictions)
        ROC = roc_auc_score(y_val_, predictions)
        f1 = f1_score(y_val_, predictions)
        train_loss = (np.sum(train_loss) / (batch_per_e/10)).detach().item()

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)
        
        
        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(train_loss, acc, ROC , f1))
        train_loss = []


    return acc_history, ROC_history, train_loss_history, f1_score_history


        

In [48]:


for mod in models:
    hists = train_model(mod, X_train, Y_train, X_val, Y_val, 250)
    n_function = neighbor_part(mod, X_train, ID_train, wt)
        
    oversample = SMOTE()
    x, y = oversample.fit_resample(n_function, Y_train)

    clf_bagger = RandomForestClassifier(max_depth = 12, oob_score = True)
    clf_bagger.fit(x,y)
    
    n_function = neighbor_part(mod, X_val, ID_val, wv)
    preds = clf_bagger.predict(n_function)
    totacc = accuracy_score(Y_val, preds)
    totf1 = f1_score(Y_val, preds)
    totROC = roc_auc_score(Y_val, preds)
    
    
    with open("../results/CNN/" + mod.name + ".csv", "a+") as f:
        f.write("loss;acc;ROC;f1_score\n")
        f.write(str(hists[2]) + ";" + str(hists[0]) + ";" + str(hists[1]) + ";" + str(hists[3]) + "\n")
        f.write("--;" + str(totacc) + ";" + str(totROC) + ";" + str(totf1))
    
    
    

epoch: 0 of 250
-2.5182645320892334


NameError: name 'y_train_' is not defined

In [38]:
X_train.shape

(56367, 39, 11, 11)

In [32]:
def neighbor_part(model, X, idxs, w):
    preds = model(X)
    preds = pd.DataFrame(np.array(preds.detach())).set_index(idxs)
    
    neighbors = [w.neighbors[x] for x in idxs]
    transitions = [preds.loc[x].values for x in neighbors]



    n_function = np.zeros((len(preds), w.max_neighbors + 1))
    for i, (t, idx) in enumerate(zip(transitions, idxs)):
        n_function[i, 1:len(t) + 1] = t.squeeze()
        n_function[i, 0] = preds.loc[idx]
        
    return n_function

# Neighbor function

In [34]:
import geopandas as gpd
import shapely
import libpysal

def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [35]:
df15 = load_data(2015)
df16 = load_data(2016)
wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

In [36]:

scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [51]:
cluster_model = KMeans(n_clusters=5, random_state=0)
labels15 = pd.DataFrame(cluster_model.fit_predict(X15))
labels16 = pd.DataFrame(cluster_model.predict(X16))

labels15["X"] = X15.index
labels16["X"] = X16.index


labels15 = labels15.set_index(0)
labels16 = labels16.set_index(0)

In [42]:
with open('cbs_id_koppel.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [43]:
# br.C28992R100 = br.C28992R100.map(b) # change C28992code for id
labels15.C28 = labels15.C28.map(b)

In [48]:
labels15

Unnamed: 0_level_0,X,C28
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2748208
0,1,2748211
0,2,2748212
0,3,2748245
0,4,2748263
...,...,...
0,56362,1413137
0,56363,1413181
0,56364,1414800
0,56365,1414808


In [53]:
X_train[labels15.loc[0]["X"]].shape

(10237, 39, 11, 11)

In [54]:
labels15.loc[0].shape

(10237, 1)

In [50]:
ID_train

array([2893058., 2870455., 2868833., ..., 1311910., 1176428., 1314791.])

In [84]:

# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
neighbors = [w15.neighbors[x] for x in ID_train]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_train)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [85]:
params = {"max_depth": [6, 8, 10, 12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=6, oob_score=True;, score=0.641 total time=   4.8s
[CV 2/5] END .......max_depth=6, oob_score=True;, score=0.649 total time=   4.6s
[CV 3/5] END .......max_depth=6, oob_score=True;, score=0.663 total time=   4.7s
[CV 4/5] END .......max_depth=6, oob_score=True;, score=0.619 total time=   4.8s
[CV 5/5] END .......max_depth=6, oob_score=True;, score=0.681 total time=   4.7s
[CV 1/5] END ......max_depth=6, oob_score=False;, score=0.639 total time=   4.1s
[CV 2/5] END ......max_depth=6, oob_score=False;, score=0.652 total time=   4.3s
[CV 3/5] END ......max_depth=6, oob_score=False;, score=0.663 total time=   4.0s
[CV 4/5] END ......max_depth=6, oob_score=False;, score=0.619 total time=   3.9s
[CV 5/5] END ......max_depth=6, oob_score=False;, score=0.682 total time=   4.4s
[CV 1/5] END .......max_depth=8, oob_score=True;, score=0.647 total time=   5.4s
[CV 2/5] END .......max_depth=8, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [88]:
clf_bagger.best_params_

{'max_depth': 12, 'oob_score': False}

In [None]:
np.array(preds.detach())

In [90]:
X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [91]:
with open('w16.pickle', 'rb') as handle:
    w16 = pickle.load(handle)

In [92]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)
neighbors = [w16.neighbors[x] for x in ID_val]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_val)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [93]:
preds = clf_bagger.predict(n_function)

In [94]:
len(neighbors)

57050

In [95]:
clf_bagger.score(n_function, Y_val)

0.5448808332705778

In [96]:
recall_score(Y_val, preds)

0.22256568778979907

In [97]:
precision_score(Y_val, preds)

0.1262683201803833

In [246]:
(Y_val == 1).sum()

4529

# No bagger on the end

In [277]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [279]:
preds["y"] = Y_val

In [280]:
preds = preds.sort_values(by=[0], ascending = False)

In [281]:
preds

Unnamed: 0,0,y
3699708.0,9.999919e-01,0.0
2206980.0,9.999806e-01,0.0
2206983.0,9.999349e-01,0.0
2206982.0,9.999343e-01,0.0
2210563.0,9.999300e-01,0.0
...,...,...
2654130.0,3.897061e-21,0.0
2113362.0,1.054448e-21,0.0
3705586.0,2.995088e-22,0.0
3703981.0,6.478963e-23,0.0


In [282]:
preds[0].iloc[0:4529] = 1
preds[0].iloc[4529:] = 0

In [283]:
recall_score(preds["y"], preds[0])


0.17244424817840584

In [284]:
precision_score(preds["y"], preds[0])

0.17244424817840584