In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

Device: cpu


In [179]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
import geopandas as gpd
import shapely
import libpysal


In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [181]:
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
        
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
#     df.plot(figsize = (40,40), column = "y")
#     plt.show()
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df 

In [182]:
df15 = load_data(2015)
df16 = load_data(2016)
# df17 = load_data(2017)
# df18 = load_data(2018)
# df19 = load_data(2019)

wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv =libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

In [164]:
def create_CNN_samples(grid, block, dims = 39):
    
    nonzero = np.transpose(grid[:,:,-2].nonzero()) # Get indices of nonzero componetns
    size = nonzero.shape[0]
    width = block * 2 + 1 # calculate widht and height. Needed later on
    
    X = np.zeros((size, width, width, dims))
    Y = np.zeros(size)
    ID = np.zeros(size)
    Y_1 = np.zeros(size)
    
    for idx, i in enumerate(nonzero):
        x, ID[idx], Y[idx], Y_1[idx] = get_neighbor_grid(grid, i, block)
        X[idx] = x.reshape(width,width, dims)
        
    X = np.moveaxis(X, -1, 1) # order the indices correctly to make sure it works in CNN
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()
    
    return X,ID,Y, Y_1

In [165]:
def get_neighbor_grid(full, hw, block = 1):
    
    # get the nonzero (built) blocks by checking if they have a ID

    h = hw[0]
    w = hw[1]
    
    y = full[h,w,-1]
    ID = full[h,w,-2]
    Y_1_train = full[h,w,-3]
    
    hu = h - block
    hd = h + block
    hshort, hextra, wshort, wextra = 0,0,0,0
    if hu < 0:
        hshort = 0 - hu
        hu = 0
    if hd >= full.shape[0]:
        hextra = (hd - full.shape[0]) + 1
        hd = full.shape[0]

    wr = w + block
    wl = w - block

    if wr >= full.shape[1]:
        wextra = (wr - full.shape[1]) + 1
        wr = full.shape[1]
    if wl < 0:
        wshort = 0 - wl
        wl = 0

    nb = full[hu : hd + 1, wl : wr + 1, :]
    nb = np.pad(nb, ((hshort, hextra), (wshort, wextra), (0,0)), mode = "constant", constant_values = 0)
    return nb[:,:,:-3], ID, y, Y_1_train


In [224]:
##  trainingset
X_train = []
Y_train = []
ID_train = []
Y_1_train = []
for filename in os.listdir("../Data/filled/grids/2015/"):
    n = np.load("../Data/filled/grids/2015/" + filename)
    X, ID, Y, Y_1 = create_CNN_samples(n, 5)
    X_train.append(X)
    Y_train.append(Y)
    ID_train.append(ID)
    Y_1_train.append(Y_1)
    

    
Y_train = np.concatenate(Y_train)
ID_train = np.concatenate(ID_train)
X_train = np.concatenate(X_train)

# create Y_1_train
Y_1_train = np.concatenate(Y_1_train)
neighbors = [wt.neighbors[x] for x in wt.neighbors]
Y_1_train = np.array([Y_1_train[x].sum() for x in neighbors])
Y_1_train[np.where(Y_1_train == 0)[0]] = 0.5

#reshape to rescale
X_train = np.moveaxis(X_train, 1, -1)
X_train = X_train.reshape(-1, 39)
X_train = ss.fit_transform(X_train) #rescale

# reshape to oversample, append y_1 to resample
X_train = X_train.reshape(-1, 11, 11, 39)
X_train = X_train.reshape(X_train.shape[0], -1)
print(X_train.shape)
X_train = np.append(X_train, Y_1_train.reshape(-1,1), axis = 1)

oversampler = SMOTE() #resample
X_train_resamp, Y_train_resamp = oversampler.fit_resample(X_train, Y_train)

del(X_train)
del(Y_train)

# 0.5 means no neighbors, otherwise make int
X_train_resamp[X_train_resamp[:,-1]>0.5][:,-1] =  np.ceil(X_train_resamp[X_train_resamp[:,-1] > 0.5][:,-1]) 
Y_1_train_resamp = X_train_resamp[:,-1]
X_train_resamp = np.delete(X_train_resamp, 39, 1)

# reshape to grid
X_train_resamp = X_train_resamp.reshape(-1, 11, 11, 39)

# move axis for pytorch
X_train_resamp = np.moveaxis(X_train_resamp, -1, 1) 



## validation set
X_val = []
Y_val = []
ID_val = []
Y_1_val = []

for filename in os.listdir("../Data/filled/grids/2016/"):
    n = np.load("../Data/filled/grids/2016/" + filename)
    X, ID, Y, Y_1 = create_CNN_samples(n, 5)
    X_val.append(X)
    Y_val.append(Y)
    ID_val.append(ID)
    Y_1_val.append(Y_1)
    
# create and transform X_val
X_val = np.concatenate(X_val)
X_val = np.moveaxis(X_val, 1, -1)
X_val = X_val.reshape(-1, 39)
X_val = ss.transform(X_val)
X_val = X_val.reshape(-1, 11, 11, 39)
X_val = np.moveaxis(X_val, -1, 1)

# create Y_1_val
Y_1_val = np.concatenate(Y_1_val)
neighbors = [wv.neighbors[x] for x in wv.neighbors]
Y_1_val = np.array([Y_1_val[x].sum() for x in neighbors])
Y_1_val[np.where(Y_1_val == 0)[0]] = 0.5

# create Y-val and ID_val
Y_val = np.concatenate(Y_val)
ID_val = np.concatenate(ID_val)


(21654, 4719)


In [222]:
print(X_train.shape)

(21654, 4720)


In [225]:
def get_batch(X, Y, Y1, batch_size = 32):
    
    idxs = rng.integers(X.shape[0], size = batch_size)
    
    X = torch.from_numpy(X[idxs]).float()
    Y = torch.from_numpy(Y[idxs]).float()
    Y1 = torch.from_numpy(Y1[idxs]).float()

    return X, Y, Y1


In [322]:
class Model(nn.Module):
    def __init__(self, name):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Dropout2d(0.25),
            nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(2304, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 1))
        
        self.final = nn.Sigmoid()
        self.name = name
        
    def forward(self, x):
        out = self.net(x)
        return out

In [348]:
class Model(nn.Module):
    def __init__(self, name):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Dropout2d(0.25),
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(1152, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 1))
        
        self.final = nn.Sigmoid()
        self.name = name
        
    def forward(self, x):
        out = self.net(x)
        return out

In [349]:
model = Model("een")

In [350]:
x,_,_ = get_batch(X_train_resamp, Y_train_resamp, Y_1_train_resamp)
model(x)

torch.Size([32, 128])


tensor([[-0.0958,  0.3128,  0.0595,  ...,  0.1434,  0.0863,  0.0574],
        [-0.1355,  0.2457,  0.1929,  ...,  0.2075,  0.1261,  0.1126],
        [-0.0372,  0.2795,  0.1792,  ...,  0.1548,  0.0780,  0.1036],
        ...,
        [-0.1607,  0.1194,  0.1279,  ...,  0.0325, -0.0077,  0.1063],
        [ 0.0212,  0.2296,  0.3290,  ..., -0.0576, -0.0866,  0.1421],
        [-0.0897,  0.2598,  0.2694,  ...,  0.1779,  0.0755,  0.0188]],
       grad_fn=<AddmmBackward>)

In [339]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

In [340]:
import time
# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, Y1_train, X_val, Y_val, Y1_val, num_epochs, batch_per_e = 500):
    optimizer = optim.RMSprop(model.parameters(), lr=0.00001) 
    rm = RandomForesClassifier(max_depth = )
    BCEloss = nn.BCEWithLogitsLoss()
    train_loss = []
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []

    
    

    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        for batch in range(batch_per_e):
            optimizer.zero_grad()
            x, y, y1 = get_batch(X_train, Y_train, Y1_train)
            model.train()
            


            out = model(x)
            b = out.squeeze() * y1
            
            loss = BCEloss(b, y)
            loss.backward()
            optimizer.step()
            
            if batch % 10 == 0:
                train_loss.append(loss)
                
        model.eval()
        
        x, y, y1 = get_batch(X_val, Y_val ,Y1_val, batch_size = 1000)
        out = model(x)
        out = model.final(out)
        
        out = out.squeeze() * y1
        
        preds = np.zeros(len(out))
        pos = out.argsort()[-((y == 1).sum()) :]
        preds[pos] = 1
        


        acc = accuracy_score(y, preds)
        ROC = roc_auc_score(y, preds)
        f1 = f1_score(y, preds)
        train_loss = (np.sum(train_loss) / (batch_per_e/10)).detach().item()

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)
        
        
        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(train_loss, acc, ROC , f1))
        train_loss = []


    return acc_history, ROC_history, train_loss_history, f1_score_history


        

In [341]:
train_model(model, X_train_resamp, Y_train_resamp, Y_1_train_resamp, X_val, Y_val, Y_1_val, 50)

epoch: 0 of 50
training_loss: 0.6565, acc: 0.766, ROC: 0.560, f1: 0.259
epoch: 1 of 50
training_loss: 0.6289, acc: 0.824, ROC: 0.598, f1: 0.296
epoch: 2 of 50
training_loss: 0.6323, acc: 0.772, ROC: 0.540, f1: 0.214
epoch: 3 of 50
training_loss: 0.6181, acc: 0.782, ROC: 0.550, f1: 0.227
epoch: 4 of 50
training_loss: 0.6322, acc: 0.780, ROC: 0.510, f1: 0.147
epoch: 5 of 50
training_loss: 0.6060, acc: 0.804, ROC: 0.593, f1: 0.300
epoch: 6 of 50
training_loss: 0.6097, acc: 0.782, ROC: 0.518, f1: 0.162
epoch: 7 of 50
training_loss: 0.6055, acc: 0.784, ROC: 0.543, f1: 0.212
epoch: 8 of 50
training_loss: 0.5954, acc: 0.784, ROC: 0.569, f1: 0.265
epoch: 9 of 50
training_loss: 0.5926, acc: 0.782, ROC: 0.524, f1: 0.174
epoch: 10 of 50
training_loss: 0.5859, acc: 0.796, ROC: 0.543, f1: 0.203
epoch: 11 of 50
training_loss: 0.5877, acc: 0.776, ROC: 0.551, f1: 0.233
epoch: 12 of 50
training_loss: 0.5684, acc: 0.788, ROC: 0.580, f1: 0.284
epoch: 13 of 50
training_loss: 0.5677, acc: 0.798, ROC: 0.551

KeyboardInterrupt: 

In [276]:
(Y_train_resamp == 1).sum()

19922

In [277]:
Y_1_train_resamp.shape

(39844,)

In [38]:
X_train.shape

(56367, 39, 11, 11)

In [32]:
def neighbor_part(model, X, idxs, w):
    preds = model(X)
    preds = pd.DataFrame(np.array(preds.detach())).set_index(idxs)
    
    neighbors = [w.neighbors[x] for x in idxs]
    transitions = [preds.loc[x].values for x in neighbors]



    n_function = np.zeros((len(preds), w.max_neighbors + 1))
    for i, (t, idx) in enumerate(zip(transitions, idxs)):
        n_function[i, 1:len(t) + 1] = t.squeeze()
        n_function[i, 0] = preds.loc[idx]
        
    return n_function

# Neighbor function

In [34]:
import geopandas as gpd
import shapely
import libpysal

def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [35]:
df15 = load_data(2015)
df16 = load_data(2016)
wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

In [36]:

scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [51]:
cluster_model = KMeans(n_clusters=5, random_state=0)
labels15 = pd.DataFrame(cluster_model.fit_predict(X15))
labels16 = pd.DataFrame(cluster_model.predict(X16))

labels15["X"] = X15.index
labels16["X"] = X16.index


labels15 = labels15.set_index(0)
labels16 = labels16.set_index(0)

In [42]:
with open('cbs_id_koppel.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [43]:
# br.C28992R100 = br.C28992R100.map(b) # change C28992code for id
labels15.C28 = labels15.C28.map(b)

In [48]:
labels15

Unnamed: 0_level_0,X,C28
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2748208
0,1,2748211
0,2,2748212
0,3,2748245
0,4,2748263
...,...,...
0,56362,1413137
0,56363,1413181
0,56364,1414800
0,56365,1414808


In [53]:
X_train[labels15.loc[0]["X"]].shape

(10237, 39, 11, 11)

In [54]:
labels15.loc[0].shape

(10237, 1)

In [50]:
ID_train

array([2893058., 2870455., 2868833., ..., 1311910., 1176428., 1314791.])

In [84]:

# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
neighbors = [w15.neighbors[x] for x in ID_train]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_train)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [85]:
params = {"max_depth": [6, 8, 10, 12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=6, oob_score=True;, score=0.641 total time=   4.8s
[CV 2/5] END .......max_depth=6, oob_score=True;, score=0.649 total time=   4.6s
[CV 3/5] END .......max_depth=6, oob_score=True;, score=0.663 total time=   4.7s
[CV 4/5] END .......max_depth=6, oob_score=True;, score=0.619 total time=   4.8s
[CV 5/5] END .......max_depth=6, oob_score=True;, score=0.681 total time=   4.7s
[CV 1/5] END ......max_depth=6, oob_score=False;, score=0.639 total time=   4.1s
[CV 2/5] END ......max_depth=6, oob_score=False;, score=0.652 total time=   4.3s
[CV 3/5] END ......max_depth=6, oob_score=False;, score=0.663 total time=   4.0s
[CV 4/5] END ......max_depth=6, oob_score=False;, score=0.619 total time=   3.9s
[CV 5/5] END ......max_depth=6, oob_score=False;, score=0.682 total time=   4.4s
[CV 1/5] END .......max_depth=8, oob_score=True;, score=0.647 total time=   5.4s
[CV 2/5] END .......max_depth=8, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [88]:
clf_bagger.best_params_

{'max_depth': 12, 'oob_score': False}

In [None]:
np.array(preds.detach())

In [90]:
X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [91]:
with open('w16.pickle', 'rb') as handle:
    w16 = pickle.load(handle)

In [92]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)
neighbors = [w16.neighbors[x] for x in ID_val]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_val)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [93]:
preds = clf_bagger.predict(n_function)

In [94]:
len(neighbors)

57050

In [95]:
clf_bagger.score(n_function, Y_val)

0.5448808332705778

In [96]:
recall_score(Y_val, preds)

0.22256568778979907

In [97]:
precision_score(Y_val, preds)

0.1262683201803833

In [246]:
(Y_val == 1).sum()

4529

# No bagger on the end

In [277]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [279]:
preds["y"] = Y_val

In [280]:
preds = preds.sort_values(by=[0], ascending = False)

In [281]:
preds

Unnamed: 0,0,y
3699708.0,9.999919e-01,0.0
2206980.0,9.999806e-01,0.0
2206983.0,9.999349e-01,0.0
2206982.0,9.999343e-01,0.0
2210563.0,9.999300e-01,0.0
...,...,...
2654130.0,3.897061e-21,0.0
2113362.0,1.054448e-21,0.0
3705586.0,2.995088e-22,0.0
3703981.0,6.478963e-23,0.0


In [282]:
preds[0].iloc[0:4529] = 1
preds[0].iloc[4529:] = 0

In [283]:
recall_score(preds["y"], preds[0])


0.17244424817840584

In [284]:
precision_score(preds["y"], preds[0])

0.17244424817840584