In [1]:
import torch
# The device is pinned to the CPU. The commented-out line below is the automatic
# CUDA/CPU selection; swap it back in to run on a GPU when one is available.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
print("Device: {}".format(device))

Device: cpu


In [2]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
import geopandas as gpd
import shapely
import libpysal
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.cluster import KMeans
from collections import Counter

In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [4]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [5]:
def load_data(year):
    """Load every CSV found in ``../Data/filled/<year>/`` into one GeoDataFrame.

    Parameters
    ----------
    year : int or str
        Name of the sub-folder of ``../Data/filled/`` to read.

    Returns
    -------
    geopandas.GeoDataFrame
        All rows of all files, in ``os.listdir`` order, with a fresh 0..n-1
        index, the ``"Unnamed: 0"`` column removed and the ``geometry`` column
        parsed from WKT into shapely objects.

    Raises
    ------
    FileNotFoundError
        If the folder contains no files.

    Notes
    -----
    The name of every file that has a missing geometry is printed.
    ``os.listdir`` returns names in arbitrary order, so the row order is only
    as stable as the directory listing.
    """
    path = "../Data/filled/" + str(year) + "/"

    # Collect the per-file frames and concatenate once instead of growing a
    # frame inside the loop.
    frames = []
    for filename in os.listdir(path):
        tile = pd.read_csv(path + filename)
        if tile.geometry.isna().any():
            print(filename)
        frames.append(tile)

    if not frames:
        raise FileNotFoundError("No CSV files found in {}".format(path))

    # ignore_index replaces the former reset_index() + drop("index") pair.
    df = gpd.GeoDataFrame(pd.concat(frames, ignore_index = True))
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    df = df.drop(["Unnamed: 0"], axis = 1)
    return df

In [7]:
# Yearly frames: 2017 is used for training, 2018 for validation, 2019 for testing.
# (2015 and 2016 are currently disabled.)
# df15 = load_data(2015)
# df16 = load_data(2016)
df17 = load_data(2017)
df18 = load_data(2018)
df19 = load_data(2019)

# Binary distance-band spatial weights per year: two observations are neighbours
# when they lie within `threshold` (150, in the units of the geometry) of each other.
wt = libpysal.weights.DistanceBand.from_dataframe(df17, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df18, threshold=150, binary = True, silence_warnings = True)
wtest = libpysal.weights.DistanceBand.from_dataframe(df19, threshold=150, binary = True, silence_warnings = True)

In [8]:
# Clustering features: every column except the last five.
# NOTE(review): presumably the last five columns hold ids / targets / geometry - confirm.
cluster_train = df17[df17.columns[:-5]]
cluster_val = df18[df18.columns[:-5]]
cluster_test = df19[df19.columns[:-5]]
# The scaler is fitted on the 2017 rows only and then re-used for 2018 and 2019.
cluster_train = ss.fit_transform(cluster_train)
cluster_val = ss.transform(cluster_val)
cluster_test = ss.transform(cluster_test)



# K-means with three clusters, fitted on the training year; the other years are
# assigned to the nearest training centroid. Labels are stored in column 0.
cluster_model = KMeans(n_clusters = 3, random_state = 1)
train_l = pd.DataFrame(cluster_model.fit_predict(cluster_train))
val_l = pd.DataFrame(cluster_model.predict(cluster_val))
test_l= pd.DataFrame(cluster_model.predict(cluster_test))

In [11]:
Counter(train_l[0])

Counter({2: 1242, 0: 4894, 1: 2882})

In [12]:
Counter(val_l[0])

Counter({2: 1282, 0: 4897, 1: 2932})

In [13]:
Counter(test_l[0])

Counter({2: 1239, 0: 5011, 1: 2932})

In [14]:
def create_CNN_samples(grid, block, dims = 39):
    """Cut one square window out of ``grid`` for every cell with a non-zero ID.

    Parameters
    ----------
    grid : numpy.ndarray
        3-D array (rows, columns, layers). Layer ``-2`` is read as the cell ID;
        a cell is sampled when that value is non-zero.
    block : int
        Window radius; each window is ``2 * block + 1`` cells wide and high.
    dims : int
        Number of feature layers per window cell.

    Returns
    -------
    X : torch.FloatTensor of shape (n, dims, side, side)
    ID : numpy.ndarray of shape (n,)
    Y : torch.FloatTensor of shape (n,)
    Y_1 : numpy.ndarray of shape (n,)
        ``ID``, ``Y`` and ``Y_1`` are the three per-cell values returned by
        ``get_neighbor_grid`` for the centre cell of each window.
    """
    side = 2 * block + 1

    # (row, column) pairs of all cells whose ID layer is non-zero.
    centres = np.transpose(np.nonzero(grid[:, :, -2]))
    n_samples = centres.shape[0]

    X = np.zeros((n_samples, side, side, dims))
    ID = np.zeros(n_samples)
    Y = np.zeros(n_samples)
    Y_1 = np.zeros(n_samples)

    for k in range(n_samples):
        window, ID[k], Y[k], Y_1[k] = get_neighbor_grid(grid, centres[k], block)
        X[k] = window.reshape(side, side, dims)

    # Conv2d expects channels first: (n, side, side, dims) -> (n, dims, side, side).
    X = torch.from_numpy(np.moveaxis(X, -1, 1)).float()
    Y = torch.from_numpy(Y).float()

    return X, ID, Y, Y_1

In [15]:
def get_neighbor_grid(full, hw, block = 1):
    """Return the zero-padded square window around one grid cell.

    Parameters
    ----------
    full : numpy.ndarray
        3-D array (rows, columns, layers). For the centre cell, layer ``-1`` is
        returned as ``y``, layer ``-2`` as ``ID`` and layer ``-3`` as ``Y_1``.
    hw : sequence of two ints
        Row and column of the centre cell.
    block : int
        Window radius; the window is ``2 * block + 1`` cells wide and high.

    Returns
    -------
    window : numpy.ndarray
        Shape ``(2 * block + 1, 2 * block + 1, layers - 3)``. Parts of the
        window that fall outside ``full`` are filled with zeros.
    ID, y, Y_1
        The three trailing layer values of the centre cell (see above).
    """
    row, col = hw[0], hw[1]
    n_rows, n_cols = full.shape[0], full.shape[1]

    target = full[row, col, -1]
    cell_id = full[row, col, -2]
    previous = full[row, col, -3]

    # Window bounds clipped to the grid.
    top = max(row - block, 0)
    bottom = min(row + block, n_rows - 1)
    left = max(col - block, 0)
    right = min(col + block, n_cols - 1)

    # Number of cells that were clipped away on each side and must be padded back.
    pad_top = top - (row - block)
    pad_bottom = (row + block) - bottom
    pad_left = left - (col - block)
    pad_right = (col + block) - right

    window = full[top : bottom + 1, left : right + 1, :]
    window = np.pad(
        window,
        ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
        mode = "constant",
        constant_values = 0,
    )
    # The last three layers are per-cell metadata, not features.
    return window[:, :, :-3], cell_id, target, previous


In [16]:
N_FEATURES = 39                      # feature layers per grid cell
BLOCK = 10                           # window radius -> windows are 2 * BLOCK + 1 cells wide
GRID_DIR = "../Data/filled/grids/"   # one sub-folder of .npy grids per year


def load_grid_samples(year, block = BLOCK):
    """Build the CNN samples of every grid file stored for ``year``.

    Returns numpy arrays ``(X, Y, ID, Y_1)`` concatenated over all files in
    ``os.listdir`` order; ``X`` has shape (n, N_FEATURES, side, side).
    """
    folder = GRID_DIR + str(year) + "/"
    X_parts, Y_parts, ID_parts, Y1_parts = [], [], [], []
    for filename in os.listdir(folder):
        grid = np.load(folder + filename)
        X, ID, Y, Y_1 = create_CNN_samples(grid, block, N_FEATURES)
        X_parts.append(X)
        Y_parts.append(Y)
        ID_parts.append(ID)
        Y1_parts.append(Y_1)
    return (np.concatenate(X_parts), np.concatenate(Y_parts),
            np.concatenate(ID_parts), np.concatenate(Y1_parts))


def neighbor_sum(values, w):
    """Sum ``values`` over the neighbours of every observation in ``w``.

    ``values[i]`` must belong to the i-th observation of ``w``. Sums equal to
    zero are replaced by 0.5; the models multiply their output by this number,
    so presumably the 0.5 keeps that product from collapsing to zero.
    """
    if len(values) != len(w.neighbors):
        raise ValueError("got {} values for {} observations in the weights object"
                         .format(len(values), len(w.neighbors)))
    sums = np.array([values[w.neighbors[key]].sum() for key in w.neighbors])
    sums[sums == 0] = 0.5
    return sums


def scale_windows(X, scaler, fit = False):
    """Standardise each feature channel of ``X`` (n, c, h, w) with ``scaler``.

    Returns the scaled data channels-last, shape (n, h, w, c). With
    ``fit=True`` the scaler is (re)fitted on ``X`` first.
    """
    n_samples, n_channels, height, width = X.shape
    flat = np.moveaxis(X, 1, -1).reshape(-1, n_channels)
    flat = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    return flat.reshape(n_samples, height, width, n_channels)


## training set (2017)
X_train, Y_train, ID_train, Y_1_train = load_grid_samples(2017)
block_size = X_train.shape[-1]
Y_1_train = neighbor_sum(Y_1_train, wt)
# Note: this re-fits the shared scaler `ss` on the grid features.
X_train = scale_windows(X_train, ss, fit = True)
# Kept flat (n, side * side * N_FEATURES, channels-last) because train_model
# oversamples the rows before reshaping them back into windows.
X_train = X_train.reshape(X_train.shape[0], -1)

## validation set (2018)
X_val, Y_val, ID_val, Y_1_val = load_grid_samples(2018)
X_val = np.moveaxis(scale_windows(X_val, ss), -1, 1)
X_val = torch.tensor(X_val).float().to(device)
Y_1_val = torch.tensor(neighbor_sum(Y_1_val, wv)).float().to(device)

## test set (2019)
X_test, Y_test, ID_test, Y_1_test = load_grid_samples(2019)
X_test = np.moveaxis(scale_windows(X_test, ss), -1, 1)
X_test = torch.tensor(X_test).float().to(device)
Y_1_test = torch.tensor(neighbor_sum(Y_1_test, wtest)).float().to(device)

In [17]:
def get_batch(X, Y, Y1, batch_size = 32, generator = None):
    """Draw a random mini-batch (with replacement) from three aligned arrays.

    Parameters
    ----------
    X, Y, Y1 : array-like or tensor
        Indexable along their first axis with an integer index array; the same
        row indices are applied to all three.
    batch_size : int
        Number of rows to draw.
    generator : numpy.random.Generator, optional
        Source of randomness. Defaults to the notebook-level ``rng``; pass an
        explicit generator to make a draw reproducible on its own.

    Returns
    -------
    tuple
        ``(X[idxs], Y[idxs], Y1[idxs])``.
    """
    if generator is None:
        generator = rng

    # integers() samples uniformly from [0, n_rows); duplicates are possible.
    idxs = generator.integers(X.shape[0], size = batch_size)
    return X[idxs], Y[idxs], Y1[idxs]


In [21]:
class Model2(nn.Module):
    """Three-convolution network that outputs one logit per sample.

    Layout: two conv(4x4) -> max-pool(2) -> ReLU -> channel-dropout stages
    (39 -> 15 -> 20 channels), a 3x3 convolution down to 5 channels, then a
    linear layer on the flattened result. ``Linear(5, 1)`` requires the
    convolution stack to end at 1x1, which is the case for 21x21 windows.

    The original header comment recorded "best auc = 0.58"; the same note is
    attached to every model variant in this notebook, so treat it as unverified.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(39, 15, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Conv2d(15, 20, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Conv2d(20, 5, kernel_size = 3),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(5, 1),
        ]
        self.net = nn.Sequential(*layers)
        # Not applied in forward(); callers use it to turn logits into probabilities.
        self.final = nn.Sigmoid()

    def forward(self, x, y1):
        """Return the network logit of every sample multiplied by ``y1``."""
        logits = self.net(x).squeeze()
        return logits * y1

In [22]:
class Model1(nn.Module):
    """Two-convolution network with a 150-unit hidden layer; one logit per sample.

    Layout: two conv(4x4) -> max-pool(2) -> ReLU -> channel-dropout stages
    (39 -> 15 -> 20 channels), then Linear(180, 150) -> dropout -> ReLU ->
    Linear(150, 1). The 180 inputs correspond to 20 channels of 3x3, which is
    what 21x21 windows produce.

    The original header comment recorded "best auc = 0.58"; the same note is
    attached to every model variant in this notebook, so treat it as unverified.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(39, 15, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Conv2d(15, 20, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Flatten(),
            nn.Linear(180, 150),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(150, 1),
        ]
        self.net = nn.Sequential(*layers)
        # Not applied in forward(); callers use it to turn logits into probabilities.
        self.final = nn.Sigmoid()

    def forward(self, x, y1):
        """Return the network logit of every sample multiplied by ``y1``."""
        logits = self.net(x).squeeze()
        return logits * y1

In [23]:

class Model3(nn.Module):
    """Smallest variant: two 5-channel convolutions and a single linear layer.

    Layout: two conv(4x4) -> max-pool(2) -> ReLU -> channel-dropout stages
    (39 -> 5 -> 5 channels) followed by Linear(45, 1). The 45 inputs
    correspond to 5 channels of 3x3, which is what 21x21 windows produce.

    The original header comment recorded "best auc = 0.58"; the same note is
    attached to every model variant in this notebook, so treat it as unverified.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(39, 5, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Conv2d(5, 5, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Flatten(),
            nn.Linear(45, 1),
        ]
        self.net = nn.Sequential(*layers)
        # Not applied in forward(); callers use it to turn logits into probabilities.
        self.final = nn.Sigmoid()

    def forward(self, x, y1):
        """Return the network logit of every sample multiplied by ``y1``."""
        logits = self.net(x).squeeze()
        return logits * y1

In [24]:
class Model(nn.Module):
    """Two-convolution network (5 and 15 channels) with a 150-unit hidden layer.

    Layout: two conv(4x4) -> max-pool(2) -> ReLU -> channel-dropout stages
    (39 -> 5 -> 15 channels), then Linear(135, 150) -> dropout -> ReLU ->
    Linear(150, 1). The 135 inputs correspond to 15 channels of 3x3, which is
    what 21x21 windows produce.

    The original header comment recorded "best auc = 0.58"; the same note is
    attached to every model variant in this notebook, so treat it as unverified.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Conv2d(39, 5, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Conv2d(5, 15, kernel_size = 4),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.3),
            nn.Flatten(),
            nn.Linear(135, 150),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(150, 1),
        ]
        self.net = nn.Sequential(*layers)
        # Not applied in forward(); callers use it to turn logits into probabilities.
        self.final = nn.Sigmoid()

    def forward(self, x, y1):
        """Return the network logit of every sample multiplied by ``y1``."""
        logits = self.net(x).squeeze()
        return logits * y1

In [26]:
# One instance of every architecture, moved to `device`.
# NOTE(review): train_model constructs its own fresh instances, so these four
# objects are not the ones that get trained.
model = Model().to(device)
model1 = Model1().to(device)
model2 = Model2().to(device)
model3 = Model3().to(device)

# Architecture keys swept by train_model: "0" -> Model, "1" -> Model1,
# "2" -> Model2, any other key -> Model3.
models = ["0", "2"]

In [27]:
print(X_train.shape, X_val.shape) # what is the axis order of X_train and X_val?

(9018, 17199) torch.Size([9111, 39, 21, 21])


In [31]:
LEARNING_RATES = (0.00000051, 0.0000051)   # learning rates swept for every cluster and architecture
TRAIN_BATCH_SIZE = 6                        # rows drawn per optimisation step


def _build_model(size):
    """Return a fresh network on `device` for architecture key ``size``.

    "0" -> Model, "1" -> Model1, "2" -> Model2, any other key -> Model3.
    """
    architectures = {"0": Model, "1": Model1, "2": Model2}
    return architectures.get(size, Model3)().to(device)


def train_model(models, X_train, Y_train, Y1_train, train_l, X_val, Y_val, Y1_val, val_l, num_epochs, lr, batch_per_e = 100):
    """Train one CNN per cluster and keep the best checkpoint of each.

    For every cluster label in ``train_l`` the training rows of that cluster
    are oversampled with SMOTE (when they contain more than 10 positives), and
    every combination of ``LEARNING_RATES`` and architecture key in ``models``
    is trained for ``num_epochs`` epochs. After each epoch the model is scored
    on the validation rows of the same cluster; the weights with the highest
    ROC AUC are kept.

    Parameters
    ----------
    models : iterable of str
        Architecture keys, see ``_build_model``.
    X_train : numpy.ndarray
        Flat, channels-last training windows, one row per sample.
    Y_train, Y1_train : numpy.ndarray
        Targets and neighbour sums aligned with ``X_train``.
    train_l, val_l : pandas.DataFrame
        Cluster label of every training / validation sample in column 0.
    X_val, Y1_val : torch.Tensor
        Validation windows (channels first) and neighbour sums.
    Y_val : numpy.ndarray
        Validation targets.
    num_epochs : int
        Epochs per (learning rate, architecture) combination.
    lr : float
        Kept for backward compatibility and not used: the sweep always covers
        ``LEARNING_RATES`` (the original loop variable shadowed this argument).
    batch_per_e : int
        Optimisation steps per epoch.

    Returns
    -------
    tuple
        ``(state_dicts, params, auc)``: per-cluster best weights, per-cluster
        ``[learning rate, architecture key, epoch]`` and the ROC AUC of the
        combined validation predictions. ``(nan, 0, 0)`` if that AUC is not
        positive.
    """
    n_features = 39
    clusters = train_l[0].unique()
    print("{}: train: {}, val: {}".format(len(clusters), Counter(train_l[0]), Counter(val_l[0])))

    # Y1 rides along as the last column so that SMOTE resamples it with the features.
    xtrain = np.append(X_train, Y1_train.reshape(-1, 1), axis = 1)

    clust_models = {}
    param_clust_hist = {}

    for clust in clusters:
        best_auc = 0

        train_idxs = train_l[train_l[0] == clust].index
        xt = xtrain[train_idxs]
        yt = Y_train[train_idxs]

        if np.sum(yt == 1) > 10:
            xt, yt = SMOTE().fit_resample(xt, yt)

        # SMOTE interpolates Y1; round interpolated values above 0.5 up to whole numbers.
        interpolated = xt[:, -1] > 0.5
        xt[interpolated, -1] = np.ceil(xt[interpolated, -1])
        y1t = xt[:, -1]
        # Strip the Y1 column (the last one) before rebuilding the windows.
        xt = xt[:, :-1]

        # Flat rows -> (n, side, side, features) -> channels first for Conv2d.
        xt = xt.reshape(-1, block_size, block_size, n_features)
        xt = np.moveaxis(xt, -1, 1)

        xt = torch.tensor(xt).float().to(device)
        yt = torch.tensor(yt).float().to(device)
        y1t = torch.tensor(y1t).float().to(device)

        # Validation rows of this cluster.
        val_idxs = val_l[val_l[0] == clust].index
        xv = X_val[val_idxs]
        y1v = Y1_val[val_idxs]
        yn = np.asarray(Y_val[val_idxs], dtype = np.float32)

        for learning_rate in LEARNING_RATES:
            for size in models:
                model = _build_model(size)
                optimizer = optim.Adam(model.parameters(), lr = learning_rate)
                criterion = nn.BCEWithLogitsLoss()

                for epoch in range(num_epochs):
                    model.train()
                    for _ in range(batch_per_e):
                        optimizer.zero_grad()
                        x, y, y1 = get_batch(xt, yt, y1t, TRAIN_BATCH_SIZE)
                        loss = criterion(model(x, y1), y)
                        loss.backward()
                        optimizer.step()

                    # Score without dropout and without building a graph.
                    model.eval()
                    with torch.no_grad():
                        probs = model.final(model(xv, y1v)).cpu().numpy().reshape(-1)
                    ROC = roc_auc_score(yn, probs)

                    if ROC > best_auc:
                        print("best UAC in cluster {}: {}".format(clust, ROC))
                        best_auc = ROC
                        param_clust_hist[clust] = [learning_rate, size, epoch]
                        clust_models[clust] = deepcopy(model.state_dict())

    # Score the combination of the per-cluster winners on the full validation set.
    predictions = np.zeros(len(Y_val))
    for clust in clusters:
        if clust not in clust_models:
            # No checkpoint ever beat an AUC of 0 for this cluster; leave its scores at 0.
            continue
        val_idxs = val_l[val_l[0] == clust].index
        model = _build_model(param_clust_hist[clust][1])
        model.load_state_dict(clust_models[clust])
        model.eval()  # a freshly constructed module starts in training mode
        with torch.no_grad():
            preds = model(X_val[val_idxs], Y1_val[val_idxs])
        predictions[val_idxs] = preds.cpu().numpy().reshape(-1)

    auc = roc_auc_score(Y_val, predictions)
    if auc > 0:
        print("auc!! {}".format(auc))
        return clust_models, param_clust_hist, auc
    return np.nan, 0, 0


In [None]:
# Train one model per cluster. `a` is the 3-tuple returned by train_model:
# (per-cluster state dicts, per-cluster [lr, model key, epoch], overall validation AUC).
# NOTE(review): the trailing learning-rate argument is overwritten inside
# train_model by its own `for lr in [...]` loop.
n_epochs = 800
a = train_model(models, X_train, Y_train, Y_1_train, train_l, X_val, Y_val, Y_1_val,val_l, n_epochs, 0.00000051)

3: train: Counter({0: 4894, 1: 2882, 2: 1242}), val: Counter({0: 4897, 1: 2932, 2: 1282})
best UAC in cluster 2: 0.4262341222889965
best UAC in cluster 2: 0.4263461849189761
best UAC in cluster 2: 0.4265937651480009
best UAC in cluster 2: 0.4266693422705453
best UAC in cluster 2: 0.42674752550076356
best UAC in cluster 2: 0.42684916370004744
best UAC in cluster 2: 0.4270889256060504
best UAC in cluster 2: 0.4271671088362687
best UAC in cluster 2: 0.4272296554204434
best UAC in cluster 2: 0.42729220200461804
best UAC in cluster 2: 0.4274589928957505
best UAC in cluster 2: 0.4275632372027083
best UAC in cluster 2: 0.4277039670171013
best UAC in cluster 2: 0.4279202739540387
best UAC in cluster 2: 0.42801669993797464
best UAC in cluster 2: 0.4281105198142367
best UAC in cluster 2: 0.4282877351360649
best UAC in cluster 2: 0.42889235211642
best UAC in cluster 2: 0.42918944839124973
best UAC in cluster 2: 0.4292389644370547
best UAC in cluster 2: 0.4292806621598378
best UAC in cluster 2: 0.

best UAC in cluster 0: 0.5464701376666762
best UAC in cluster 0: 0.5464890486940746
best UAC in cluster 0: 0.5465862653830702
best UAC in cluster 0: 0.5468095555880546
best UAC in cluster 0: 0.5470577803328677
best UAC in cluster 0: 0.5472940981345042
best UAC in cluster 0: 0.5474826480817494
best UAC in cluster 0: 0.547569778889466
best UAC in cluster 0: 0.5478055363643648
best UAC in cluster 0: 0.5479604667073463
best UAC in cluster 0: 0.5481742313577894
best UAC in cluster 0: 0.5484189540604918
best UAC in cluster 0: 0.5484333824739882
best UAC in cluster 0: 0.5486544313720216
best UAC in cluster 0: 0.5487678975364114
best UAC in cluster 0: 0.5488020774674128
best UAC in cluster 0: 0.5490189239149132
best UAC in cluster 0: 0.5490679525044644
best UAC in cluster 0: 0.5492240035009215
best UAC in cluster 0: 0.5494901587013419
best UAC in cluster 0: 0.5495852741650711
best UAC in cluster 0: 0.5496733855445788
best UAC in cluster 0: 0.5497892330976039
best UAC in cluster 0: 0.5501028759

In [None]:
# Per-cluster weights and [lr, model key, epoch] as returned by train_model.
clust_models, p, _ = a

predictions = np.zeros(len(Y_test))
auc_clusters = {}
f1_clusters = {}
for clust in train_l[0].unique():

    idxs = test_l[test_l[0] == clust].index
    xt = X_test[idxs]          # already a float tensor on `device`
    y1t = Y_1_test[idxs]
    yt = Y_test[idxs]

    # Rebuild the winning architecture of this cluster and restore its weights.
    size = p[clust][1]
    model = {"0": Model, "1": Model1, "2": Model2}.get(size, Model3)().to(device)
    model.load_state_dict(clust_models[clust])
    model.eval()  # a freshly constructed module starts in training mode (dropout on)

    with torch.no_grad():
        preds = model(xt, y1t).cpu().numpy().reshape(-1)
    auc_clusters[clust] = roc_auc_score(yt, preds)

    # Label as positive the k highest scores, k being the number of true positives.
    n_pos = int((yt == 1).sum())
    Yhat = np.zeros(len(yt))
    if n_pos > 0:
        Yhat[preds.argsort()[-n_pos:]] = 1
    f1_clusters[clust] = f1_score(yt, Yhat)

    predictions[idxs] = preds

# Same top-k rule on the pooled predictions of all clusters.
n_pos = int((Y_test == 1).sum())
Yhat = np.zeros(len(Y_test))
if n_pos > 0:
    Yhat[predictions.argsort()[-n_pos:]] = 1
auc = roc_auc_score(Y_test, predictions)
f1 = f1_score(Y_test, Yhat)

# Every result is appended as its own row.
with open ("../Results/Denhaag.csv", "a+") as f:
    f.write("\nCNN_cluster,{},{}".format(auc,f1))

with open ("../Results/DenhaagAUC.csv", "a+") as f:
    f.write("\nCNN_clust,{},{},{}".format(auc_clusters[0],auc_clusters[1],auc_clusters[2]))

with open ("../Results/Denhaagf1.csv", "a+") as f:
    f.write("\nCNN_clust,{},{},{}".format(f1_clusters[0],f1_clusters[1],f1_clusters[2]))

In [None]:
# NOTE(review): train_model returns a 3-tuple, so a[-4] raises IndexError.
# This cell appears to belong to an earlier version that also returned
# per-epoch histories - confirm before re-running.
print(np.max(a[-4]))
plt.plot(a[-4], alpha = 0.5)
plt.hlines(np.mean(a[-4]), 0, n_epochs, color = "r")

In [None]:
# NOTE(review): with the current train_model, a[-2] is the per-cluster
# parameter dict rather than a numeric history, so this cell does not plot a
# curve - confirm which return value was intended.
print(np.max(a[-2]))
plt.plot(a[-2], alpha = 0.5)
plt.hlines(np.mean(a[-2]), 0, n_epochs, color = "r")

In [None]:
a

Best CNN found for Den Haag: AUC 0.636, model: 2, lr = 5.1e-06


In [None]:
def neighbor_part(model, X, idxs, w):
    """Build a matrix holding each sample's score next to its neighbours' scores.

    Parameters
    ----------
    model : callable
        Called as ``model(X)``; must return one score per sample as a tensor.
    X : torch.Tensor
        Model input, one entry per sample.
    idxs : array-like
        Identifier of every sample, in the same order as ``X``; these are the
        keys looked up in ``w.neighbors``.
    w : object with ``neighbors`` and ``max_neighbors``
        ``w.neighbors[i]`` lists the identifiers of the neighbours of ``i``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, w.max_neighbors + 1)
        Column 0 is the sample's own score, followed by the scores of its
        neighbours; unused trailing columns stay 0.
    """
    scores = model(X).detach().cpu().numpy().reshape(-1)
    by_id = pd.Series(scores, index = pd.Index(idxs))

    n_function = np.zeros((len(scores), w.max_neighbors + 1))
    for i, idx in enumerate(idxs):
        # Row i of the model output belongs to idxs[i].
        n_function[i, 0] = scores[i]
        neighbour_scores = by_id.loc[list(w.neighbors[idx])].to_numpy()
        n_function[i, 1:len(neighbour_scores) + 1] = neighbour_scores

    return n_function

In [None]:
class Model(nn.Module):
    """Named two-convolution network with 60 channels per convolution.

    Layout: two conv(3x3) -> max-pool(2) -> ReLU -> channel-dropout(0.25)
    stages (39 -> 60 -> 60 channels) followed by Linear(60, 1). The linear
    layer takes 60 inputs, so the convolution stack has to end at 1x1; that
    holds for 10- to 13-cell-wide windows (e.g. 11x11), not for 21x21 ones.

    NOTE(review): this definition re-uses the name ``Model`` of an earlier
    class whose constructor takes no arguments; once this cell has run,
    ``Model()`` without ``name`` raises a TypeError.

    The original header comment recorded "best auc = 0.59".
    """

    def __init__(self, name):
        super().__init__()
        layers = [
            nn.Conv2d(39, 60, kernel_size = 3),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.25),
            nn.Conv2d(60, 60, kernel_size = 3),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout2d(0.25),
            nn.Flatten(),
            nn.Linear(60, 1),
        ]
        self.net = nn.Sequential(*layers)
        # Not applied in forward(); callers use it to turn logits into probabilities.
        self.final = nn.Sigmoid()
        self.name = name

    def forward(self, x, y1):
        """Return the network logit of every sample multiplied by ``y1``."""
        logits = self.net(x).squeeze()
        return logits * y1

# Neighbor function

In [None]:
import geopandas as gpd
import shapely
import libpysal

def load_data(year):
    """Load every CSV found in ``../Data/filled/<year>/`` into one GeoDataFrame.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition replaces it when the cell runs.

    Parameters
    ----------
    year : int or str
        Name of the sub-folder of ``../Data/filled/`` to read.

    Returns
    -------
    geopandas.GeoDataFrame
        All rows of all files, in ``os.listdir`` order, with a fresh 0..n-1
        index, the ``"Unnamed: 0"`` column removed and the ``geometry`` column
        parsed from WKT into shapely objects.

    Raises
    ------
    FileNotFoundError
        If the folder contains no files.

    Notes
    -----
    The name of every file that has a missing geometry is printed.
    """
    path = "../Data/filled/" + str(year) + "/"

    # Collect the per-file frames and concatenate once instead of growing a
    # frame inside the loop.
    frames = []
    for filename in os.listdir(path):
        tile = pd.read_csv(path + filename)
        if tile.geometry.isna().any():
            print(filename)
        frames.append(tile)

    if not frames:
        raise FileNotFoundError("No CSV files found in {}".format(path))

    # ignore_index replaces the former reset_index() + drop("index") pair.
    df = gpd.GeoDataFrame(pd.concat(frames, ignore_index = True))
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    df = df.drop(["Unnamed: 0"], axis = 1)
    return df

In [None]:
# 2015 / 2016 frames and their distance-band weights.
# NOTE(review): this rebinds `wt` and `wv`, which earlier in the notebook hold
# the 2017 / 2018 weights; cells that ran before and are re-run afterwards will
# silently pick up these objects instead.
df15 = load_data(2015)
df16 = load_data(2016)
wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

In [None]:

# Features are every column except the last four; the target is column "y".
# NOTE(review): the clustering cell earlier in the notebook drops the last
# five columns instead - confirm which cut-off is right for these frames.
scaler = StandardScaler()
X15 = df15[df15.columns[:-4]]
Y15 = df15.y

X16 = df16[df16.columns[:-4]]
Y16 = df16.y

# The scaler is fitted on 2015 only and re-used for 2016.
X15 = pd.DataFrame(scaler.fit_transform(X15))
X16 = pd.DataFrame(scaler.transform(X16))

In [None]:
# Five-cluster K-means fitted on 2015; 2016 rows are assigned to the nearest
# 2015 centroid. This rebinds `cluster_model` from the earlier three-cluster model.
cluster_model = KMeans(n_clusters=5, random_state=0)
labels15 = pd.DataFrame(cluster_model.fit_predict(X15))
labels16 = pd.DataFrame(cluster_model.predict(X16))

# Column "X" keeps the row position of every sample ...
labels15["X"] = X15.index
labels16["X"] = X16.index


# ... and the cluster label (column 0) becomes the index, so that
# labels15.loc[k]["X"] lists the rows belonging to cluster k.
labels15 = labels15.set_index(0)
labels16 = labels16.set_index(0)

In [None]:
# Load the object stored in cbs_id_koppel.pickle; the next cell passes it to
# Series.map as a lookup.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('cbs_id_koppel.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [None]:
# br.C28992R100 = br.C28992R100.map(b) # change C28992code for id
# NOTE(review): labels15 as built in this notebook only has the column "X"
# (plus the cluster label as index), so `labels15.C28` raises AttributeError
# unless the column is added elsewhere.
labels15.C28 = labels15.C28.map(b)

In [None]:
labels15

In [None]:
X_train[labels15.loc[0]["X"]].shape

In [None]:
labels15.loc[0].shape

In [None]:
ID_train

In [None]:

# Inline copy of neighbor_part for the training set: column 0 of n_function is
# the sample's own prediction, the following columns are its neighbours' predictions.
# NOTE(review): `w15` and `preds` are not defined anywhere in the visible
# notebook; `preds` must be a DataFrame indexed by ID (see the commented line below).
# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
neighbors = [w15.neighbors[x] for x in ID_train]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_train)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [None]:
# These two names are not imported anywhere else in the notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search a random forest on the neighbour feature matrix, using 5-fold
# cross-validation scored by balanced accuracy.
params = {"max_depth": [6, 8, 10, 12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)

# Balance the classes with SMOTE before fitting.
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

In [None]:
clf_bagger.best_params_

In [None]:
np.array(preds.detach())

In [None]:
# NOTE(review): the dataset cell earlier in the notebook already turns X_val
# into a torch tensor, in which case torch.from_numpy raises a TypeError here.
# The models defined in this notebook also expect a second argument (y1) in forward().
X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [None]:
# Load the object stored in w16.pickle; later cells read its `neighbors` and
# `max_neighbors` attributes.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('w16.pickle', 'rb') as handle:
    w16 = pickle.load(handle)

In [None]:
# Inline copy of neighbor_part for the validation set. `preds` must still be
# the model's output tensor here; it is rebound to a DataFrame indexed by ID.
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)
neighbors = [w16.neighbors[x] for x in ID_val]
transitions = [preds.loc[x].values for x in neighbors]



# Column 0: the sample's own prediction; columns 1..: its neighbours' predictions.
n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_val)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [None]:
preds = clf_bagger.predict(n_function)

In [None]:
len(neighbors)

In [None]:
clf_bagger.score(n_function, Y_val)

In [None]:
recall_score(Y_val, preds)

In [None]:
precision_score(Y_val, preds)

In [None]:
(Y_val == 1).sum()

# No bagger on the end

In [None]:
# NOTE(review): earlier cells rebind `preds` to a DataFrame and then to the
# output of clf_bagger.predict, neither of which has .detach(); the cell that
# computes preds = model(X_val) has to be re-run right before this one.
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [None]:
preds["y"] = Y_val

In [None]:
preds = preds.sort_values(by=[0], ascending = False)

In [None]:
preds

In [None]:
# Label the 4529 highest-ranked rows as positive and the rest as negative
# (`preds` was sorted by score, descending, in a previous cell).
# NOTE(review): 4529 is presumably (Y_val == 1).sum() - confirm.
# A single .iloc write replaces the chained `preds[0].iloc[...] = ...`, which
# assigns through an intermediate Series and may not update `preds`.
n_positive = 4529
score_col = preds.columns.get_loc(0)
preds.iloc[:n_positive, score_col] = 1
preds.iloc[n_positive:, score_col] = 0

In [None]:
recall_score(preds["y"], preds[0])


In [None]:
precision_score(preds["y"], preds[0])