In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

Device: cpu


In [2]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler 


In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [4]:
# X, Y, ID = get_data("../Data/filled/grids/", [2015,2016,2017,2018,2019])

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [6]:
with open('w15c.pickle', 'rb') as handle:
    wt = pickle.load(handle)
    
with open('w16c.pickle', 'rb') as handle:
    wv = pickle.load(handle)

In [7]:
def create_CNN_samples(grid, block, dims = 39):
    
    nonzero = np.transpose(grid[:,:,-2].nonzero()) # Get indices of nonzero componetns

    size = nonzero.shape[0]
    width = block * 2 + 1 # calculate widht and height. Needed later on
    
    X = np.zeros((size, width, width, dims))
    Y = np.zeros(size)
    ID = np.zeros(size)
    
    for idx, i in enumerate(nonzero):
        x, ID[idx], Y[idx] = get_neighbor_grid(grid, i, block)
        X[idx] = x.reshape(width,width, 39)
        
    X = np.moveaxis(X, -1, 1) # order the indices correctly to make sure it works in CNN
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()
    
    return X,ID,Y

In [8]:
def get_neighbor_grid(full, hw, block = 1):
    
    # get the nonzero (built) blocks by checking if they have a ID

    h = hw[0]
    w = hw[1]
    
    y = full[h,w,-1]
    ID = full[h,w,-2]
    
    hu = h - block
    hd = h + block
    hshort, hextra, wshort, wextra = 0,0,0,0
    if hu < 0:
        hshort = 0 - hu
        hu = 0
    if hd >= full.shape[0]:
        hextra = (hd - full.shape[0]) + 1
        hd = full.shape[0]

    wr = w + block
    wl = w - block

    if wr >= full.shape[1]:
        wextra = (wr - full.shape[1]) + 1
        wr = full.shape[1]
    if wl < 0:
        wshort = 0 - wl
        wl = 0

    nb = full[hu : hd + 1, wl : wr + 1, :]
    nb = np.pad(nb, ((hshort, hextra), (wshort, wextra), (0,0)), mode = "constant", constant_values = 0)
    return nb[:,:,:-2], ID, y


In [9]:
X_train = []
Y_train = []
ID_train = []
for filename in os.listdir("../Data/filled/grids/2015/"):
    n = np.load("../Data/filled/grids/2015/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_train.append(X)
    Y_train.append(Y)
    ID_train.append(ID)
    
Y_train = np.concatenate(Y_train)
ID_train = np.concatenate(ID_train)
X_train = np.concatenate(X_train)

X_train = np.moveaxis(X_train, 1, -1)
X_train = X_train.reshape(-1, 39)

X_train = ss.fit_transform(X_train)
X_train = X_train.reshape(-1, 11, 11, 39)

X_train = np.moveaxis(X_train, -1, 1)
X_train = torch.tensor(X_train).float()




X_val = []
Y_val = []
ID_val = []

for filename in os.listdir("../Data/filled/grids/2016/"):
    n = np.load("../Data/filled/grids/2016/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_val.append(X)
    Y_val.append(Y)
    ID_val.append(ID)
    
X_val = np.concatenate(X_val)
X_val = np.moveaxis(X_val, 1, -1)
X_val = X_val.reshape(-1, 39)

X_val = ss.transform(X_val)


X_val = X_val.reshape(-1, 11, 11, 39)
X_val = np.moveaxis(X_val, -1, 1)
Y_val = np.concatenate(Y_val)
ID_val = np.concatenate(ID_val)
X_val = torch.tensor(X_val).float()

In [10]:
def get_batch(X, Y):
    
    y1 = np.argwhere(Y==1)
    percentage1 = len(y1) / len(X)
    while percentage1 < 0.4:
        X = np.append(X, X[y1].squeeze(), axis = 0)
        Y = np.append(Y, Y[y1].squeeze(), axis = 0) 
        percentage1 = (Y==1).sum() / len(X)
    

#     return X, Y


In [11]:
class Model2(nn.Module):
    def __init__(self, name):
        super(Model2, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 64),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [12]:
class Model3(nn.Module):
    def __init__(self, name):
        super(Model3, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [13]:
class Model4(nn.Module):
    def __init__(self, name):
        super(Model4, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(1152, 512),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid())
            
        self.name = name
    def forward(self, x):
        out = self.net(x)
        return out

In [14]:
# model1 = Model1("een", w)
# model2 = Model2("twee")
# model3 = Model3("drie")
# model4 = Model4("vier")
# models = [model1, model2, model3, model4]

In [15]:
ID_train

array([2893058., 2870455., 2868833., ..., 1311910., 1176428., 1314791.])

In [16]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

In [17]:
class Model1(nn.Module):
    def __init__(self, name):
        super(Model1, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 32, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64, 1))
        
        self.final = nn.Sequential(
            nn.Sigmoid())
        
        
        self.name = name

       
    def forward(self, x, w, y, train = True):

        out = self.net(x)
        neighbors = [w.neighbors[x] for x in range(len(out))]
        transitions = [out[x] for x in neighbors]
        n_function = torch.zeros((len(out), w.max_neighbors + 1))
        for i, (t, idx) in enumerate(zip(transitions, range(len(out)))):
            n_function[i, 1:len(t) + 1] = t.squeeze()
            n_function[i, 0] = out[idx]      
        if train:
#             indices = np.arange(len(n_function)).reshape(-1,1)
#             under, y  = RandomUnderSampler().fit_resample(indices, y)
#             n_function = n_function[under].squeeze()
#             print(n_function)

            return n_function.mean(axis = 1), y.reshape(-1,1)
        
        
        else:
            return self.final(n_function.sum(axis =1 )), y
            
    
    def get_no_activation(self, x):
        out = self.net(x)
        return out

In [18]:
model = Model1("een")

In [19]:
def neighbor_part(model, X, idxs, w):
    preds = model(X)
    preds = pd.DataFrame(np.array(preds.detach())).set_index(idxs)
    
    neighbors = [w.neighbors[x] for x in idxs]
    transitions = [preds.loc[x].values for x in neighbors]



    n_function = np.zeros((len(preds), w.max_neighbors + 1))
    for i, (t, idx) in enumerate(zip(transitions, idxs)):
        n_function[i, 1:len(t) + 1] = t.squeeze()
        n_function[i, 0] = preds.loc[idx]
        
    return n_function

In [22]:
import time


# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, ID_train, wt, X_val, Y_val, ID_val, wv, num_epochs, batch_per_e = 1):
    optimizer = optim.RMSprop(model.parameters(), lr=0.01) 
    BCEloss = nn.BCEWithLogitsLoss(pos_weight = torch.tensor(len(Y_train) / (Y_train == 1).sum()))
    train_loss = []
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []
    cmc_best = 0
    
    

    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        
            
        model.train()

        optimizer.zero_grad()
        t0 = time.time()

        out, y_ = model(X_train, wt, Y_train)

        
        loss = BCEloss(out, torch.tensor(y_).float().squeeze())
        print(loss)
        loss.backward()
        optimizer.step()
        train_loss_history.append(loss)
                
        model.eval()
        
        out, y = model(X_val, wv, Y_val, False)

        out = out.detach().numpy()
        print(out)
        out =  (out > 0.5).astype(int)
        print(out)
        acc = accuracy_score(y, out)
        ROC = roc_auc_score(y, out)
        f1 = f1_score(y, out)

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)
        
        
        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(loss, acc, ROC , f1))
        train_loss = []


    return acc_history, ROC_history, train_loss_history, f1_score_history


        

In [None]:
out1 = train_model(model, X_train, Y_train, ID_train, wt, X_val, Y_val, ID_val, wv, 100)


epoch: 0 of 100
tensor(1.2605, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
[0. 0. 0. ... 0. 0. 0.]
[0 0 0 ... 0 0 0]
training_loss: 1.2605, acc: 0.921, ROC: 0.500, f1: 0.000
epoch: 1 of 100
tensor(612.3113, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
[0.9999999 1.        1.        ... 1.        1.        1.       ]
[1 1 1 ... 1 1 1]
training_loss: 612.3113, acc: 0.079, ROC: 0.500, f1: 0.147
epoch: 2 of 100
tensor(20.6967, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


In [None]:
out.shape

In [None]:
out1

In [None]:
l = nn.BCELoss()

In [None]:
torch.tensor(Y_train)

In [None]:
l(out, torch.tensor(Y_train))

In [None]:
out1.shape

In [None]:

hists = train_model(models[0], X_train, Y_train, ID_train, wt, X_val, Y_val, ID_val, 1)
for mod in models:
    hists = train_model(mod, X_train, Y_train, ID_train, wt, X_val, Y_val, ID_val, 1)
    n_function = neighbor_part(mod, X_train, ID_train, wt)
        
    oversample = SMOTE()
    x, y = oversample.fit_resample(n_function, Y_train)

    clf_bagger = RandomForestClassifier(max_depth = 12, oob_score = True)
    clf_bagger.fit(x,y)
    
    n_function = neighbor_part(mod, X_val, ID_val, wv)
    preds = clf_bagger.predict(n_function)
    totacc = accuracy_score(Y_val, preds)
    totf1 = f1_score(Y_val, preds)
    totROC = roc_auc_score(Y_val, preds)
    
    
#     with open("../results/CNN/" + mod.name + ".csv", "a+") as f:
#         f.write("loss;acc;ROC;f1_score\n")
#         f.write(str(hists[2]) + ";" + str(hists[0]) + ";" + str(hists[1]) + ";" + str(hists[3]) + "\n")
#         f.write("--;" + str(totacc) + ";" + str(totROC) + ";" + str(totf1))
    
    
    