In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

Device: cpu


In [12]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
import torch
from sklearn.decomposition import PCA

In [13]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [14]:
# X, Y, ID = get_data("../Data/filled/grids/", [2015,2016,2017,2018,2019])

In [15]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [111]:
def create_CNN_samples(grid, block, dims = 39):
    
    nonzero = np.transpose(grid[:,:,-2].nonzero()) # Get indices of nonzero componetns

    size = nonzero.shape[0]
    width = block * 2 + 1 # calculate widht and height. Needed later on
    
    X = np.zeros((size, width, width, dims))
    Y = np.zeros(size)
    ID = np.zeros(size)
    
    for idx, i in enumerate(nonzero):
        x, ID[idx], Y[idx] = get_neighbor_grid(grid, i, block)
        X[idx] = x.reshape(width,width, 39)
        
#     X = np.moveaxis(X, -1, 1) # order the indices correctly to make sure it works in CNN
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()
#     print(X.shape)
    
    return X,ID,Y

In [112]:
def get_neighbor_grid(full, hw, block = 1):
    
    # get the nonzero (built) blocks by checking if they have a ID

    h = hw[0]
    w = hw[1]
    
    y = full[h,w,-1]
    ID = full[h,w,-2]
    
    hu = h - block
    hd = h + block
    hshort, hextra, wshort, wextra = 0,0,0,0
    if hu < 0:
        hshort = 0 - hu
        hu = 0
    if hd >= full.shape[0]:
        hextra = (hd - full.shape[0]) + 1
        hd = full.shape[0]

    wr = w + block
    wl = w - block

    if wr >= full.shape[1]:
        wextra = (wr - full.shape[1]) + 1
        wr = full.shape[1]
    if wl < 0:
        wshort = 0 - wl
        wl = 0

    nb = full[hu : hd + 1, wl : wr + 1, :]
    nb = np.pad(nb, ((hshort, hextra), (wshort, wextra), (0,0)), mode = "constant", constant_values = 0)
    return nb[:,:,:-2], ID, y


In [126]:
X_train = []
Y_train = []
ID_train = []
pca = PCA(n_components=15)

for filename in os.listdir("../Data/filled/grids/2015/"):
    n = np.load("../Data/filled/grids/2015/" + filename)
    create_CNN_samples(n, 5)
    X, ID, Y = create_CNN_samples(n, 5)
    X_train.append(X)
    Y_train.append(Y)
    ID_train.append(ID)
    
    
X_train = np.concatenate(X_train) # join the arrays 

X_train = X_train.reshape(-1,11,11,39) # reshape to grid
X_train = X_train.reshape(-1,39) # reshape to seperate points


X_train = pca.fit_transform(X_train) # apply pca
X_train = ss.fit_transform(X_train) # normalize
X_train = X_train.reshape(-1,11,11,15) # reshape to grid

Y_train = np.concatenate(Y_train)
ID_train = np.concatenate(ID_train)

oversampler = SMOTE()

X_train = X_train.reshape(X_train.shape[0], -1) #flatten the array 
X_train_resamp, Y_train_resamp = oversampler.fit_resample(X_train, Y_train)
X_train = X_train.reshape(-1, 11, 11, 15)
X_train_resamp = X_train_resamp.reshape(-1, 11, 11, 15)

X_train = np.moveaxis(X_train,-1,1) # change axis orders for CNN
X_train_resamp = np.moveaxis(X_train_resamp,-1,1)


X_val = []
Y_val = []
ID_val = []

for filename in os.listdir("../Data/filled/grids/2016/"):
    n = np.load("../Data/filled/grids/2016/" + filename)
    X, ID, Y = create_CNN_samples(n, 5)
    X_val.append(X)
    Y_val.append(Y)
    ID_val.append(ID)
    
X_val = np.concatenate(X_val)
X_val = X_val.reshape(-1,11,11,39) # reshape to grid
X_val = X_val.reshape(-1,39) # reshape to seperate points
X_val = pca.transform(X_val) # apply pca
X_val = ss.transform(X_val) # normalize

X_val = X_val.reshape(-1, 11, 11, 15)
X_val = np.moveaxis(X_val,-1,1) # change axis orders for CNN

Y_val = np.concatenate(Y_val)
ID_val = np.concatenate(ID_val)


In [127]:
def get_batch(X, Y, batch_size = 32):
    
    idxs = rng.integers(X.shape[0], size = batch_size)
    
    X = torch.from_numpy(X[idxs]).float()
    Y = torch.from_numpy(Y[idxs]).float()
    

    return X, Y


In [128]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 39, out_channels = 128, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.Conv2d(in_channels = 128, out_channels = 64, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(576, 64),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())
            

    def forward(self, x):
        out = self.net(x)
        return out

In [168]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels= 15, out_channels = 10, kernel_size = (3,3)), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(in_channels = 10, out_channels = 26, kernel_size = (3,3)), 
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(26, 8),
            nn.ReLU())
        
        self.final = nn.Sequential(
            nn.Linear(8,1),
            nn.ReLU(),
            nn.Sigmoid())
            

    def forward(self, x):
        
        
        out = self.net(x)
        out = self.final(out)
        return out
    
    def get_no_activation(self, x):
        out = self.net(x)
        return out

In [169]:
model = Model()

In [170]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

In [188]:

# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, X_val, Y_val, num_epochs, batch_per_e = 500):
    optimizer = optim.Adam(model.parameters(), lr=0.0000001) 
    BCEloss = nn.BCELoss()
    train_loss = []
    cmc_history = []
    cmc_history_train = []
    acc_history = []
    acc_history_train = []
    cmc_best = 0
    
    

    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        for batch in range(batch_per_e):
            
            x_train_, y_train_ = get_batch(X_train, Y_train)
            model.train()

            optimizer.zero_grad()
            out = model(x_train_)

            loss = BCEloss(out.squeeze(), y_train_)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                train_loss.append(loss)
                
        model.eval()
        x_val_, y_val_ = get_batch(X_val, Y_val, batch_size = 1000)
        predictions = model(x_val_)
        accuracy = ((predictions.squeeze() > 0.5) == y_val_).sum().item() / len(y_val_)
        print("val_acc: {}, training loss: {}".format(accuracy, np.sum(train_loss) / (batch_per_e/10)))
        train_loss = []


        

In [189]:
train_model(model,X_train_resamp, Y_train_resamp, X_val, Y_val, 100)

epoch: 0 of 100
val_acc: 0.933, training loss: 0.6931474208831787
epoch: 1 of 100
val_acc: 0.908, training loss: 0.6931474208831787
epoch: 2 of 100
val_acc: 0.926, training loss: 0.6931474208831787
epoch: 3 of 100
val_acc: 0.914, training loss: 0.6931474208831787
epoch: 4 of 100


KeyboardInterrupt: 

In [58]:

model.eval()
x_val_, _, y_val_ = get_batch(l_val, block = 5, batch_size = 100000, train = False)
predictions = model(x_val_)
# accuracy = ((predictions.squeeze() > 0.5) == y_val_).sum().item() / len(y_val_)

NameError: name 'l_val' is not defined

# Neighbor function

In [179]:
X_train = torch.from_numpy(X_train).float()
preds = model(X_train)

In [180]:
with open('w15.pickle', 'rb') as handle:
    w15 = pickle.load(handle)

In [181]:
import pandas as pd

preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)

In [182]:

# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
neighbors = [w15.neighbors[x] for x in ID_train]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_train)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [183]:
params = {"max_depth": [6, 8, 10, 12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=6, oob_score=True;, score=0.569 total time=   2.2s
[CV 2/5] END .......max_depth=6, oob_score=True;, score=0.622 total time=   2.1s
[CV 3/5] END .......max_depth=6, oob_score=True;, score=0.606 total time=   2.0s
[CV 4/5] END .......max_depth=6, oob_score=True;, score=0.552 total time=   1.9s
[CV 5/5] END .......max_depth=6, oob_score=True;, score=0.603 total time=   2.1s
[CV 1/5] END ......max_depth=6, oob_score=False;, score=0.569 total time=   1.6s
[CV 2/5] END ......max_depth=6, oob_score=False;, score=0.622 total time=   1.7s
[CV 3/5] END ......max_depth=6, oob_score=False;, score=0.606 total time=   1.3s
[CV 4/5] END ......max_depth=6, oob_score=False;, score=0.552 total time=   1.5s
[CV 5/5] END ......max_depth=6, oob_score=False;, score=0.603 total time=   1.4s
[CV 1/5] END .......max_depth=8, oob_score=True;, score=0.569 total time=   1.8s
[CV 2/5] END .......max_depth=8, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [184]:
clf_bagger.best_params_

{'max_depth': 6, 'oob_score': True}

In [185]:
np.array(preds.detach())

AttributeError: 'DataFrame' object has no attribute 'detach'

In [None]:
X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [None]:
with open('w16.pickle', 'rb') as handle:
    w16 = pickle.load(handle)

In [None]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)
neighbors = [w16.neighbors[x] for x in ID_val]
transitions = [preds.loc[x].values for x in neighbors]



n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, ID_val)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [None]:
preds = clf_bagger.predict(n_function)

In [94]:
len(neighbors)

57050

In [None]:
clf_bagger.score(n_function, Y_val)

In [None]:
recall_score(Y_val, preds)

In [None]:
precision_score(Y_val, preds)

In [246]:
(Y_val == 1).sum()

4529

# No bagger on the end

In [277]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [279]:
preds["y"] = Y_val

In [280]:
preds = preds.sort_values(by=[0], ascending = False)

In [281]:
preds

Unnamed: 0,0,y
3699708.0,9.999919e-01,0.0
2206980.0,9.999806e-01,0.0
2206983.0,9.999349e-01,0.0
2206982.0,9.999343e-01,0.0
2210563.0,9.999300e-01,0.0
...,...,...
2654130.0,3.897061e-21,0.0
2113362.0,1.054448e-21,0.0
3705586.0,2.995088e-22,0.0
3703981.0,6.478963e-23,0.0


In [282]:
preds[0].iloc[0:4529] = 1
preds[0].iloc[4529:] = 0

In [283]:
recall_score(preds["y"], preds[0])


0.17244424817840584

In [284]:
precision_score(preds["y"], preds[0])

0.17244424817840584