In [1]:
import torch
# Use the first CUDA device when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

Device: cpu


In [9]:
import numpy as np
import os
from get_data import get_data
from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.tensorflow import balanced_batch_generator
from numpy.random import default_rng
rng = default_rng(seed = 31)
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
import geopandas as gpd
import shapely
from sklearn.metrics import recall_score, precision_score, confusion_matrix, accuracy_score

In [3]:
from torch import nn, optim
import scipy.ndimage as ndimage

In [4]:
# X, Y, ID = get_data("../Data/filled/grids/", [2015,2016,2017,2018,2019])

In [75]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [4]:
def load_data(year):
    """Load every CSV file of one year into a single GeoDataFrame.

    Reads all files in ``../Data/filled/<year>/`` in sorted filename order (so
    the resulting row order is reproducible across machines), prints the name
    of every file that contains a missing geometry, concatenates the frames
    once, and parses the WKT strings of the ``geometry`` column into shapely
    geometries.

    Parameters
    ----------
    year : int or str
        Name of the sub-directory of ``../Data/filled/`` to read.

    Returns
    -------
    geopandas.GeoDataFrame
        All rows of all files, with a fresh 0..n-1 index and without the
        ``Unnamed: 0`` column.

    Raises
    ------
    ValueError
        If the directory for ``year`` contains no files.
    KeyError
        If the concatenated data has no ``Unnamed: 0`` column to drop
        (same as the original behaviour).
    """
    # Import the submodule explicitly: a bare `import shapely` is not guaranteed
    # to expose `shapely.wkt` on every shapely version.
    from shapely import wkt

    path = "../Data/filled/" + str(year) + "/"

    frames = []
    for filename in sorted(os.listdir(path)):
        frame = pd.read_csv(os.path.join(path, filename))
        if frame.geometry.isna().any():
            # Report files with missing geometries; parsing below will fail on them.
            print(filename)
        frames.append(frame)

    if not frames:
        raise ValueError("no files found in " + path)

    # Concatenate once instead of growing a DataFrame inside the loop (quadratic).
    df = pd.concat(frames, ignore_index = True)
    df = df.drop(columns = ["Unnamed: 0"])

    # Parse WKT before building the GeoDataFrame so the geometry column already
    # holds shapely objects when geopandas takes ownership of it.
    df["geometry"] = df["geometry"].apply(wkt.loads)
    return gpd.GeoDataFrame(df, geometry = "geometry")

In [10]:
# One GeoDataFrame per year, loaded in chronological order.
df15, df16, df17, df18, df19 = [
    load_data(year) for year in (2015, 2016, 2017, 2018, 2019)
]

In [76]:
# Training year: 2015. The last four columns are excluded from the features
# (presumably the target `y`, the geometry and other non-feature columns — TODO confirm).
X_train = df15[df15.columns[:-4]]
Y_train = df15.y
# The scaler is fitted on the training year only and reused for later years.
X_train = ss.fit_transform(X_train)

# Seed both resamplers so the balanced training set is reproducible on a
# fresh kernel; unseeded, every run yields a different x_train / y_train.
RESAMPLE_SEED = 31

# Step "o": SMOTE synthesises minority samples up to a 0.3 minority/majority ratio.
# Step "u": random undersampling then shrinks the majority class.
# (Alternatives tried earlier: ADASYN() for oversampling, SMOTEENN() as a single resampler.)
oversample = SMOTE(sampling_strategy=0.3, random_state=RESAMPLE_SEED)
undersample = RandomUnderSampler(random_state=RESAMPLE_SEED)
steps = [("o", oversample), ("u", undersample)]
pipeline = Pipeline(steps = steps)
# X_train / Y_train keep the original (scaled, unbalanced) rows;
# x_train / y_train are the resampled versions used for network training.
x_train,y_train = pipeline.fit_resample(X_train, Y_train)

In [77]:
# Validation year: 2016. The feature columns are taken from df15 so that the
# validation matrix has exactly the columns the scaler was fitted on.
Y_val = df16.y
X_val = ss.transform(df16[df15.columns[:-4]])

In [13]:
(y_train == 1).sum()

15341

In [271]:
def get_batch(X, Y, batch_size = 32):
    """Draw a random mini-batch from X and Y.

    Row positions are drawn uniformly from ``[0, X.shape[0])`` with the
    module-level generator ``rng``; the same position may be drawn more than
    once within a batch.

    Parameters
    ----------
    X : indexable with an integer array (e.g. numpy.ndarray)
        Feature matrix; must expose ``.shape``.
    Y : pandas object supporting ``.iloc`` and ``.values``
        Targets, positionally aligned with ``X``.
    batch_size : int
        Number of rows to draw.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        float32 feature and target tensors for the drawn rows.
    """
    rows = rng.integers(X.shape[0], size = batch_size)

    features = torch.tensor(X[rows]).to(torch.float32)
    targets = torch.from_numpy(Y.iloc[rows].values).to(torch.float32)
    return features, targets


In [202]:
class Model(nn.Module):
    """Fully connected binary classifier: 39 -> 128 -> 256 -> 128 -> 64 -> 32 -> 1.

    Hidden layers use ReLU; Dropout(0.2) is applied to the outputs of the
    256->128 and 64->32 layers (before their ReLU). The final layer is followed
    by a sigmoid, so the output is a value in [0, 1] per input row.
    """

    def __init__(self, name):
        """Build the network and store ``name`` as an identifier for this model."""
        super().__init__()
        layers = [
            nn.Linear(39, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.ReLU(),
            nn.Linear(256, 128), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)
        self.name = name

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        return self.net(x)

In [311]:
class Model1(nn.Module):
    """Fully connected binary classifier: 39 -> 128 -> 256 -> 512 -> 128 -> 64 -> 32 -> 1.

    Hidden layers use ReLU; Dropout(0.2) is applied to the outputs of the
    512->128 and 64->32 layers (before their ReLU). The final layer is followed
    by a sigmoid, so the output is a value in [0, 1] per input row.
    """

    def __init__(self, name):
        """Build the network and store ``name`` as an identifier for this model."""
        super().__init__()
        layers = [
            nn.Linear(39, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, 128), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)
        self.name = name

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        return self.net(x)

In [317]:
class Model3(nn.Module):
    """Fully connected binary classifier: 39 -> 64 -> 128 -> 64 -> 32 -> 1.

    Hidden layers use ReLU; Dropout(0.2) is applied to the output of the
    64->32 layer (before its ReLU). The final layer is followed by a sigmoid,
    so the output is a value in [0, 1] per input row.
    """

    def __init__(self, name):
        """Build the network and store ``name`` as an identifier for this model."""
        super(Model3, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(39, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(32,1),
            nn.Sigmoid())

        self.name = name

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        out = self.net(x)
        # The original forward computed `out` but never returned it, so the
        # model produced None and any loss computation on it failed.
        return out

In [318]:
class Model4(nn.Module):
    """Fully connected binary classifier: 39 -> 256 -> 512 -> 1024 -> 128 -> 32 -> 1.

    Hidden layers use ReLU; Dropout(0.2) is applied to the outputs of the
    512->1024 and 128->32 layers (before their ReLU). The final layer is
    followed by a sigmoid, so the output is a value in [0, 1] per input row.
    """

    def __init__(self, name):
        """Build the network and store ``name`` as an identifier for this model."""
        super().__init__()
        layers = [
            nn.Linear(39, 256), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, 1024), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(1024, 128), nn.ReLU(),
            nn.Linear(128, 32), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)
        self.name = name

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        return self.net(x)

In [322]:
class Model5(nn.Module):
    """Fully connected binary classifier: 39 -> 64 -> 128 -> 256 -> 512 -> 128 -> 64 -> 32 -> 1.

    Hidden layers use ReLU. Dropout is applied (before the ReLU) to the outputs
    of the 128->256 layer (p=0.3) and of the 512->128 and 64->32 layers (p=0.2).
    The final layer is followed by a sigmoid, so the output is a value in
    [0, 1] per input row.
    """

    def __init__(self, name):
        """Build the network and store ``name`` as an identifier for this model."""
        super().__init__()
        layers = [
            nn.Linear(39, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.Dropout(0.3), nn.ReLU(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, 128), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.Dropout(0.2), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)
        self.name = name

    def forward(self, x):
        """Return the sigmoid output of the network for input ``x``."""
        return self.net(x)

In [323]:
# One instance of each architecture, constructed in this fixed order.
# Each model carries a Dutch number word as its `name`.
models = [
    Model("een"),
    Model1("twee"),
    Model3("drie"),
    Model4("vier"),
    Model5("vijf"),
]
model, model1, model3, model4, model5 = models

In [324]:
# import warnings
# warnings.filterwarnings("ignore")
# # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
# BCEloss = nn.BCELoss()
# model.train()

def train_model(model, X_train, Y_train, X_val, Y_val, num_epochs, batch_per_e = 500,
                batch_size = 100, val_size = 10000, lr = 0.0001):
    """Train ``model`` with RMSprop / binary cross-entropy and track validation metrics.

    Every epoch runs ``batch_per_e`` optimisation steps on random mini-batches
    obtained from ``get_batch``, then scores the model on one random
    validation batch of ``val_size`` rows (also from ``get_batch``). Because
    the validation rows are re-drawn each epoch, the reported metrics
    fluctuate from epoch to epoch even for an unchanged model.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing one probability per input row.
    X_train, Y_train : training features / targets, as accepted by ``get_batch``.
    X_val, Y_val : validation features / targets, as accepted by ``get_batch``.
    num_epochs : int
        Number of epochs to run.
    batch_per_e : int
        Optimisation steps per epoch.
    batch_size : int
        Rows per training batch (previously hard-coded to 100).
    val_size : int
        Rows per validation batch (previously hard-coded to 10000).
    lr : float
        RMSprop learning rate (previously hard-coded to 0.0001).

    Returns
    -------
    (acc_history, rec_history, prec_history, train_loss_history)
        Four lists with one float per epoch. The training loss of an epoch is
        the mean of the losses recorded on every 10th batch.
    """
    optimizer = optim.RMSprop(model.parameters(), lr=lr)
    BCEloss = nn.BCELoss()
    train_loss_history = []
    acc_history = []
    rec_history = []
    prec_history = []

    for epoch in range(num_epochs):
        print("epoch: {} of {}".format(epoch, num_epochs))

        logged_losses = []
        model.train()
        for batch in range(batch_per_e):
            x_train_, y_train_ = get_batch(X_train, Y_train, batch_size)

            optimizer.zero_grad()
            out = model(x_train_)
            # reshape(-1) instead of squeeze() so a batch of one row keeps shape (1,).
            loss = BCEloss(out.reshape(-1), y_train_)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                # Store a plain float: keeping the tensor would keep its autograd graph alive.
                logged_losses.append(loss.item())

        model.eval()

        x_val_, y_val_ = get_batch(X_val, Y_val, batch_size = val_size)
        with torch.no_grad():
            probabilities = model(x_val_).reshape(-1).numpy()
        # 1-D integer arrays on both sides avoid sklearn's column-vector handling.
        predictions = (probabilities > 0.5).astype(int)
        targets = y_val_.numpy().astype(int)

        acc = accuracy_score(targets, predictions)
        rec = recall_score(targets, predictions)
        prec = precision_score(targets, predictions)
        # Mean over the losses actually recorded (correct for any batch_per_e).
        train_loss = float(np.mean(logged_losses)) if logged_losses else float("nan")

        acc_history.append(acc)
        rec_history.append(rec)
        prec_history.append(prec)
        train_loss_history.append(train_loss)

        print("training_loss: {:.4f}, acc: {:.3f}, recall: {:.3f}, precision: {:.3f}".format(train_loss, acc, rec, prec))

    return acc_history, rec_history, prec_history, train_loss_history

In [313]:
train_model(model1,x_train, y_train, X_val, Y_val, 1500)

epoch: 0 of 1500
training_loss: 0.6331, acc: 0.719, recall: 0.537, precision: 0.144
epoch: 1 of 1500
training_loss: 0.5981, acc: 0.672, recall: 0.604, precision: 0.145
epoch: 2 of 1500
training_loss: 0.5644, acc: 0.706, recall: 0.569, precision: 0.139
epoch: 3 of 1500


KeyboardInterrupt: 

# Neighbor function

In [247]:
def neighbor_part(model, X, idxs, w):
    """Build per-row features from the model's own and its neighbours' predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning one prediction per row of ``X``.
    X : numpy.ndarray, torch.Tensor or array-like
        Feature matrix; converted to a float32 tensor before the forward pass.
    idxs : iterable of hashable
        Identifier of every row of ``X``, in row order; these are the keys
        used in ``w.neighbors``.
    w : object with ``neighbors`` (mapping id -> sequence of neighbour ids)
        and ``max_neighbors`` (int) attributes.

    Returns
    -------
    numpy.ndarray of shape (n_rows, w.max_neighbors + 1)
        Column 0 holds the row's own prediction; columns 1.. hold the
        predictions of its neighbours in the order given by ``w.neighbors``;
        unused trailing columns stay 0.

    Raises
    ------
    ValueError
        If ``idxs`` does not have one entry per prediction.
    KeyError
        If a neighbour id is not present in ``idxs``.
    """
    # Accept NumPy input as well as tensors: a raw ndarray cannot be fed to nn.Linear.
    if torch.is_tensor(X):
        inputs = X.float()
    else:
        inputs = torch.as_tensor(np.asarray(X), dtype = torch.float32)

    # Predict in eval mode (dropout off) without autograd, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            preds = model(inputs).reshape(-1).cpu().numpy()
    finally:
        model.train(was_training)

    idxs = list(idxs)
    if len(idxs) != len(preds):
        raise ValueError("idxs must contain one identifier per row of X")

    # Map each identifier to its row position once, instead of a pandas .loc lookup per row.
    position = {idx: row for row, idx in enumerate(idxs)}

    n_function = np.zeros((len(preds), w.max_neighbors + 1))
    n_function[:, 0] = preds
    for row, idx in enumerate(idxs):
        neighbor_rows = [position[n] for n in w.neighbors[idx]]
        n_function[row, 1:len(neighbor_rows) + 1] = preds[neighbor_rows]

    return n_function

In [None]:
# libpysal is used here but is not imported in any visible earlier cell.
import libpysal

# Binary distance-band spatial weights per year (threshold 150, presumably in the
# units of the frames' coordinate reference system — TODO confirm).
w15 = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
w16 = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)

# The networks need float32 tensors; X_train / X_val come out of the scaler as NumPy arrays.
X_train_t = torch.as_tensor(X_train, dtype = torch.float32)
X_val_t = torch.as_tensor(X_val, dtype = torch.float32)

os.makedirs("results/mlp", exist_ok = True)

for mod in models:
    # Stage 1: train the network on the resampled training data.
    hists = train_model(mod, x_train, y_train, X_val, Y_val, 1000)
    acc_hist, rec_hist, prec_hist, loss_hist = hists

    # Stage 2: fit a random forest on own + neighbour predictions of the training year.
    n_function = neighbor_part(mod, X_train_t, df15.index, w15)

    oversample = SMOTE()
    x, y = oversample.fit_resample(n_function, Y_train)

    clf_bagger = RandomForestClassifier(max_depth = 12, oob_score = True)
    clf_bagger.fit(x,y)

    # Stage 3: evaluate the two-stage model on the validation year.
    n_function = neighbor_part(mod, X_val_t, df16.index, w16)
    preds = clf_bagger.predict(n_function)
    totacc = accuracy_score(Y_val, preds)
    totprec = precision_score(Y_val, preds)
    totrec = recall_score(Y_val, preds)

    # Write to the file of the model being processed (`mod`), not the global `model`,
    # and keep every row in the column order announced by the header.
    with open("results/mlp/" + mod.name + ".csv", "a+") as f:
        f.write("loss;acc;recall;precision\n")
        f.write(str(loss_hist) + ";" + str(acc_hist) + ";" + str(rec_hist) + ";" + str(prec_hist) + "\n")
        f.write("--;" + str(totacc) + ";" + str(totrec) + ";" + str(totprec) + "\n")
    
    
    

epoch: 0 of 1000
training_loss: 0.5947, acc: 0.609, recall: 0.608, precision: 0.121
epoch: 1 of 1000
training_loss: 0.5679, acc: 0.702, recall: 0.538, precision: 0.143
epoch: 2 of 1000
training_loss: 0.5505, acc: 0.717, recall: 0.496, precision: 0.148
epoch: 3 of 1000
training_loss: 0.5373, acc: 0.674, recall: 0.567, precision: 0.125
epoch: 4 of 1000
training_loss: 0.5227, acc: 0.740, recall: 0.520, precision: 0.157
epoch: 5 of 1000
training_loss: 0.5187, acc: 0.707, recall: 0.549, precision: 0.144
epoch: 6 of 1000
training_loss: 0.5057, acc: 0.709, recall: 0.537, precision: 0.132
epoch: 7 of 1000
training_loss: 0.4910, acc: 0.737, recall: 0.470, precision: 0.142
epoch: 8 of 1000
training_loss: 0.4626, acc: 0.756, recall: 0.430, precision: 0.147
epoch: 9 of 1000
training_loss: 0.4783, acc: 0.772, recall: 0.432, precision: 0.156
epoch: 10 of 1000
training_loss: 0.4588, acc: 0.769, recall: 0.417, precision: 0.154
epoch: 11 of 1000
training_loss: 0.4703, acc: 0.661, recall: 0.541, precisi

training_loss: 0.1637, acc: 0.701, recall: 0.457, precision: 0.119
epoch: 98 of 1000
training_loss: 0.1659, acc: 0.770, recall: 0.375, precision: 0.145
epoch: 99 of 1000
training_loss: 0.1535, acc: 0.664, recall: 0.532, precision: 0.123
epoch: 100 of 1000
training_loss: 0.1537, acc: 0.670, recall: 0.514, precision: 0.129
epoch: 101 of 1000
training_loss: 0.1606, acc: 0.750, recall: 0.402, precision: 0.137
epoch: 102 of 1000
training_loss: 0.1363, acc: 0.796, recall: 0.343, precision: 0.147
epoch: 103 of 1000
training_loss: 0.1554, acc: 0.754, recall: 0.420, precision: 0.143
epoch: 104 of 1000
training_loss: 0.1447, acc: 0.716, recall: 0.450, precision: 0.129
epoch: 105 of 1000
training_loss: 0.1765, acc: 0.739, recall: 0.401, precision: 0.127
epoch: 106 of 1000
training_loss: 0.1507, acc: 0.733, recall: 0.410, precision: 0.129
epoch: 107 of 1000
training_loss: 0.1449, acc: 0.751, recall: 0.385, precision: 0.135
epoch: 108 of 1000
training_loss: 0.1379, acc: 0.765, recall: 0.369, precis

In [246]:
neighbor_part(model, X_train, df15.index, w15)

In [282]:
model.name

'een'

In [281]:
hists

([0.7477, 0.7461],
 [0.4551122194513716, 0.45591939546599497],
 [0.1489188086495308, 0.14661806399351965],
 [0.23744001984596252, 0.2417915314435959])

In [257]:
np.sum(preds == 1)

15948

In [165]:
# X_train = torch.from_numpy(X_train).float()
preds = model(X_train)

In [166]:
w15 = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)


In [167]:
import pandas as pd

preds = pd.DataFrame(np.array(preds.detach())).set_index(df15.index)

In [168]:

# preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_train)
# Inline version of the neighbour-feature construction for the 2015 frame
# (same steps as `neighbor_part`). Expects `preds` to be a DataFrame of model
# outputs indexed by df15.index.
# For every row: the list of neighbour ids, then the predictions of those neighbours.
neighbors = [w15.neighbors[x] for x in df15.index]
transitions = [preds.loc[x].values for x in neighbors]



# Column 0: the row's own prediction; columns 1..: neighbour predictions, zero-padded
# up to the largest neighbour count in w15.
n_function = np.zeros((len(preds), w15.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, df15.index)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [169]:
# NOTE(review): the GridSearchCV object built here is never fitted — `clf_bagger` is
# rebound to a plain RandomForestClassifier two statements later. The captured output
# below (grid over max_depth 6/8/10/12) and the later `clf_bagger.best_params_` cell
# appear to come from an earlier version of this cell; on a fresh run that attribute
# does not exist on a RandomForestClassifier. Confirm which variant is intended.
params = {"max_depth": [12], "oob_score" : [True, False]}
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
clf_bagger = RandomForestClassifier(max_depth = 12, oob_score = True)
# Balance the neighbour features with SMOTE before fitting the forest.
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y_train)

clf_bagger.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=6, oob_score=True;, score=0.855 total time=   6.5s
[CV 2/5] END .......max_depth=6, oob_score=True;, score=0.865 total time=   6.4s
[CV 3/5] END .......max_depth=6, oob_score=True;, score=0.863 total time=   6.5s
[CV 4/5] END .......max_depth=6, oob_score=True;, score=0.829 total time=   6.2s
[CV 5/5] END .......max_depth=6, oob_score=True;, score=0.833 total time=   6.2s
[CV 1/5] END ......max_depth=6, oob_score=False;, score=0.855 total time=   5.7s
[CV 2/5] END ......max_depth=6, oob_score=False;, score=0.865 total time=   5.9s
[CV 3/5] END ......max_depth=6, oob_score=False;, score=0.862 total time=   6.7s
[CV 4/5] END ......max_depth=6, oob_score=False;, score=0.829 total time=   6.1s
[CV 5/5] END ......max_depth=6, oob_score=False;, score=0.833 total time=   6.0s
[CV 1/5] END .......max_depth=8, oob_score=True;, score=0.853 total time=   9.3s
[CV 2/5] END .......max_depth=8, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [170]:
clf_bagger.best_params_

{'max_depth': 12, 'oob_score': False}

In [171]:
np.array(preds.detach())

AttributeError: 'DataFrame' object has no attribute 'detach'

In [173]:
# X_val = torch.from_numpy(X_val).float()
preds = model(X_val)

In [174]:
w16 = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)


In [175]:
# Inline version of the neighbour-feature construction for the 2016 frame
# (same steps as `neighbor_part`). Expects `preds` to be the model's output tensor.
# Index the predictions by df16.index so they can be looked up by neighbour id.
preds = pd.DataFrame(np.array(preds.detach())).set_index(df16.index)
neighbors = [w16.neighbors[x] for x in df16.index]
transitions = [preds.loc[x].values for x in neighbors]



# Column 0: the row's own prediction; columns 1..: neighbour predictions, zero-padded
# up to the largest neighbour count in w16.
n_function = np.zeros((len(preds), w16.max_neighbors + 1))
for i, (t, idx) in enumerate(zip(transitions, df16.index)):
    n_function[i, 1:len(t) + 1] = t.squeeze()
    n_function[i, 0] = preds.loc[idx]

In [176]:
preds = clf_bagger.predict(n_function)

In [177]:
len(neighbors)

57050

In [178]:
clf_bagger.score(n_function, Y_val)

0.5819144337554594

In [179]:
recall_score(Y_val, preds)

0.3210421726650475

In [180]:
precision_score(Y_val, preds)

0.14972711358253527

In [246]:
(Y_val == 1).sum()

4529

# Evaluation without the bagging classifier at the end

In [277]:
preds = pd.DataFrame(np.array(preds.detach())).set_index(ID_val)

In [279]:
preds["y"] = Y_val

In [280]:
preds = preds.sort_values(by=[0], ascending = False)

In [281]:
preds

Unnamed: 0,0,y
3699708.0,9.999919e-01,0.0
2206980.0,9.999806e-01,0.0
2206983.0,9.999349e-01,0.0
2206982.0,9.999343e-01,0.0
2210563.0,9.999300e-01,0.0
...,...,...
2654130.0,3.897061e-21,0.0
2113362.0,1.054448e-21,0.0
3705586.0,2.995088e-22,0.0
3703981.0,6.478963e-23,0.0


In [282]:
# `preds` is sorted by score (column 0) in descending order. Label the k highest-scored
# rows positive and the rest negative, with k = number of positives in Y_val
# (4529 for this data; computed instead of hard-coded).
n_positive = int((Y_val == 1).sum())
# Assign through a single .iloc call on the frame: chained `preds[0].iloc[...] = ...`
# writes to an intermediate object and is not guaranteed to update `preds`.
score_col = preds.columns.get_loc(0)
preds.iloc[:n_positive, score_col] = 1
preds.iloc[n_positive:, score_col] = 0

In [283]:
recall_score(preds["y"], preds[0])


0.17244424817840584

In [284]:
precision_score(preds["y"], preds[0])

0.17244424817840584