In [13]:
from torch import nn, optim
import os
import shapely
import pandas as pd
import geopandas as gpd
import numpy as np
from numpy.random import default_rng
import torch
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import libpysal

In [3]:
def load_data(year):
    """Load every CSV of one year into a single GeoDataFrame.

    All files found in ``../Data/filled/<year>/`` are read with
    ``pd.read_csv`` and stacked row-wise. The ``geometry`` column is expected
    to hold WKT strings and is parsed into shapely geometries.

    Parameters
    ----------
    year : int or str
        Name of the sub-directory of ``../Data/filled/`` to read.

    Returns
    -------
    geopandas.GeoDataFrame
        The concatenated rows with a fresh 0..n-1 index and without the
        ``"Unnamed: 0"`` column.

    Raises
    ------
    ValueError
        If the year directory contains no files.

    Notes
    -----
    The name of every file that has at least one missing geometry is printed.
    """
    # Make sure the ``shapely.wkt`` submodule is loaded; a bare
    # ``import shapely`` does not guarantee that on every shapely version.
    import shapely.wkt

    path = "../Data/filled/" + str(year) + "/"

    frames = []
    # Sorted so the row order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(path)):
        df1 = pd.read_csv(os.path.join(path, filename))
        if df1.geometry.isna().any():
            print(filename)
        frames.append(df1)

    if not frames:
        raise ValueError("no data files found in " + path)

    # Concatenate once instead of growing a frame inside the loop.
    df = gpd.GeoDataFrame(pd.concat(frames))
    df.geometry = df.geometry.apply(shapely.wkt.loads)

    # The per-file indices repeat after concatenation: replace them with a
    # fresh range index and drop the index column that was saved in the CSVs.
    df = df.reset_index(drop=True)
    df = df.drop(["Unnamed: 0"], axis=1)
    return df

In [4]:
# Load each year 2015-2019 and key every frame by its "C28992R100" column.
df15, df16, df17, df18, df19 = (
    load_data(year).set_index("C28992R100")
    for year in (2015, 2016, 2017, 2018, 2019)
)


In [15]:
# Keep only the index values that occur in every yearly frame.
# Each step filters the newer year's index, so the final order is df19's.
overlapping1517 = df17.index[
    df17.index.isin(df15.index) & df17.index.isin(df16.index)
]
overlapping1518 = df18.index[df18.index.isin(overlapping1517)]
overlapping1519 = df19.index[df19.index.isin(overlapping1518)]

# Allocate X (rows, 5 years, 40 features) and the per-year targets Y / Y_1.
X = np.zeros((len(overlapping1519), 5, 40))
Y = np.zeros(X.shape[:2])
Y_1 = np.zeros_like(Y)
ss = StandardScaler()

# Year i fills slot i: every column except the last three goes into X,
# the "y" column into Y and the "y-1" column into Y_1.
for i, df in enumerate((df15, df16, df17, df18, df19)):
    X[:, i] = df.loc[overlapping1519, df.columns[:-3]]
    Y[:, i] = df.loc[overlapping1519, "y"]
    Y_1[:, i] = df.loc[overlapping1519, "y-1"]


In [22]:
# Create neighbor lists for all years

# Restrict every yearly frame to the shared rows, in the shared order.
df15, df16, df17, df18, df19 = (
    frame.loc[overlapping1519] for frame in (df15, df16, df17, df18, df19)
)

# One binary distance-band weights object per year (threshold 150).
# reset_index() gives each frame a 0..n-1 index before the weights are built.
w15, w16, w17, w18, w19 = (
    libpysal.weights.DistanceBand.from_dataframe(
        frame.reset_index(), threshold=150, binary=True, silence_warnings=True
    )
    for frame in (df15, df16, df17, df18, df19)
)

In [23]:
# create Y(t-1) for all years

def get_y_1(y1, w):
    """Sum the lagged target over each observation's neighbours.

    Parameters
    ----------
    y1 : numpy.ndarray of shape (n,)
        Lagged target values, indexed by the ids used in ``w.neighbors``.
    w : object with a ``neighbors`` mapping
        ``w.neighbors`` maps each observation id to the list of its
        neighbour ids. Its iteration order defines the output order.

    Returns
    -------
    numpy.ndarray of float, shape (len(w.neighbors),)
        For every observation, the sum of ``y1`` over its neighbours. Sums
        equal to zero (including observations without neighbours) are
        replaced by 0.5.
    """
    # Force a float result: with an integer ``y1`` the sums would be integers
    # and writing the 0.5 placeholder below would silently truncate it to 0.
    neighbour_sums = np.array(
        [y1[ids].sum() for ids in w.neighbors.values()], dtype=float
    )
    neighbour_sums[neighbour_sums == 0] = 0.5

    return neighbour_sums

# Neighbour-aggregated Y(t-1) per year: column k of Y_1 is paired with the
# k-th weights object (2015 -> w15, ..., 2019 -> w19).
Y1_15, Y1_16, Y1_17, Y1_18, Y1_19 = (
    get_y_1(Y_1[:, year_pos], weights)
    for year_pos, weights in enumerate((w15, w16, w17, w18, w19))
)



In [73]:
rng = default_rng()
def get_batch(X, Y, Y1, batch_size = 32):
    """Draw a random mini-batch from three row-aligned arrays.

    ``batch_size`` row positions are drawn uniformly from ``range(len(X))``
    (repeats are possible) and the same positions are used for ``X``, ``Y``
    and ``Y1``.

    Returns
    -------
    tuple of three torch.Tensor
        The selected rows of ``X``, ``Y`` and ``Y1``, each cast to float32.
    """
    picks = rng.integers(len(X), size=batch_size)
    x_batch, y_batch, y1_batch = (
        torch.tensor(source[picks]).float() for source in (X, Y, Y1)
    )
    return x_batch, y_batch, y1_batch



In [121]:
# X train is the first three years
# Copy the slice: basic slicing returns a view, so without the copy the
# assignments below would also overwrite column 39 inside X itself.
X_train = X[:, :3].copy()

# fill X_train Y(t-1) with actual values for resampling
X_train[:, 0, 39] = Y1_15
X_train[:, 1, 39] = Y1_16
X_train[:, 2, 39] = Y1_17

# Flatten the three yearly 40-feature vectors into one 120-wide row per sample.
X_train = X_train.reshape(-1, 40 * 3)

# The training target is the label of the third year (index 2).
Y_train = Y[:, 2]


In [122]:
# oversample X_train using SMOTE

smote = SMOTE()
X_train_resample, Y_train_resample = smote.fit_resample(X_train, Y_train)
X_train = X_train.reshape(-1, 3, 40)
X_train_resample = X_train_resample.reshape(-1, 3, 40)

# transform Y(t-1) to proper values
# Round every Y(t-1) entry above 0.5 up to the next integer; entries at or
# below 0.5 are left alone. Mask and position are combined in ONE indexing
# operation so the assignment writes into X_train_resample. The previous
# chained form `arr[mask][:, k, -1] = ...` assigned into a temporary copy
# and therefore changed nothing.
X_train_resample[X_train_resample[:, 0, -1] > 0.5, 0, -1] = np.ceil(X_train_resample[X_train_resample[:, 0, -1] > 0.5, 0, -1])
X_train_resample[X_train_resample[:, 1, -1] > 0.5, 1, -1] = np.ceil(X_train_resample[X_train_resample[:, 1, -1] > 0.5, 1, -1])
X_train_resample[X_train_resample[:, 2, -1] > 0.5, 2, -1] = np.ceil(X_train_resample[X_train_resample[:, 2, -1] > 0.5, 2, -1])

# Keep the per-year Y(t-1) columns of the resampled data for later use.
Y1_15r = X_train_resample[:, 0, -1]
Y1_16r = X_train_resample[:, 1, -1]
Y1_17r = X_train_resample[:, 2, -1]

# Drop the Y(t-1) column (last feature) so 39 features remain per year.
X_train_resample = np.delete(X_train_resample, -1, 2)
X_train = np.delete(X_train, -1, 2)

X_train = X_train.reshape(-1, 39)
X_train_resample = X_train_resample.reshape(-1, 39)

# Fit and transform the standard scaler
# The scaler is fitted on the original (non-resampled) rows only and then
# applied to the resampled rows.
X_train = ss.fit_transform(X_train)
X_train_resample = ss.transform(X_train_resample)

X_train_resample = X_train_resample.reshape(-1, 3, 39)

In [123]:
# create X_val and X_test
# Validation: the first four years as input, the fourth year's label as target.
X_val = X[:, :4]
Y_val = Y[:, 3]
X_val = np.delete(X_val, -1, 2)  # drop the last feature column -> 39 features

# Scale with the already fitted scaler (2-D in, then back to sequences).
X_val = X_val.reshape(-1, 39)
X_val = ss.transform(X_val)
X_val = X_val.reshape(-1, 4, 39)


# Test: all five years as input, the fifth year's label as target.
X_test = X[:, :5]
Y_test = Y[:, 4]
X_test = np.delete(X_test, -1, 2)
X_test = X_test.reshape(-1, 39)

X_test = ss.transform(X_test)
# Five time steps per sample. Reshaping to 4 steps (as before) either fails
# or regroups rows so that samples no longer line up with Y_test.
X_test = X_test.reshape(-1, 5, 39)




In [175]:
class RNN(nn.Module):
    """GRU sequence encoder followed by a two-layer head producing one logit.

    Parameters
    ----------
    input_size : int, default 39
        Number of features per time step.
    hidden_size : int, default 1024
        Width of the GRU hidden state; the first dense layer maps it to
        ``hidden_size // 2`` units.
    num_layers : int, default 2
        Number of stacked GRU layers.
    dropout : float, default 0.3
        Dropout probability applied before each dense layer.

    The defaults reproduce the original fixed architecture, so ``RNN()``
    behaves as before.
    """

    def __init__(self, input_size=39, hidden_size=1024, num_layers=2, dropout=0.3):
        super().__init__()
        self.GRU = nn.GRU(input_size=input_size, num_layers=num_layers,
                          hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size // 2, 1)
        self.final = nn.Sigmoid()

    def forward(self, X):
        """Return raw logits of shape (batch, 1) for input (batch, seq, features)."""
        # No explicit h0: nn.GRU starts from a zero state by default, created
        # on the input's device and dtype. The former hand-built torch.zeros
        # state was always CPU/float32 and failed on any other device or dtype.
        seq_out, _ = self.GRU(X)
        last_step = self.drop(seq_out[:, -1, :])  # output of the final time step
        hidden = self.drop(self.relu(self.fc(last_step)))
        return self.fc1(hidden)

    def sig(self, X):
        """Apply the sigmoid to logits."""
        return self.final(X)

In [176]:
# Build a fresh, untrained instance of the RNN class defined in this notebook.
model = RNN()

In [177]:
def train(model, X_train, Y_train, Y1_train, X_val, Y_val, Y1_val, num_epochs, batch_per_e=500):
    """Train ``model`` on random mini-batches and evaluate once per epoch.

    Every epoch runs ``batch_per_e`` optimisation steps (RMSprop, lr 1e-4,
    ``BCEWithLogitsLoss``) on batches drawn with ``get_batch``. The model's
    logits are multiplied element-wise by the batch's ``Y1`` values before
    the loss is computed. After each epoch one random validation batch of
    1000 samples is scored: the sigmoid outputs are multiplied by ``Y1`` and
    the ``k`` highest-scoring samples are labelled positive, where ``k`` is
    the number of true positives in that batch.

    Parameters
    ----------
    model : nn.Module
        Must return logits from ``model(x)`` and provide ``model.sig``.
    X_train, Y_train, Y1_train : array-like
        Row-aligned training inputs, targets and Y(t-1) multipliers.
    X_val, Y_val, Y1_val : array-like
        Row-aligned validation inputs, targets and Y(t-1) multipliers.
    num_epochs : int
        Number of epochs.
    batch_per_e : int, default 500
        Optimisation steps per epoch.

    Returns
    -------
    tuple of four lists
        ``(acc_history, ROC_history, train_loss_history, f1_score_history)``,
        one entry per epoch. The training loss of an epoch is the mean of the
        losses recorded on every tenth batch.
    """
    optimizer = optim.RMSprop(model.parameters(), lr=0.0001)
    SigBCEloss = nn.BCEWithLogitsLoss()
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []

    for epoch in range(num_epochs):
        print("epoch: {} of {}".format(epoch, num_epochs))

        logged_losses = []
        model.train()
        for batch in range(batch_per_e):
            x, y, y1 = get_batch(X_train, Y_train, Y1_train)

            optimizer.zero_grad()
            # squeeze only the trailing unit dim so a batch of one keeps shape (1,)
            out = model(x).squeeze(-1)
            out = out * y1
            loss = SigBCEloss(out, y)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                # Store a plain float: keeping the tensor would keep its
                # autograd graph alive for the rest of the epoch.
                logged_losses.append(loss.item())

        model.eval()
        with torch.no_grad():  # no gradients are needed for evaluation
            x, y, y1 = get_batch(X_val, Y_val, Y1_val, batch_size=1000)
            scores = model.sig(model(x)).squeeze(-1) * y1

        y_true = y.cpu().numpy()
        n_pos = int((y_true == 1).sum())
        preds = np.zeros(len(scores))
        # Guard n_pos == 0: a slice of [-0:] would select every sample.
        if n_pos > 0:
            top = scores.argsort()[-n_pos:].cpu().numpy()
            preds[top] = 1

        acc = accuracy_score(y_true, preds)
        # NOTE(review): the AUC is computed on the hard 0/1 predictions, as in
        # the original code, not on the continuous scores; confirm this is
        # the intended metric.
        # roc_auc_score is undefined when the batch holds a single class.
        if 0 < n_pos < len(y_true):
            ROC = roc_auc_score(y_true, preds)
        else:
            ROC = float("nan")
        f1 = f1_score(y_true, preds)
        # Mean over the losses actually recorded (every tenth batch).
        if logged_losses:
            train_loss = sum(logged_losses) / len(logged_losses)
        else:
            train_loss = float("nan")

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)

        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(train_loss, acc, ROC, f1))

    return acc_history, ROC_history, train_loss_history, f1_score_history

In [None]:
# Train for 50 epochs on the resampled data and validate on the four-year
# sequences; hists is the tuple
# (acc_history, ROC_history, train_loss_history, f1_score_history).
hists = train(
    model,
    X_train_resample,
    Y_train_resample,
    Y1_17r,
    X_val,
    Y_val,
    Y1_18,
    num_epochs=50,
)

epoch: 0 of 50
training_loss: 0.6059, acc: 0.824, ROC: 0.558, f1: 0.214
epoch: 1 of 50
training_loss: 0.5946, acc: 0.826, ROC: 0.594, f1: 0.287
epoch: 2 of 50
