In [1]:
from torch import nn, optim
import os
import shapely
import pandas as pd
import geopandas as gpd
import numpy as np
from numpy.random import default_rng
import torch
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [2]:
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [3]:
df15 = load_data(2015).set_index("C28992R100")
df16 = load_data(2016).set_index("C28992R100")
df17 = load_data(2017).set_index("C28992R100")
df18 = load_data(2018).set_index("C28992R100")
df19 = load_data(2019).set_index("C28992R100")


In [4]:
overlapping1517 = df17.index[df17.index.isin(df16.index[df16.index.isin(df15.index)])]
overlapping1518 = df18.index[df18.index.isin(overlapping1517)]
overlapping1519 = df19.index[df19.index.isin(overlapping1518)]

X = np.zeros((len(overlapping1519), 5, 39))
Y = np.zeros((len(overlapping1519), 5))
ss = StandardScaler()

for i, df in enumerate([df15, df16, df17, df18, df19]):
    X[:,i] = df[df.columns[:-3]].loc[overlapping1519]
    Y[:,i] = df.loc[overlapping1519]["y"]
    


In [5]:
rng = default_rng()
def get_batch(X, Y, batch_size, train = "train"):
    idxs = rng.integers(len(X), size = batch_size)
    return torch.tensor(X[idxs]).float(), torch.tensor(Y[idxs]).float()



In [6]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.GRU = nn.GRU(input_size = 39, hidden_size = 1024, batch_first = True)
        self.fc = nn.Linear(1024, 512)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(512,1)
        self.sig = nn.Sigmoid()
        
    def forward(self, X):
        X, _ = self.GRU(X)
        X = self.fc(X[:,-1])
        X = self.relu(X)
        X = self.fc1(X)
        X = self.sig(X)
        return X

In [7]:
X_train = X[:,:3]
Y_train = Y[:,:3]
Y_train = (Y_train==1).any(axis = 1).astype(int)
X_train = X_train.reshape(-1, 39*3)

In [8]:
smote = SMOTE()
X_train_resample,Y_train_resample = smote.fit_resample(X_train,Y_train)
X_train = X_train.reshape(-1,3,39)
X_train_resample = X_train_resample.reshape(-1,3,39)

X_train = X_train.reshape(-1, 39)
X_train_resample = X_train_resample.reshape(-1, 39)

X_train = ss.fit_transform(X_train)
X_train_resample = ss.transform(X_train_resample)

X_train = X_train.reshape(-1,3, 39)
X_train_resample = X_train_resample.reshape(-1,3, 39)

In [9]:
X_val = X[:,:4]
Y_val = Y[:,4]

X_val = X_val.reshape(-1, 39)
X_val = ss.transform(X_val)
X_val = X_val.reshape(-1, 4, 39)

In [10]:
model = RNN()

In [14]:
def train(model, X_train, Y_train, X_val, Y_val, num_epochs, batch_per_e):
    optimizer = optim.RMSprop(model.parameters(), lr=0.001) 
    BCEloss = nn.BCELoss()
    train_loss = []
    train_loss_history = []
    acc_history = []
    ROC_history = []
    f1_score_history = []
    
    for epoch in range(num_epochs):
        epoch_loss = []
        print("epoch: {} of {}".format(epoch, num_epochs))
        for batch in range(batch_per_e):
            
            x_train_, y_train_ = get_batch(X_train, Y_train, 32)
            
            
            model.train()

            optimizer.zero_grad()
            out = model(x_train_)
            loss = BCEloss(out.squeeze(), y_train_)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                train_loss.append(loss)
                
        model.eval()
        
        x_val_, y_val_ = get_batch(X_val, Y_val, batch_size = 1000, train = "val")
        predictions = model(x_val_).detach().numpy()
        predictions = (predictions > 0.5).astype(int).squeeze()
        y_val_ = y_val_
        
        acc = accuracy_score(y_val_, predictions)
        ROC = roc_auc_score(y_val_, predictions)
        f1 = f1_score(y_val_, predictions)
        train_loss = (np.sum(train_loss) / (batch_per_e/10)).detach().item()

        acc_history.append(acc)
        ROC_history.append(ROC)
        train_loss_history.append(train_loss)
        f1_score_history.append(f1)
        
        
        print("training_loss: {:.4f}, acc: {:.3f}, ROC: {:.3f}, f1: {:.3f}".format(train_loss, acc, ROC , f1))
        train_loss = []


    return acc_history, ROC_history, train_loss_history, f1_score_history

In [15]:
hists = train(model, X_train_resample, Y_train_resample, X_val, Y_val, 100, 500)

epoch: 0 of 100
training_loss: 0.8022, acc: 0.478, ROC: 0.571, f1: 0.267
epoch: 1 of 100
training_loss: 0.6939, acc: 0.567, ROC: 0.583, f1: 0.277
epoch: 2 of 100
training_loss: 0.5405, acc: 0.604, ROC: 0.623, f1: 0.300
epoch: 3 of 100
training_loss: 0.5215, acc: 0.566, ROC: 0.576, f1: 0.249
epoch: 4 of 100
training_loss: 0.4945, acc: 0.554, ROC: 0.553, f1: 0.276
epoch: 5 of 100
training_loss: 0.5114, acc: 0.560, ROC: 0.627, f1: 0.279
epoch: 6 of 100
training_loss: 0.4759, acc: 0.532, ROC: 0.568, f1: 0.276
epoch: 7 of 100
training_loss: 0.4813, acc: 0.563, ROC: 0.583, f1: 0.280
epoch: 8 of 100
training_loss: 0.4657, acc: 0.512, ROC: 0.552, f1: 0.278
epoch: 9 of 100
training_loss: 0.4848, acc: 0.528, ROC: 0.586, f1: 0.280
epoch: 10 of 100
training_loss: 0.4557, acc: 0.506, ROC: 0.599, f1: 0.284
epoch: 11 of 100
training_loss: 0.4308, acc: 0.494, ROC: 0.578, f1: 0.297
epoch: 12 of 100
training_loss: 0.4457, acc: 0.345, ROC: 0.544, f1: 0.257
epoch: 13 of 100
training_loss: 0.4268, acc: 0.3

KeyboardInterrupt: 