In [8]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
from itertools import combinations
import os
import geopandas as gpd
import shapely

In [9]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + e**-z)`` of ``z``.

    ``z`` may be a scalar or a NumPy array; arrays are handled element-wise.
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)

In [10]:
# Unpickle the stored "EH" predictions of the six models from ../results.
# RF and XGB are kept exactly as stored.
with open("../results/EHRFpreds", mode="rb") as file:
    RF = pickle.load(file)
with open("../results/EHXGBpreds", mode="rb") as file:
    XGB = pickle.load(file)

# MLP, CNN, RNN and RNNCNN are passed through sigmoid() after loading
# (presumably they are stored as raw logits -- TODO confirm).
with open("../results/EHMLPpreds", mode="rb") as file:
    MLP = sigmoid(pickle.load(file))
with open("../results/EHCNNpreds", mode="rb") as file:
    CNN = sigmoid(pickle.load(file))
with open("../results/EHRNNpreds", mode="rb") as file:
    RNN = sigmoid(pickle.load(file))
with open("../results/EHRNNCNNpreds", mode="rb") as file:
    RNNCNN = sigmoid(pickle.load(file))
    

In [11]:
def load_data(city):
    """Read one CSV from ``../Data/filled/`` and return it as a GeoDataFrame.

    Parameters
    ----------
    city : str
        File stem below ``../Data/filled/``; ``".csv"`` is appended to it.

    Returns
    -------
    geopandas.GeoDataFrame
        The CSV rows with the WKT strings of the ``geometry`` column parsed
        into shapely geometries, without the ``Unnamed: 0`` column and with
        a fresh positional index.

    Raises
    ------
    KeyError
        If the CSV has no ``Unnamed: 0`` column (unchanged behaviour).
    """
    # Import the submodule explicitly: a bare ``import shapely`` does not
    # guarantee that ``shapely.wkt`` is loaded.
    import shapely.wkt

    path = "../Data/filled/" + city + ".csv"
    table = pd.read_csv(path)

    if table.geometry.isna().any():
        # Name the offending file. The previous code printed an undefined
        # variable here and therefore died with a NameError instead.
        print(path)

    # Parse the WKT text first and only then build the GeoDataFrame, so the
    # geometry column is declared explicitly instead of being read through
    # ``.geometry`` before any geometry column has been set.
    table["geometry"] = table["geometry"].apply(shapely.wkt.loads)
    gdf = gpd.GeoDataFrame(table, geometry="geometry")

    gdf = gdf.reset_index()
    return gdf.drop(["Unnamed: 0", "index"], axis=1)

In [161]:
# For every city: load the five files (suffixes 15..19) indexed by the
# "C28992R100" column, keep the index labels present in all five, and record
# the row positions of those labels inside the "19" frame.

# --- 's-Gravenhage (DH) ---
DH15, DH16, DH17, DH18, DH19 = (
    load_data(stem).set_index("C28992R100")
    for stem in (
        "'s-Gravenhage15.csv_filled_",
        "'s-Gravenhage16.csv_filled_",
        "'s-Gravenhage17.csv_filled_",
        "'s-Gravenhage18.csv_filled_",
        "'s-Gravenhage19.csv_filled_",
    )
)
overlapping1517 = DH16.index[DH16.index.isin(DH15.index)]
overlapping1517 = DH17.index[DH17.index.isin(overlapping1517)]
overlapping1518 = DH18.index[DH18.index.isin(overlapping1517)]
overlapping1519 = DH19.index[DH19.index.isin(overlapping1518)]
DH = DH19.loc[overlapping1519]
dic = {cell: position for position, cell in enumerate(DH19.index)}
overlapDH = [dic[cell] for cell in overlapping1519]

# --- Nijmegen (NM) ---
NM15, NM16, NM17, NM18, NM19 = (
    load_data(stem).set_index("C28992R100")
    for stem in (
        "Nijmegen15.csv_filled_",
        "Nijmegen16.csv_filled_",
        "Nijmegen17.csv_filled_",
        "Nijmegen18.csv_filled_",
        "Nijmegen19.csv_filled_",
    )
)
overlapping1517 = NM16.index[NM16.index.isin(NM15.index)]
overlapping1517 = NM17.index[NM17.index.isin(overlapping1517)]
overlapping1518 = NM18.index[NM18.index.isin(overlapping1517)]
overlapping1519 = NM19.index[NM19.index.isin(overlapping1518)]
NM = NM19.loc[overlapping1519]
dic = {cell: position for position, cell in enumerate(NM19.index)}
overlapNM = [dic[cell] for cell in overlapping1519]

# --- Eindhoven (EH) ---
EH15, EH16, EH17, EH18, EH19 = (
    load_data(stem).set_index("C28992R100")
    for stem in (
        "eindhoven15.csv_filled_",
        "eindhoven16.csv_filled_",
        "eindhoven17.csv_filled_",
        "eindhoven18.csv_filled_",
        "eindhoven19.csv_filled_",
    )
)
overlapping1517 = EH16.index[EH16.index.isin(EH15.index)]
overlapping1517 = EH17.index[EH17.index.isin(overlapping1517)]
overlapping1518 = EH18.index[EH18.index.isin(overlapping1517)]
overlapping1519 = EH19.index[EH19.index.isin(overlapping1518)]
EH = EH19.loc[overlapping1519]
dic = {cell: position for position, cell in enumerate(EH19.index)}
overlapEH = [dic[cell] for cell in overlapping1519]

In [162]:
def _load_city_preds(citycode, model, squash=False, rows=None):
    """Unpickle ``../results/<citycode><model>preds``; optionally apply sigmoid, then select ``rows``."""
    with open("../results/" + citycode + model + "preds", 'rb') as file:
        preds = pickle.load(file)
    if squash:
        preds = sigmoid(preds)
    return preds if rows is None else preds[rows]


def get_ensemble_scores(citycode, y, overlap):
    """Score mean and median ensembles for every combination of 2 to 6 models.

    Parameters
    ----------
    citycode : str
        Prefix of the prediction files, read as
        ``../results/<citycode><MODEL>preds``.
    y : pandas.Series
        True labels; entries equal to 1 are counted as positives.
    overlap : sequence of int
        Positions used to index the RF, XGB, MLP and CNN predictions. Its
        length also sizes the hard-label vectors used for F1.

    Returns
    -------
    pandas.DataFrame
        One row per model combination with the columns ``Models`` (tuple of
        model names), ``mean_auc``, ``mean_f1``, ``median_auc`` and
        ``median_f1``.
    """
    base_preds = [
        ("RF", _load_city_preds(citycode, "RF", rows=overlap)),
        ("XGB", _load_city_preds(citycode, "XGB", rows=overlap)),
        ("MLP", _load_city_preds(citycode, "MLP", squash=True, rows=overlap)),
        ("CNN", _load_city_preds(citycode, "CNN", squash=True, rows=overlap)),
        # RNN and RNNCNN are deliberately left un-indexed, exactly as before.
        # NOTE(review): presumably these files already hold only the
        # overlapping rows -- confirm against the code that wrote them.
        ("RNN", _load_city_preds(citycode, "RNN", squash=True)),
        ("RNNCNN", _load_city_preds(citycode, "RNNCNN", squash=True)),
    ]

    y_true = y.values
    # Loop-invariant: how many entries get labelled positive for F1.
    # NOTE: assumes at least one positive; with zero, ``[-0:]`` below would
    # select every entry.
    n_positive = int(np.sum(y == 1))
    n_rows = len(overlap)

    # Collect plain records and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for size in range(2, 7):
        for members in combinations(base_preds, size):
            names = tuple(name for name, _ in members)
            stacked = [preds for _, preds in members]

            mean = np.mean(stacked, axis=0)
            median = np.median(stacked, axis=0)

            # F1 needs hard labels: flag the n_positive highest ensemble
            # scores as 1 and everything else as 0.
            mean_labels = np.zeros(n_rows)
            median_labels = np.zeros(n_rows)
            mean_labels[mean.argsort()[-n_positive:]] = 1
            median_labels[median.argsort()[-n_positive:]] = 1

            records.append({
                "Models": names,
                "mean_auc": roc_auc_score(y_true, mean),
                "mean_f1": f1_score(y_true, mean_labels),
                "median_auc": roc_auc_score(y_true, median),
                "median_f1": f1_score(y_true, median_labels),
            })

    return pd.DataFrame(
        records,
        columns=["Models", "mean_auc", "mean_f1", "median_auc", "median_f1"],
    )


In [163]:
def get_them_strings(inp):
    """Render ``inp.Models`` as text with quotes and parentheses removed.

    For example the tuple ``("RF", "XGB")`` becomes ``"RF, XGB"``.
    """
    unwanted = str.maketrans("", "", "'()")
    return str(inp.Models).translate(unwanted)

In [164]:
# Score every ensemble per city and index each table by a readable model list.
# NOTE: DH / EH / NM are rebound here from the city frames (whose `y` column is
# the target) to the score tables, so this cell cannot simply be re-run.
DH = get_ensemble_scores("DH", DH.y, overlapDH)
DH = DH.assign(Models=DH.apply(get_them_strings, axis=1)).set_index("Models")

EH = get_ensemble_scores("EH", EH.y, overlapEH)
EH = EH.assign(Models=EH.apply(get_them_strings, axis=1)).set_index("Models")

NM = get_ensemble_scores("NM", NM.y, overlapNM)
NM = NM.assign(Models=NM.apply(get_them_strings, axis=1)).set_index("Models")

In [165]:
# Two-level column header: ensemble type on top, metric underneath.
arrays = [
    ["Mean Ensemble", "Mean Ensemble", "Median Ensemble", "Median Ensemble"],
    ["AUC", "F1", "AUC", "F1"],
]
tuples = list(zip(arrays[0], arrays[1]))

columns = pd.MultiIndex.from_tuples(tuples, names=["Type", "Metric"])
# The same header object is attached to all three score tables.
DH.columns = EH.columns = NM.columns = columns

In [166]:
# Write one Excel workbook per city into ../results.
DH.to_excel(excel_writer="../results/ensembleDH.xlsx")
EH.to_excel(excel_writer="../results/ensembleEH.xlsx")
NM.to_excel(excel_writer="../results/ensembleNM.xlsx")

In [105]:
# Print the first index label of `tot` without quotes and parentheses.
# NOTE(review): `tot` is not assigned in any cell visible here -- make sure it
# is defined before this cell runs.
print(str(tot.index.values[0]).translate(str.maketrans("", "", "'()")))

RF, XGB


In [131]:
def get_them_strings(inp):
    """Render a row's model combination as text without quotes or parentheses.

    Reads ``inp.models`` and falls back to ``inp.Models`` when that attribute
    is missing. This notebook defines ``get_them_strings`` twice with the two
    spellings; this later definition shadows the earlier one, so it has to
    accept rows with either name.

    Parameters
    ----------
    inp : object
        Anything exposing ``models`` or ``Models`` as an attribute, e.g. a
        row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        ``str()`` of the value with every ``'``, ``(`` and ``)`` removed,
        e.g. ``("RF", "XGB")`` becomes ``"RF, XGB"``.

    Raises
    ------
    AttributeError
        If ``inp`` has neither attribute.
    """
    _missing = object()
    combo = getattr(inp, "models", _missing)
    if combo is _missing:
        combo = inp.Models
    return str(combo).replace("'", "").replace("(", "").replace(")", "")
    

In [133]:
# Replace the `models` column of `tot` by its plain-text rendering.
tot["models"] = tot.apply(get_them_strings, axis=1)

In [142]:
# Display the EH table as the cell's output.
EH

Unnamed: 0_level_0,mean_auc,mean_f1,median_auc,median_f1
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"RF, XGB",0.677992,0.194617,0.677992,0.194617
"RF, MLP",0.689441,0.207039,0.689441,0.207039
"RF, CNN",0.666166,0.194617,0.666166,0.194617
"RF, RNN",0.691524,0.202899,0.691524,0.202899
"RF, RNNCNN",0.674473,0.198758,0.674473,0.198758
"XGB, MLP",0.692827,0.20911,0.692827,0.20911
"XGB, CNN",0.676179,0.194617,0.676179,0.194617
"XGB, RNN",0.694841,0.207039,0.694841,0.207039
"XGB, RNNCNN",0.682765,0.198758,0.682765,0.198758
"MLP, CNN",0.657965,0.227743,0.657965,0.227743
