In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import shapely
import libpysal
from sklearn.model_selection import train_test_split, GridSearchCV
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import pickle
import time
from xgboost import XGBClassifier
from xgboost import callback
from sklearn.preprocessing import StandardScaler


In [2]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [3]:
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [4]:
df15 = load_data(2015)
df16 = load_data(2016)
df17 = load_data(2017)
df18 = load_data(2018)
df19 = load_data(2019)

In [41]:
X_train = df15[df15.columns[:-5]]
Y_train = df15.y
Y1_train = df15["y-1"]
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

X_val = df16[df16.columns[:-5]]
Y_val = df16.y
Y1_val = df16["y-1"]
X_val = ss.transform(X_val)

X_val1 = df17[df17.columns[:-5]]
Y_val1 = df17.y
Y1_val1 = df17["y-1"]
X_val1 = ss.transform(X_val1)


# weights
wt = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)
wv1 = libpysal.weights.DistanceBand.from_dataframe(df17, threshold=150, binary = True, silence_warnings = True)

In [42]:
oversamp = SMOTE()
x,y = oversamp.fit_resample(X_train, Y_train)



In [44]:
neighbors = [wv.neighbors[x] for x in wv.neighbors]  # ONly calculate Yt-1 val because at training it is just fitting the model
Y1_val = np.array([Y1_val[x].sum() for x in neighbors])
Y1_val[np.where(Y1_val == 0)[0]] = 0.5

neighbors = [wv1.neighbors[x] for x in wv1.neighbors]  # ONly calculate Yt-1 val because at training it is just fitting the model
Y1_val1 = np.array([Y1_val1[x].sum() for x in neighbors])
Y1_val1[np.where(Y1_val1 == 0)[0]] = 0.5

In [46]:
param_hist = []
f1_hist = []
auc_hist = []
for md in [4,3,2]:
    for n_est in [100, 250, 500]:
        for eta in [0.01, 0.05, 0.001]:
                params =[md, n_est, eta]

                tm = XGBClassifier(max_depth = md, n_estimators = n_est, eta = eta, eval_metric = "logloss").fit(x, y.astype(int))
                preds = tm.predict(X_val)
                
                print(roc_auc_score(Y_val, preds))
                print(f1_score(Y_val, preds))
    #             
                preds = tm.predict_proba(X_val)[:,1]


                preds = Y1_val * preds
                p = np.zeros(len(preds))
                pos = preds.argsort()[-(Y_val == 1).sum() :]
                p[pos] = 1 
                auc = roc_auc_score(Y_val, p)
                f1 = f1_score(Y_val, p)

                param_hist.append(params)
                f1_hist.append(f1)
                auc_hist.append(auc)

                preds = tm.predict_proba(X_val1)[:,1]
                preds = Y1_val1 * preds
                p = np.zeros(len(preds))
                pos = preds.argsort()[-(Y_val1 == 1).sum() :]
                p[pos] = 1 
                auc1 = roc_auc_score(Y_val1, p)
                f11 = f1_score(Y_val1, p)
                
                print("2016: {}, {}\n2017: {}, {}".format(f1, auc, f11, auc1))
                
                
                
            



0.6066036650288684
0.3271716015049893
2016: 0.34824281150159747, 0.6194531669256299
2017: 0.366593567251462, 0.6382510360690664




0.5816063448444971
0.28230943086089993
2016: 0.34345047923322686, 0.6166550284471419
2017: 0.36805555555555564, 0.6390860019408054




0.6064160436880154
0.32361778846153844
2016: 0.3408945686900958, 0.6151626879252816
2017: 0.3559941520467836, 0.6321975334989584




0.602022978263185
0.3234802007808143
2016: 0.35654952076677315, 0.6243032736216757
2017: 0.3676900584795322, 0.6388772604728707




0.5410528747315476
0.1756263682802238
2016: 0.3182108626198083, 0.6019181657937717
2017: 0.3519736842105263, 0.6299013773516761




0.6064160436880154
0.32361778846153844
2016: 0.3472843450479233, 0.6188935392299324
2017: 0.3607456140350877, 0.6349111725821103




0.5826688703369923
0.2846625766871166
2016: 0.34440894568690095, 0.6172146561428394
2017: 0.3695175438596491, 0.6399209678125445




0.5272689305429461
0.12746905451672372
2016: 0.30894568690095847, 0.5965084314020281
2017: 0.34978070175438597, 0.6286489285440676




0.6062820529065811
0.32337486863834264
2016: 0.3450479233226837, 0.6175877412733046
2017: 0.36293859649122806, 0.6361636213897189




0.5926244966998028
0.30436363636363634
2016: 0.35207667731629394, 0.6216916777084204
2017: 0.36403508771929827, 0.6367898457935232




0.6033513962609979
0.3259341472438032
2016: 0.35175718849840254, 0.6215051351431878
2017: 0.3702485380116959, 0.6403384507484139




0.5862152338871163
0.29194382852919437
2016: 0.34217252396166137, 0.6159088581862118
2017: 0.3534356725146199, 0.6307363432234152




0.6120861568430163
0.33942108890420397
2016: 0.36134185303514377, 0.6271014121001638
2017: 0.37426900584795314, 0.6426346068956963




0.5522819529057934
0.21035892776010903
2016: 0.3274760383386581, 0.6073279001855152
2017: 0.35818713450292405, 0.6334499823065669




0.5862152338871163
0.29194382852919437
2016: 0.34792332268370607, 0.6192666243603975
2017: 0.35855263157894735, 0.6336587237745017




0.6060875821967091
0.3308931185944363
2016: 0.35303514376996803, 0.6222513054041179
2017: 0.3695175438596491, 0.6399209678125445




0.531124012278179
0.14717542120911795
2016: 0.307667731629393, 0.595762261141098
2017: 0.34612573099415206, 0.62656151386472




0.5864574618224733
0.29243884358784283
2016: 0.3498402555910543, 0.6203858797517925
2017: 0.36330409356725146, 0.6363723628576536




0.6290781057522029
0.34915773353751917
2016: 0.35047923322683705, 0.6207589648822576
2017: 0.36988304093567254, 0.6401297092804792




0.6306296933160775
0.3638863428047663
2016: 0.35527156549520766, 0.6235571033607457
2017: 0.36878654970760233, 0.639503484876675




0.6292120965336372
0.3493806665815349
2016: 0.339297124600639, 0.6142299750991188
2017: 0.3519736842105263, 0.6299013773516761




0.6363271868237319
0.3625418060200669
2016: 0.3549520766773163, 0.6233705607955131
2017: 0.3713450292397661, 0.6409646751522182




0.5635914255488998
0.24093816631130063
2016: 0.33865814696485624, 0.6138568899686538
2017: 0.3669590643274854, 0.6384597775370012




0.6292120965336372
0.3493806665815349
2016: 0.3440894568690096, 0.6170281135776069
2017: 0.3541666666666667, 0.6311538261592847




0.6285613722299868
0.3612783696155628
2016: 0.3539936102236422, 0.6228109330998155
2017: 0.3684210526315789, 0.6392947434087402




0.5375134290441328
0.17054263565891473
2016: 0.3198083067092652, 0.6028508786199344
2017: 0.34868421052631576, 0.6280227041402633




0.6292120965336372
0.3493806665815349
2016: 0.34600638977635784, 0.6181473689690021
2017: 0.35635964912280704, 0.6324062749668932


In [52]:
a = np.array(auc_hist)

In [53]:
a.argmax()

12

In [54]:
param_hist[12]

[3, 250, 0.01]

In [55]:
auc_hist[12]

0.6271014121001638

In [56]:
f1_hist[12]

0.36134185303514377