In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import shapely
import libpysal
from sklearn.model_selection import train_test_split, GridSearchCV
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import pickle
import time
from xgboost import XGBClassifier
from xgboost import callback
from sklearn.preprocessing import StandardScaler


In [2]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [3]:
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):

        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [4]:
# df15 = load_data(2015)
# df16 = load_data(2016)
df17 = load_data(2017)
df18 = load_data(2018)
df19 = load_data(2019)

In [5]:
X_train = df17[df17.columns[:-5]]
Y_train = df17.y
Y1_train = df17["y-1"]
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

X_val = df18[df18.columns[:-5]]
Y_val = df18.y
Y1_val = df18["y-1"]
X_val = ss.transform(X_val)

X_val1 = df19[df19.columns[:-5]]
Y_val1 = df19.y
Y1_val1 = df19["y-1"]
X_val1 = ss.transform(X_val1)


# weights
wt = libpysal.weights.DistanceBand.from_dataframe(df17, threshold=150, binary = True, silence_warnings = True)
wv = libpysal.weights.DistanceBand.from_dataframe(df18, threshold=150, binary = True, silence_warnings = True)
wv1 = libpysal.weights.DistanceBand.from_dataframe(df19, threshold=150, binary = True, silence_warnings = True)

In [6]:
oversamp = SMOTE()
x,y = oversamp.fit_resample(X_train, Y_train)



In [7]:
neighbors = [wv.neighbors[x] for x in wv.neighbors]  # ONly calculate Yt-1 val because at training it is just fitting the model
Y1_val = np.array([Y1_val[x].sum() for x in neighbors])
Y1_val[np.where(Y1_val == 0)[0]] = 0.5

neighbors = [wv1.neighbors[x] for x in wv1.neighbors]  # ONly calculate Yt-1 val because at training it is just fitting the model
Y1_val1 = np.array([Y1_val1[x].sum() for x in neighbors])
Y1_val1[np.where(Y1_val1 == 0)[0]] = 0.5

In [8]:
df17.shape

(9018, 44)

In [9]:
param_hist = []
f1_hist = []
auc_hist = []

best_auc = 0
for md in [2, 4, 6, 10]:
    for eta in [0.1, 0.01, 0.001, 0.0001]:
        for n_est in [250, 350, 500, 750, 1000]:
            for subsample in [0.1, 0.5, 0.01, 0.001]:
                params =[md, n_est, eta, subsample]

                tm = XGBClassifier(max_depth = md, n_estimators = n_est, eta = eta, eval_metric = "auc", subsample = subsample, use_label_encoder = False).fit(x, y.astype(int))
#                 preds = tm.predict(X_val)
                
#                 print(roc_auc_score(Y_val, preds))
#                 print(f1_score(Y_val, preds))
    #             
                preds = tm.predict_proba(X_val)[:,1]
                preds = Y1_val * preds
                auc = roc_auc_score(Y_val, preds)
                
                if auc > best_auc:
                    best_auc = auc
                    print(auc, params)
                p = np.zeros(len(preds))
                pos = preds.argsort()[-(Y_val == 1).sum() :]
                p[pos] = 1 
                
                f1 = f1_score(Y_val, p)

                param_hist.append(params)
                f1_hist.append(f1)
                auc_hist.append(auc)

                preds = tm.predict_proba(X_val1)[:,1]
                preds = Y1_val1 * preds
                
                p = np.zeros(len(preds))
                pos = preds.argsort()[-(Y_val1 == 1).sum() :]
                p[pos] = 1 
                
                auc1 = roc_auc_score(Y_val1, preds)
                f11 = f1_score(Y_val1, p)
                
#                 print("2016: {}, {}\n2017: {}, {}".format(f1, auc, f11, auc1))
                
                
                
            

0.6700125773593634 [2, 250, 0.1, 0.1]
0.6780954582199049 [2, 250, 0.1, 0.001]
0.6823674504822314 [2, 350, 0.1, 0.001]


In [37]:
a = np.array(auc_hist)

In [None]:
b = a.argmax()

In [None]:
param_hist[b]

In [31]:
auc_hist[b]

0.6781213508368527

In [32]:
f1_hist[b]

0.31451612903225806