In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import shapely
import libpysal
from sklearn.model_selection import train_test_split, GridSearchCV
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

In [41]:
from imblearn.over_sampling import SMOTE

count    68406.000000
mean         0.500000
std          0.500004
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: y, dtype: float64

In [2]:
def load_data(year):
    os.getcwd()
    df = pd.DataFrame()
    path = "../Data/filled/" + str(year) + "/"
    for filename in os.listdir(path):
        df1 = pd.read_csv(path + filename)
        if df1.geometry.isna().any():
            print(filename)
        df = pd.concat([df, df1])
    df = gpd.GeoDataFrame(df)
    df.geometry = df.geometry.apply(shapely.wkt.loads)
    
    df = df.reset_index()
    df = df.drop(["Unnamed: 0", "index"], axis = 1)
    return df

In [115]:
# df15 = load_data(2015)
# df16 = load_data(2016)
df17 = load_data(2017)
df18 = load_data(2018)
df19 = load_data(2019)

In [98]:
X = df15[df15.columns[:-4]]
Y = df15.y


# weights
w = libpysal.weights.DistanceBand.from_dataframe(df15, threshold=150, binary = True, silence_warnings = True)

In [101]:
oversample = SMOTE()
x, y = oversample.fit_resample(X, Y)


In [103]:
# Train the transition model:
clf = SVC(class_weight='balanced', probability=True, verbose = True)


params = {"max_depth": [2,4,6,8], "oob_score" : [True, False]}
clf = GridSearchCV( RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)
clf_bagger = GridSearchCV(RandomForestClassifier(), params, cv = 5, scoring = "balanced_accuracy",
                               verbose = 3)

clf.fit(x,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=2, oob_score=True;, score=0.650 total time=   4.9s
[CV 2/5] END .......max_depth=2, oob_score=True;, score=0.710 total time=   5.2s
[CV 3/5] END .......max_depth=2, oob_score=True;, score=0.674 total time=   5.1s
[CV 4/5] END .......max_depth=2, oob_score=True;, score=0.676 total time=   4.8s
[CV 5/5] END .......max_depth=2, oob_score=True;, score=0.610 total time=   4.9s
[CV 1/5] END ......max_depth=2, oob_score=False;, score=0.645 total time=   4.0s
[CV 2/5] END ......max_depth=2, oob_score=False;, score=0.713 total time=   4.0s
[CV 3/5] END ......max_depth=2, oob_score=False;, score=0.669 total time=   4.1s
[CV 4/5] END ......max_depth=2, oob_score=False;, score=0.673 total time=   4.4s
[CV 5/5] END ......max_depth=2, oob_score=False;, score=0.622 total time=   4.7s
[CV 1/5] END .......max_depth=4, oob_score=True;, score=0.663 total time=   7.9s
[CV 2/5] END .......max_depth=4, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 6, 8], 'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [104]:
grid_transitions = pd.DataFrame(clf.predict_proba(X)).set_index(X.index)[1]

In [105]:
#create neighborhood function:
neighbors = [w.neighbors[x] for x in X.index] # get train neighbors
transitions = [grid_transitions.loc[x].values for x in neighbors] # get the trans ps of neighbors
n_function = np.zeros((len(transitions), w.max_neighbors + 1)) # create array to hold

for i, (t, idx) in enumerate(zip(transitions, X.index)): # fill array
    n_function[i, 1:len(t) + 1] = t
    n_function[i, 0] = grid_transitions.loc[idx]
    
n_function = pd.DataFrame(n_function).set_index(X.index)

In [106]:
oversample = SMOTE()
x, y = oversample.fit_resample(n_function, Y)
# fit bag model:
clf_bagger.fit(x, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .......max_depth=2, oob_score=True;, score=0.737 total time=   4.8s
[CV 2/5] END .......max_depth=2, oob_score=True;, score=0.719 total time=   5.0s
[CV 3/5] END .......max_depth=2, oob_score=True;, score=0.705 total time=   5.1s
[CV 4/5] END .......max_depth=2, oob_score=True;, score=0.723 total time=   4.4s
[CV 5/5] END .......max_depth=2, oob_score=True;, score=0.642 total time=   4.6s
[CV 1/5] END ......max_depth=2, oob_score=False;, score=0.739 total time=   4.2s
[CV 2/5] END ......max_depth=2, oob_score=False;, score=0.716 total time=   4.4s
[CV 3/5] END ......max_depth=2, oob_score=False;, score=0.711 total time=   4.2s
[CV 4/5] END ......max_depth=2, oob_score=False;, score=0.722 total time=   4.1s
[CV 5/5] END ......max_depth=2, oob_score=False;, score=0.640 total time=   4.3s
[CV 1/5] END .......max_depth=4, oob_score=True;, score=0.747 total time=   7.8s
[CV 2/5] END .......max_depth=4, oob_score=True;,

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 6, 8], 'oob_score': [True, False]},
             scoring='balanced_accuracy', verbose=3)

In [107]:
# predicting:
X = df16[df16.columns[:-4]]
Y = df16.y

# weights
w = libpysal.weights.DistanceBand.from_dataframe(df16, threshold=150, binary = True, silence_warnings = True)
grid_transitions = pd.DataFrame(clf.predict_proba(X)).set_index(X.index)[1]


In [108]:
#create neighborhood function:
neighbors = [w.neighbors[x] for x in X.index] # get train neighbors
transitions = [grid_transitions.loc[x].values for x in neighbors] # get the trans ps of neighbors
n_function = np.zeros((len(transitions), w.max_neighbors + 1)) # create array to hold

for i, (t, idx) in enumerate(zip(transitions, X.index)): # fill array
    n_function[i, 1:len(t) + 1] = t
    n_function[i, 0] = grid_transitions.loc[idx]

In [109]:
preds = clf_bagger.predict(n_function)

In [110]:
clf_bagger.score(n_function, Y)

0.6314418818740152

In [111]:
recall_score(Y, clf_bagger.predict(n_function))

0.5498857578065499

In [114]:
X = df17[df16.columns[:-4]]
Y = df17.y

# weights
w = libpysal.weights.DistanceBand.from_dataframe(df17, threshold=150, binary = True, silence_warnings = True)
grid_transitions = pd.DataFrame(clf.predict_proba(X)).set_index(X.index)[1]

neighbors = [w.neighbors[x] for x in X.index] # get train neighbors
transitions = [grid_transitions.loc[x].values for x in neighbors] # get the trans ps of neighbors
n_function = np.zeros((len(transitions), w.max_neighbors + 1)) # create array to hold

for i, (t, idx) in enumerate(zip(transitions, X.index)): # fill array
    n_function[i, 1:len(t) + 1] = t
    n_function[i, 0] = grid_transitions.loc[idx]
    
preds = clf_bagger.predict(n_function)

clf_bagger.score(n_function, Y)

recall_score(Y, clf_bagger.predict(n_function))

0.5139960326206745

In [None]:
df17.isna()

In [117]:
X = df18[df18.columns[:-4]]
Y = df18.y

# weights
w = libpysal.weights.DistanceBand.from_dataframe(df18, threshold=150, binary = True, silence_warnings = True)
grid_transitions = pd.DataFrame(clf.predict_proba(X)).set_index(X.index)[1]

neighbors = [w.neighbors[x] for x in X.index] # get train neighbors
transitions = [grid_transitions.loc[x].values for x in neighbors] # get the trans ps of neighbors
n_function = np.zeros((len(transitions), w.max_neighbors + 1)) # create array to hold

for i, (t, idx) in enumerate(zip(transitions, X.index)): # fill array
    n_function[i, 1:len(t) + 1] = t
    n_function[i, 0] = grid_transitions.loc[idx]
    
preds = clf_bagger.predict(n_function)

clf_bagger.score(n_function, Y)

recall_score(Y, clf_bagger.predict(n_function))

0.5036399735274653

In [118]:
X = df19[df19.columns[:-4]]
Y = df19.y

# weights
w = libpysal.weights.DistanceBand.from_dataframe(df19, threshold=150, binary = True, silence_warnings = True)
grid_transitions = pd.DataFrame(clf.predict_proba(X)).set_index(X.index)[1]

neighbors = [w.neighbors[x] for x in X.index] # get train neighbors
transitions = [grid_transitions.loc[x].values for x in neighbors] # get the trans ps of neighbors
n_function = np.zeros((len(transitions), w.max_neighbors + 1)) # create array to hold

for i, (t, idx) in enumerate(zip(transitions, X.index)): # fill array
    n_function[i, 1:len(t) + 1] = t
    n_function[i, 0] = grid_transitions.loc[idx]
    
preds = clf_bagger.predict(n_function)

clf_bagger.score(n_function, Y)

recall_score(Y, clf_bagger.predict(n_function))

0.44545196939642956