In [1]:
import csv
import os
import pickle
import time

import geopandas as gpd
import libpysal
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely
import shapely.wkt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [3]:
def load_data(year):
    """Load every CSV file for ``year`` into a single GeoDataFrame.

    Reads all files found in ``../Data/filled/<year>/``, stacks them, and
    parses the WKT text stored in the ``geometry`` column into shapely
    geometries. The name of any file whose ``geometry`` column contains
    missing values is printed as a warning.

    Parameters
    ----------
    year : int or str
        Name of the sub-directory under ``../Data/filled/`` to read.

    Returns
    -------
    geopandas.GeoDataFrame
        All rows of all files, with a fresh 0..n-1 index and without the
        "Unnamed: 0" column.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or contains no files.
    KeyError
        If the combined data has no "Unnamed: 0" column to drop.
    """
    path = "../Data/filled/" + str(year) + "/"

    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the positional index) reproducible between runs.
    for filename in sorted(os.listdir(path)):
        frame = pd.read_csv(path + filename)
        if frame["geometry"].isna().any():
            print(filename)
        frames.append(frame)

    if not frames:
        raise FileNotFoundError("no data files found in " + path)

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(frames, ignore_index=True)
    df = df.drop(["Unnamed: 0"], axis = 1)

    # Parse WKT before building the GeoDataFrame so the geometry column is
    # valid at construction time.
    df["geometry"] = df["geometry"].apply(shapely.wkt.loads)
    return gpd.GeoDataFrame(df, geometry="geometry")

In [4]:
# One GeoDataFrame per study year, 2015 through 2019, loaded in order.
df15, df16, df17, df18, df19 = [load_data(year) for year in range(2015, 2020)]

In [5]:
# Features are every column except the trailing four; the target is column "y".
feature_cols = df15.columns[:-4]
Y_train = df15["y"]
Y_val = df16["y"]

# Standardise with statistics fitted on the 2015 (training) frame only, then
# apply the same transform to the 2016 (validation) frame.
ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(df15[feature_cols]), index=df15.index)
X_val = pd.DataFrame(ss.transform(df16[feature_cols]), index=df16.index)

# Binary distance-band spatial weights (threshold 150) for each year.
wt = libpysal.weights.DistanceBand.from_dataframe(
    df15, threshold=150, binary=True, silence_warnings=True)
wv = libpysal.weights.DistanceBand.from_dataframe(
    df16, threshold=150, binary=True, silence_warnings=True)

In [6]:
class grid_searcher():
    """Fit and score a two-stage model: transition model, then rule model.

    Stage 1 (``transition_fit``) fits ``trans_model`` on the SMOTE-resampled
    training features. Stage 2 (``neighbor_fit``) builds, for every training
    row, a vector holding the row's own class-1 probability from the
    transition model followed by the class-1 probabilities of its spatial
    neighbours, and fits ``rule_model`` on the ADASYN-resampled vectors.
    ``val`` scores the rule model on the validation data.

    Parameters
    ----------
    trans_model : estimator
        Must provide ``fit(X, y)`` and ``predict_proba(X)``.
    rule_model : estimator
        Must provide ``fit(X, y)`` and ``predict(X)``.
    X_train, X_val : pandas.DataFrame
        Feature frames; their index labels must be keys of the matching
        weights object's ``neighbors`` mapping.
    Y_train, Y_val : array-like
        Targets for the training and validation rows.
    w_train, w_val : weights object
        Must expose ``neighbors`` (label -> list of neighbour labels) and
        ``max_neighbors``.
    random_state : int or None, optional
        Seed passed to the SMOTE and ADASYN resamplers. ``None`` (default)
        keeps the previous unseeded behaviour.
    """

    def __init__(self, trans_model, rule_model, X_train, Y_train, X_val, Y_val, w_train, w_val,
                 random_state=None):
        self.t0 = time.time()  # start time; val() reports seconds elapsed since here
        self.tm = trans_model
        self.rm = rule_model
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_val = X_val
        self.Y_val = Y_val
        self.w_train = w_train
        self.w_val = w_val
        self.random_state = random_state

        self.t_idx = X_train.index
        self.v_idx = X_val.index

    def transition_fit(self):
        """Fit the transition model on SMOTE-oversampled training data."""
        oversample = SMOTE(random_state=self.random_state)
        x, y = oversample.fit_resample(self.X_train, self.Y_train)
        self.tm.fit(x, y)

    def neighbor_function(self, X, idxs, w):
        """Build the neighbourhood probability matrix for the rows of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Features passed to the transition model's ``predict_proba``.
        idxs : pandas.Index
            Row labels of ``X``; each must be a key of ``w.neighbors`` and
            every neighbour label must also appear in ``idxs``.
        w : weights object
            Provides ``neighbors`` and ``max_neighbors``.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``idxs`` with ``w.max_neighbors + 1`` columns. Column 0
            is the row's own class-1 probability; the following columns are
            its neighbours' class-1 probabilities in ``w.neighbors`` order,
            zero-padded on the right.
        """
        # Column 1 of predict_proba is taken as the probability of interest.
        grid_transitions = pd.DataFrame(self.tm.predict_proba(X)).set_index(idxs)[1]

        # NOTE(review): the width depends on w.max_neighbors, so the training
        # and validation weights must share the same max_neighbors for the
        # rule model to accept both matrices - confirm for new data.
        n_function = np.zeros((len(idxs), w.max_neighbors + 1))

        for i, idx in enumerate(idxs):
            t = grid_transitions.loc[w.neighbors[idx]].values  # neighbours' probabilities
            n_function[i, 0] = grid_transitions.loc[idx]
            n_function[i, 1:len(t) + 1] = t

        return pd.DataFrame(n_function, index=idxs)

    def neighbor_fit(self):
        """Fit the rule model on ADASYN-oversampled neighbourhood vectors."""
        n_function = self.neighbor_function(self.X_train, self.t_idx, self.w_train)
        oversample = ADASYN(random_state=self.random_state)
        x, y = oversample.fit_resample(n_function, self.Y_train)
        self.rm.fit(x, y)

    def val(self):
        """Score the fitted models on the validation data.

        Prints the AUC, then a summary line with the elapsed seconds since
        construction.

        Returns
        -------
        tuple
            ``(accuracy, AUC, f1, transition_model, rule_model)``.
        """
        n_func = self.neighbor_function(self.X_val, self.v_idx, self.w_val)
        preds = self.rm.predict(n_func)
        acc = accuracy_score(self.Y_val, preds)
        # NOTE(review): AUC is computed from hard class predictions rather
        # than scores/probabilities; kept as-is so results stay comparable
        # with earlier runs.
        AUC = roc_auc_score(self.Y_val, preds)
        print(AUC)
        f1 = f1_score(self.Y_val, preds)
        print("acc: {:.3f}, AUC: {:.3f}, f1: {:.3f}\n".format(
            acc, AUC, f1), time.time() - self.t0)

        return acc, AUC, f1, self.tm, self.rm
        

In [None]:
# Random search over SVC hyper-parameters for the transition model; the rule
# model is a fixed-depth random forest. One result row is appended per trial.
SEED = 42                      # fixed so the sampled configurations are reproducible
N_TRIALS = 50
RESULTS_PATH = "../results/SVC/results.csv"

c = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly']
max_iter = [1000,10000, 100000]

rng = np.random.default_rng(SEED)


def _pick(options):
    """Return one uniformly random element of ``options`` as a plain Python value."""
    return options[rng.integers(len(options))]


def _one_line(model):
    """Collapse a model repr onto one line so each trial stays a single CSV row."""
    return " ".join(repr(model).split())


# Make sure the output directory exists before spending hours on fitting.
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

for _ in range(N_TRIALS):
    tm = SVC(C = _pick(c), gamma = _pick(gamma), kernel = _pick(kernel),
             max_iter = _pick(max_iter), probability = True, random_state = SEED)
    rm = RandomForestClassifier(max_depth = 10, random_state = SEED)
    gs = grid_searcher(tm,rm, X_train, Y_train, X_val, Y_val, wt, wv)

    gs.transition_fit()
    print("fit")
    gs.neighbor_fit()

    acc, auc, f1, fitted_tm, fitted_rm = gs.val()
    # csv.writer quotes the model reprs, which contain commas. The previous
    # str(tuple).strip(")") form also cut the closing parenthesis off the
    # last model repr.
    with open(RESULTS_PATH, "a", newline="") as f:
        csv.writer(f).writerow([acc, auc, f1, _one_line(fitted_tm), _one_line(fitted_rm)])

fit
0.5889280999162858
acc: 0.406, AUC: 0.589, f1: 0.177
 12733.029461622238




fit
0.576894508995548
acc: 0.590, AUC: 0.577, f1: 0.179
 10683.426564693451




fit
0.5998787649141419
acc: 0.638, AUC: 0.600, f1: 0.196
 321.2061424255371




fit
0.6004601029978823
acc: 0.708, AUC: 0.600, f1: 0.205
 13172.296329975128




fit
0.5652860957626223
acc: 0.483, AUC: 0.565, f1: 0.169
 755.4863419532776




fit
0.5864461310493099
acc: 0.538, AUC: 0.586, f1: 0.181
 266.63809537887573




fit
0.5868002545062788
acc: 0.560, AUC: 0.587, f1: 0.182
 264.84015917778015




fit
0.5812231668751503
acc: 0.526, AUC: 0.581, f1: 0.178
 163.0468544960022




fit
0.5694365368594595
acc: 0.511, AUC: 0.569, f1: 0.172
 1046.8675236701965




fit
0.5614863938872822
acc: 0.545, AUC: 0.561, f1: 0.169
 140.20876359939575
