In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import openTSNE

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torchvision


import sys
sys.path.append("../scripts")
from data import SmallPatchesDataset, get_filenames_center_blocks
from models import AutoEncoderResnetExtractor, DEC
import preprocessing_census_blocks

# autoreload
%load_ext autoreload
%autoreload 2

## Loading and preprocessing data

In [21]:
n_cities_used = 100

# remove patches that are have small intersection with block
intersection_threshold = 0.25

# remove patches from blocks that have more than a threshold 
patches_count_max = 100

In [22]:
blocks_df = pd.read_csv("../data/blocks_patches_relation.csv")
blocks_df["mhi"] = blocks_df["mhi"].apply(lambda x: np.nan if x < 0 else x)
blocks_df = blocks_df.dropna()

In [23]:
# replace negative mhi with nan
from ast import literal_eval

In [24]:
blocks_df["clean_patches_relation"] = blocks_df.patches_relation.apply(literal_eval)

In [25]:
blocks_df["n_patches"] = blocks_df.clean_patches_relation.apply(lambda x : len(x))
blocks_df = blocks_df[blocks_df.n_patches > 0]

In [26]:
def clean_patches_relation(s):
    filenames = []
    data = []
    for key, value in s.items():
        value = np.array(value)
        value = value[value[:, 1] > intersection_threshold, :]
        for i in range(len(value)):
            data.append(value[i, :])
            filenames.append(key)
    data = np.array(data)

    if len(filenames) > patches_count_max:
        selected = np.random.choice(
            len(filenames),
            size=patches_count_max,
            replace=False,
            p=data[:, 1] / data[:, 1].sum(),
        )
        data = data[selected, :]
        filenames = [filenames[i] for i in selected]
    return [f"{filenames[i]} {int(data[i, 0])}" for i in range(len(filenames))]

In [27]:
blocks_df["clean_patches_relation"] = blocks_df.clean_patches_relation.apply(
    clean_patches_relation
)

In [28]:
blocks_df.clean_patches_relation.values[0]

['seattle_wa/2862468_m_4712223_nw_10_060_20191011_1197.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1009.png 1',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1159.png 3',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1313.png 0',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1162.png 1',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1084.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1123.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1121.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1276.png 3',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1161.png 1',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1047.png 0',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1160.png 3',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1161.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_971.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1314.png 2',
 'seattle_wa/2862468_m_4712223_nw_10_060_20191011_1314.p

In [3]:
blocks_df = gpd.read_file("../data/census_blocks_patches_v2.geojson")

# filtering to test with some blocks, selecting the cities with the biggest number of blocks
biggest_blocks = blocks_df.groupby(["state", "county"]).agg("count").reset_index().iloc[:, 0:3].sort_values("tract", ascending = False).head(n_cities_used)
blocks_df = blocks_df[(blocks_df.state.isin(biggest_blocks.state) & blocks_df.county.isin(biggest_blocks.county))]

# cleaning blocks with missing data
blocks_df = blocks_df[blocks_df.mhi > 0]
blocks_df = blocks_df.dropna()
blocks_df = blocks_df[blocks_df.patches_relation.apply(len) > 0]

blocks_df["area_km2"] = blocks_df['geometry'].to_crs({'proj':'cea'}).area / 10**6
blocks_df["density"] = blocks_df["pop"] / blocks_df["area_km2"]

In [4]:
blocks_id = preprocessing_census_blocks.get_locations_info()
blocks_id = blocks_id.merge(
    preprocessing_census_blocks.get_states_codes(),
    left_on = "state",
    right_on = "state_abbr",
)
blocks_id["state"] = blocks_id["state_x"]
blocks_id = blocks_id.drop(["state_x", "state_y", "year", "state_abbr"], axis = 1)
blocks_id["city_state"] = blocks_id["city"] + "_" + blocks_id["state"]
blocks_df["state_abbr"] = blocks_df["state"].apply(lambda x : blocks_id[blocks_id.state_code == x].state.values[0])
blocks_df["city_name"] = blocks_df.apply(lambda x : blocks_id[(blocks_id.state_code == x.state) & (blocks_id.county_codes.str.find(x.county) >= 0)].city.values[0], axis = 1)
blocks_df["city_state"] = blocks_df["city_name"] + "_" + blocks_df["state_abbr"]

In [5]:
def clean_patches_relation(s):
    s = s.split("\n")
    s = dict([x.split(":") for x in s])
    filenames = []
    data = []
    for key, value in s.items():
        value = value.split(" ")
        idx = np.array([float(v) for v in value[0].split(",")])
        ratio = np.array([float(v) for v in value[1].split(",")])
        idx = idx[ratio > intersection_threshold]
        ratio = ratio[ratio > intersection_threshold]
        for i in range(len(idx)):
            data.append([idx[i], ratio[i]])
            filenames.append(key)
    data = np.array(data)
    if len(filenames) > patches_count_max:
        selected = np.random.choice(
            len(filenames),
            size=patches_count_max,
            replace=False,
            p=data[:, 1] / data[:, 1].sum(),
        )
        data = data[selected, :]
        filenames = [filenames[i] for i in selected]
    return [filenames, data]

blocks_df["clean_patches_relation"] = blocks_df.patches_relation.apply(
    clean_patches_relation
)
blocks_df["n_patches"] = blocks_df["clean_patches_relation"].apply(
    lambda x: x[1].shape[0]
)
blocks_df = blocks_df[blocks_df.n_patches > 0]

In [6]:
blocks_df = blocks_df.reset_index(drop = True)

In [7]:
patches_blocks = {}
for i, row in blocks_df.iterrows():
    relation_list = row["patches_relation"].strip(" ").split(" ")
    relation_list = [x.split(";") for x in relation_list]
    relation_list = row["clean_patches_relation"][0]
    idx = row["clean_patches_relation"][1][:, 0]
    files = [f"{relation_list[j]} {int(idx[j])}" for j in range(len(relation_list))]
    for file in files:
        if file in patches_blocks.keys():
            patches_blocks[file].append(i)
        else:
            patches_blocks[file] = [i]
print(f"Number of unique patches: {len(patches_blocks.keys())}")

Number of unique patches: 3655529


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoEncoderResnetExtractor(128)
model.load_state_dict(torch.load("../models/AE_extractor_resnet50_small_128/model.pt"))
model.to(device);

### Clustering a subset of the data

In [9]:
filenames_subset = get_filenames_center_blocks()
dataset_subset = SmallPatchesDataset(filenames_subset, resize = (224, 224))
dl_subset = torch.utils.data.DataLoader(dataset_subset, batch_size = 300)

In [10]:
embeddings = []
with torch.no_grad():
    for x in tqdm(dl_subset):
        x = x.to(device)
        embeddings.append(model.encoder(x).cpu().numpy())
embeddings = np.concatenate(embeddings)

  0%|          | 0/1789 [00:00<?, ?it/s]

100%|██████████| 1789/1789 [35:27<00:00,  1.19s/it]


In [11]:
k = 50
kmeans = KMeans(
    n_clusters = k, 
    random_state = 0, 
    n_init = 20
).fit(embeddings)


### Clustering the dataset

In [12]:
filenames = list(patches_blocks.keys())
dataset = SmallPatchesDataset(filenames, resize = (224, 224))
dl = torch.utils.data.DataLoader(dataset, batch_size = 300)

In [13]:
def get_clusters(dl, model, kmeans):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    clusters = []
    clusters_distance = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dl):
            batch = batch.to(device)
            z = model.encoder(batch).detach().cpu().numpy()
            c = kmeans.predict(z)
            d = kmeans.transform(z)
            clusters.append(c)
            clusters_distance.append(d)

    clusters = np.concatenate(clusters)
    clusters_distance = np.concatenate(clusters_distance)
    return clusters, clusters_distance


In [14]:
clusters, clusters_distance = get_clusters(dl, model, kmeans)

  0%|          | 0/12186 [00:00<?, ?it/s]

100%|██████████| 12186/12186 [3:29:14<00:00,  1.03s/it] 


## Functions to generate block features

In [28]:
# functions to generate features from model output
def count_of_patches_cluster():
    x = np.zeros((blocks_df.shape[0], k))
    for i, (file, cluster) in enumerate(zip(filenames, clusters)):
        for b in patches_blocks[file]:
            x[b, cluster] += 1
    x = pd.DataFrame(x, columns = [f"cluster_{i}" for i in range(k)])
    x = x.loc[:, x.sum(axis = 0) > 0]
    return x

def fraction_of_patches_cluster():
    x = np.zeros((blocks_df.shape[0], k))
    for i, (file, cluster) in enumerate(zip(filenames, clusters)):
        for b in patches_blocks[file]:
            x[b, cluster] += 1
    x_sum = x.sum(axis = 1)
    x = x / x_sum[:, None]
    x = pd.DataFrame(x, columns = [f"cluster_{i}" for i in range(k)])
    x = x.loc[:, x.sum(axis = 0) > 0]
    x["count"] = x_sum
    return x

def fraction_of_patches_cluster_coords():
    x = np.zeros((blocks_df.shape[0], k))
    for i, (file, cluster) in enumerate(zip(filenames, clusters)):
        for b in patches_blocks[file]:
            x[b, cluster] += 1
    x_sum = x.sum(axis = 1)
    x = x / x_sum[:, None]
    x = pd.DataFrame(x, columns = [f"cluster_{i}" for i in range(k)])
    x = x.loc[:, x.sum(axis = 0) > 0]
    x["count"] = x_su
    x["x"] = blocks_df.geometry.centroid.x
    x["y"] = blocks_df.geometry.centroid.y
    return x

def distance_of_patches_cluster():
    x = np.zeros((blocks_df.shape[0], k))
    for i, (file, distances) in enumerate(zip(filenames, clusters_distance)):
        for b in patches_blocks[file]:
            x[b] += distances
    x_sum = blocks_df.n_patches.values.reshape(-1)
    x = x / x_sum[:, None]
    x = pd.DataFrame(x, columns = [f"cluster_{i}" for i in range(k)])
    x = x.loc[:, x.sum(axis = 0) > 0]
    x["count"] = x_sum
    return x

def distance_of_patches_cluster_coords():
    x = np.zeros((blocks_df.shape[0], k))
    for i, (file, distances) in enumerate(zip(filenames, clusters_distance)):
        for b in patches_blocks[file]:
            x[b] += distances
    x_sum = blocks_df.n_patches.values.reshape(-1)
    x = x / x_sum[:, None]
    x = pd.DataFrame(x, columns = [f"cluster_{i}" for i in range(k)])
    x = x.loc[:, x.sum(axis = 0) > 0]
    x["count"] = x_sum
    x["x"] = blocks_df.geometry.centroid.x
    x["y"] = blocks_df.geometry.centroid.y
    return x

def get_x(method):
    if method == "count":
        return count_of_patches_cluster()
    elif method == "fraction":
        return fraction_of_patches_cluster()
    elif method == "fraction_coords":
        return fraction_of_patches_cluster_coords()
    elif method == "distance":
        return distance_of_patches_cluster()
    elif method == "distance_coords":
        return distance_of_patches_cluster_coords()

## Exploratory analysis

## Training all the models

In [16]:
def eval(clf, x_train, y_train, x_test, y_test):
    r2_train = r2_score(y_train, clf.predict(x_train))
    r2_test = r2_score(y_test, clf.predict(x_test))
    return r2_train, r2_test

In [17]:
def grid_search_rf(x_train, y_train, x_test, y_test):
    rf = RandomForestRegressor()
    parameters = {
        "n_estimators": [10, 100, 1000],
        "max_depth": [10, 25],
        #"min_samples_split": [2, 10, 100],
    }
    clf = GridSearchCV(rf, parameters, n_jobs=-1)
    clf.fit(x_train, y_train)
    return eval(clf, x_train, y_train, x_test, y_test)

In [18]:
class MLP(nn.Module):
    def __init__(self, dims):
        super(MLP, self).__init__()
        self.layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            self.layers.append(nn.Linear(in_dim, out_dim))
            if out_dim != dims[-1]:
                self.layers.append(nn.ReLU())
        self.layers = nn.Sequential(*self.layers)

    def forward(self, x):
        return self.layers(x)

    def predict(self, x):
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.eval()
        if type(x) == pd.DataFrame:
            x_ = torch.from_numpy(x.values)
        elif type(x) == np.ndarray:
            x_ = torch.from_numpy(x)
            
        with torch.no_grad():
            x_ = x_.to(device)
            y = self.layers(x_)
            return y.detach().cpu().numpy()


In [19]:

def train_mlp(model, dl_train, dl_test):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    test_loss = []
    for i in range(100):
        iter_loss = 0
        for x, y in dl_train:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iter_loss += loss.item()

        if i % 3 == 0:
            iter_loss = 0
            with torch.no_grad():
                for x, y in dl_test:
                    x, y = x.to(device), y.to(device)
                    y_pred = model(x)
                    loss = criterion(y_pred, y)
                    iter_loss += loss.item()
                test_loss.append(iter_loss)

            if i > 10 and test_loss[-1] > test_loss[-2]:
                break        

def grid_search_mlp(x_train, y_train, x_test, y_test):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    idx_train_, idx_val = train_test_split(np.arange(x_train.shape[0]), test_size = 0.2, random_state = 0)
    x_val_, y_val_ = x_train.values[idx_val, :], y_train[idx_val]
    x_train_, y_train_ = x_train.values[idx_train_, :], y_train[idx_train_]
    x_test_, y_test_ = x_test.values, y_test
    scaler = StandardScaler()
    x_train_ = scaler.fit_transform(x_train_)
    x_val_ = scaler.transform(x_val_)
    x_test_ = scaler.transform(x_test_)
    dl_train_ = DataLoader(TensorDataset(torch.tensor(x_train_), torch.tensor(y_train_.reshape(-1, 1))), batch_size = 128)
    dl_val = DataLoader(TensorDataset(torch.tensor(x_val_), torch.tensor(y_val_).reshape(-1, 1)), batch_size = 128)

    best_r2 = -np.inf
    best_model = None
    for dims in [[x_train.shape[1], 64, 32, 1], [x_train.shape[1], 64, 256, 32, 1], [x_train.shape[1], 64, 512, 128, 1]]:
        model_1 = MLP(dims)
        model_1.to(device, dtype = torch.double)
        train_mlp(model_1, dl_train_, dl_val)
        r2_train, r2_test = eval(model_1, x_train_, y_train_, x_val_, y_val_)

        if r2_test > best_r2:
            best_r2 = r2_test
            best_model = model_1
    
    return eval(best_model, x_train_, y_train_, x_test_, y_test_)

In [None]:
idx_train, idx_test = train_test_split(np.arange(blocks_df.shape[0]), test_size = 0.2, random_state = 0)
df_results = []
for method in ["count", "fraction", "fraction_coords", "distance", "distance_coords"]:
    x = get_x(method)
    x_train, x_test = x.loc[idx_train, :], x.loc[idx_test, :]
    for target in ["mhi", "ed_attain", "density"]:
        y_train, y_test = blocks_df[target].values[idx_train], blocks_df[target].values[idx_test] 
        
        for regression in ["mlp", "rf"]:
            if regression == "rf":
                r2_train, r2_test = grid_search_rf(x_train, y_train, x_test, y_test)
            elif regression == "mlp":
                r2_train, r2_test = grid_search_mlp(x_train, y_train, x_test, y_test)

            print(f"method: {method}, target: {target}, regression: {regression}, r2_train: {r2_train:.2f}, r2_test: {r2_test:.2f}")
            df_results.append([method, target, regression, r2_train, r2_test])

In [30]:
pd.DataFrame(
    df_results, 
    columns = ["method", "target", "regression", "r2_train", "r2_test"]
).sort_values("r2_test", ascending = False).groupby(["target"]).head(1)

Unnamed: 0,method,target,regression,r2_train,r2_test
17,fraction_coords,density,rf,0.846018,0.762669
15,fraction_coords,ed_attain,rf,0.882434,0.564365
13,fraction_coords,mhi,rf,0.855794,0.412117


In [31]:
pd.DataFrame(
    [x for x in df_results if not "coords" in x[0]], 
    columns = ["method", "target", "regression", "r2_train", "r2_test"]
).sort_values("r2_test", ascending = False).groupby(["target"]).head(1)

Unnamed: 0,method,target,regression,r2_train,r2_test
17,distance,density,rf,0.85455,0.725477
14,distance,ed_attain,mlp,0.335202,0.23367
13,distance,mhi,rf,0.839111,0.178135
