In [2]:
import pandas as pd
import geopandas as gpd
from scipy.spatial import distance
import numpy as np
import arviz as az
import xarray as xr
import pystan
import nest_asyncio
import matplotlib.pyplot as plt

In [20]:
shapefile_path = '/content/drive/MyDrive/stat836-final-project/neighborhoods'
sf_neighborhoods = gpd.read_file(shapefile_path)

# SF case and population data
sf_cases = pd.read_csv('/content/drive/MyDrive/stat836-final-project/master_merged.csv', index_col=False)
sf_cases.rename(columns={'Neighborhood': 'nhood'}, inplace=True)

In [21]:
sf_cases = sf_cases.drop(41)

# Calculating distance matrix
sf_neighborhoods_projected = sf_neighborhoods.to_crs(epsg=32610)
sf_neighborhoods_projected['centroid'] = sf_neighborhoods_projected.geometry.centroid

neighborhood_x = sf_neighborhoods_projected['centroid'].geometry.x
neighborhood_y = sf_neighborhoods_projected['centroid'].geometry.y

neighborhood_xy = np.column_stack((neighborhood_x, neighborhood_y))

distance_matrix = distance.cdist(neighborhood_xy, neighborhood_xy, 'euclidean')
distance_matrix /=  1000.0

# Compute weight matrix based off of if they are under 20% quantile
quantile_20 = np.quantile(distance_matrix, 0.2)
weight_matix = (distance_matrix < quantile_20).astype(int)
weight_matix

array([[1, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [22]:
sf_neighborhoods = pd.merge(sf_neighborhoods, sf_cases, on = 'nhood', how = 'left')

sf_neighborhoods['Homeless Cases'] = sf_neighborhoods['Homeless Cases'].replace(0,2)
sf_neighborhoods['Homeless Cases']

0     1238.0
1      101.0
2       13.0
3        2.0
4     4339.0
5        2.0
6      359.0
7        2.0
8      437.0
9      118.0
10     131.0
11     243.0
12     263.0
13     417.0
14      78.0
15     139.0
16     130.0
17     185.0
18       6.0
19       2.0
20      79.0
21     369.0
22    2625.0
23      86.0
24     434.0
25      51.0
26    2047.0
27       2.0
28      44.0
29     125.0
30    1163.0
31     312.0
32     756.0
33     397.0
34      45.0
35       2.0
36      66.0
37      83.0
38     620.0
39     287.0
40     619.0
Name: Homeless Cases, dtype: float64

In [24]:
SF_code = """
data {
    int n;
    int y[n];
    vector[n] logEi;
    vector<lower=0>[n] income;
    vector<lower=0>[n] homevalue;
    vector<lower=0>[n] poverty;
    vector<lower=0>[n] unemployed;
    vector<lower=0>[n] education;
    matrix<lower=0>[n,n] wmat;
}
parameters {
    vector[n] theta;
    vector[n] u;
    real beta1;
    real beta3;
    real beta4;
    real beta5;
    real beta6;
    real<lower=0> sigma2_u;
    real<lower=0> sigma2_v;
    real<lower=0> lambda;
}
model {
    lambda ~ gamma(2, 1);
    sigma2_u ~ inv_gamma(0.0005, 0.5);
    sigma2_v ~ inv_gamma(0.0005, 0.5);
    beta1 ~ double_exponential(0, lambda);
    beta3 ~ double_exponential(0, lambda);
    beta4 ~ double_exponential(0, lambda);
    beta5 ~ double_exponential(0, lambda);
    beta6 ~ double_exponential(0, lambda);

    target += -0.5 * n * log(sigma2_u);
    for (i in 1:n) {
        for (j in 1:n) {
            target += -0.5 * (u[i] - u[j])^2 * wmat[i, j] / sigma2_u;
        }
    }

    for (i in 1:n) {
        theta[i] ~ normal(logEi[i] + beta1 * income[i] + beta3 * homevalue[i] + beta4 * poverty[i] + beta5 * unemployed[i] + beta6 * education[i] + u[i], sqrt(sigma2_v));
        y[i] ~ poisson(exp(theta[i]));
    }
}
"""

In [23]:
data = {
    'n': 41,
    'y': np.array(sf_neighborhoods['Homeless Cases']).astype(int).tolist(),
    'logEi': sf_neighborhoods['Log Expected Cases'].tolist(),
    'income': sf_neighborhoods['Median Household Income'].tolist(),
    #'foreign': sf_neighborhoods['Foreign Born'].tolist(),
    'homevalue': sf_neighborhoods['Median Home Value'].tolist(),
    'poverty': sf_neighborhoods['Percent in Poverty'].tolist(),
    'unemployed': sf_neighborhoods['A_Unemployment Rate'].tolist(),
    'education': sf_neighborhoods['Bachelor\'s degree or higher'].tolist(),
    'wmat': weight_matix.tolist()
}

In [25]:
sm = pystan.StanModel(model_code=SF_code)

In [None]:
fit = sm.sampling(data=data, chains=1, iter=20000, warmup=7500, control={'max_treedepth': 14, 'adapt_delta': 0.95})

In [None]:
idata = az.from_pystan(fit)

az.plot_trace(idata, var_names=['theta'])
plt.show()

az.plot_trace(idata, var_names=['beta1'])
plt.show()

az.plot_trace(idata, var_names=['beta3'])
plt.show()

az.plot_trace(idata, var_names=['beta4'])
plt.show()

az.plot_trace(idata, var_names=['beta5'])
plt.show()

az.plot_trace(idata, var_names=['beta6'])
plt.show()

az.plot_trace(idata, var_names=['sigma2_u'])
plt.show()

az.plot_trace(idata, var_names=['sigma2_v'])
plt.show()

In [None]:
samples = fit.extract(permuted=True)
new_samples = {}


for key, value in samples.items():
    if len(value.shape) > 1:
        for i in range(value.shape[1]):
            new_samples[f"{key}_{i}"] = value[:, i]
    else:
        new_samples[key] = value

samples.update(new_samples)

keys_to_remove = [key for key, value in samples.items() if isinstance(value, np.ndarray) and len(value.shape) > 1]
for key in keys_to_remove:
    del samples[key]

samples_df = pd.DataFrame(samples)

In [19]:
samples_df.to_csv('model_results/bayesian_lasso_sample.csv', index=False)

In [3]:
samples_df = pd.read_csv('model_results/bayesian_lasso_sample.csv')

In [4]:
betas = ['beta1', 'beta3', 'beta4', 'beta5', 'beta6']

results = {}
for beta in betas:
    mean = samples_df[beta].mean()
    ci_lower = np.percentile(samples_df[beta], 2.5)
    ci_upper = np.percentile(samples_df[beta], 97.5)
    results[beta] = {'mean': mean, '95% CI': (ci_lower, ci_upper)}

for beta, vals in results.items():
    print(f"{beta}: Mean = {vals['mean']}, 95% Credible Interval = {vals['95% CI']}")

beta1: Mean = -1.1128381086499884e-05, 95% Credible Interval = (-2.2218918466174046e-05, 5.8445346391411605e-08)
beta3: Mean = -2.3001572869524977e-06, 95% Credible Interval = (-3.4370202551780097e-06, -1.186770108413141e-06)
beta4: Mean = 0.6988388447431003, 95% Credible Interval = (0.6904819125459664, 0.7056114294615995)
beta5: Mean = 0.0004671618886451492, 95% Credible Interval = (-0.0009177838598376849, 0.0017800986824795873)
beta6: Mean = -1.3445910824247778, 95% Credible Interval = (-1.3571435706877035, -1.3295380809817936)


In [9]:
original_width = -3.77e-06 + 9.53e-07
bayesian_lasso_width = -3.4370e-06 + 1.1868e-06

original_width, bayesian_lasso_width, original_width - bayesian_lasso_width


(-2.817e-06, -2.2502e-06, -5.667999999999998e-07)