In [1]:
import os

# GeoPandas decides between PyGEOS and Shapely when it is first imported, so
# USE_PYGEOS has to be in the environment *before* any `import geopandas`.
# Setting it afterwards has no effect and the PyGEOS deprecation warning is
# still emitted.
os.environ['USE_PYGEOS'] = '0'

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import geopandas
import geopandas as gpd

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Specify the working directory. The relative 'res/...' output paths used when
# saving results are resolved against it; presumably the local `terms` and
# `gass` modules are expected to live here as well.
# NOTE(review): '.../replication_codes' looks like a placeholder — replace the
# leading '...' with the real parent path before running (TODO confirm).
os.chdir('.../replication_codes')

In [3]:
# Naive approach
import statsmodels.api as sm

# GASS CDIOM
from terms import ConstantTerm, LinearTerm, SATerm, DIOTerm
from gass import CDIOM

# record running time
import datetime

# Generate points distributed in clusters

In [381]:
# for generating point patterns
from pointpats import PoissonClusterPointProcess, as_window
from libpysal.cg import Point, Polygon 

## Repeated simulations

In [456]:
# Per-seed result accumulators: every replicate appends one entry to each list.
# The trailing comments give the column order of the final result tables.

# Basic gravity model (naive GLM)
bgm_coefs = list()  # k, rho, nu, beta

# CD models: naive GLM / data-driven
ncdm_coefs = list()  # k, rho, nu, beta, delta
gcdm_coefs, gcdm_sigmas, gcdm_awci = [], [], []  # coefs: k, rho, nu, beta, delta; awci: lower, upper

# IO models: naive GLM / data-driven
niom_coefs = list()  # k, rho, nu, beta, gamma
giom_coefs, giom_sigmas, giom_awci = [], [], []  # coefs: k, rho, nu, beta, gamma; awci: lower, upper

# CDIO models: naive GLM / data-driven
ncdiom_coefs = list()  # k, rho, nu, beta, delta, gamma
gcdiom_coefs, gcdiom_sigmas, gcdiom_awci = [], [], []  # coefs: k, rho, nu, beta, delta, gamma; awci: lower1, upper1, lower2, upper2

# 500 x 500 square study window in which the point patterns are generated.
square = Polygon([Point(xy) for xy in ((0, 0), (0, 500), (500, 500), (500, 0))])
squwin = as_window(square)

for seed in np.arange(100):
    # Re-seed NumPy's global RNG so every replicate is reproducible from its seed.
    np.random.seed(seed)

    # Clustered (not CSR) pattern: Poisson cluster point process in the square
    # window, drawn with conditioning=False. Per the pointpats signature the
    # positional arguments are presumably (window, n, parents, radius, samples)
    # — TODO confirm against the installed version.
    n_points = 49
    squcsamples = PoissonClusterPointProcess(squwin, n_points, 5, 50, 1, asPP=False, conditioning = False)
    squpts_cluster = squcsamples.realizations[0]

    # Create DataFrame with zero-padded location labels: 'U00', 'U01', ...
    clu_df = pd.DataFrame(squpts_cluster, columns = ['Lon', 'Lat'])
    clu_df = clu_df.assign(Name = np.arange(clu_df.shape[0]))
    digits = len(str(n_points))
    clu_df.Name = 'U' + clu_df.Name.astype(str).str.zfill(digits)

    # Add attributes, X1 (random integers in [500, 500000]).
    # The RNG is deliberately reset to the same seed, as in the original design.
    np.random.seed(seed)
    attr1 = np.random.randint(500, 500001, size=n_points)
    clu_df = clu_df.assign(X1 = attr1)

    # Create GeoDataFrame
    clu_gdf = gpd.GeoDataFrame(clu_df.copy(), geometry=gpd.points_from_xy(clu_df.Lon, clu_df.Lat))

    # Calculate Euclidean distance between every pair of locations.
    clu_lon_lat = np.column_stack((clu_gdf.Lon.to_numpy(), clu_gdf.Lat.to_numpy()))
    clu_dist_mat = np.linalg.norm(clu_lon_lat - clu_lon_lat[:,None], axis=-1)
    # Drop the diagonal by position rather than by `!= 0`: a value test would also
    # discard any coincident pair of points and misalign distances with the OD codes.
    off_diagonal = ~np.eye(clu_dist_mat.shape[0], dtype=bool)
    clu_dists = clu_dist_mat[off_diagonal].reshape((-1,1))

    # Create Distance DataFrame: one row per ordered pair (origin != destination),
    # in the same row-major order as the flattened distance matrix.
    places = clu_df.Name.values.reshape((-1,1))
    orig = np.repeat(places, len(places)).reshape((-1,1))
    dest = np.tile(places, (len(places),1))
    codes_df = pd.DataFrame(np.column_stack((orig, dest)))
    mask = codes_df.iloc[:,0] != codes_df.iloc[:,1]
    codes = orig + dest
    codes = codes[mask].reshape((-1,1))
    # Stacking string codes next to floats means 'dist' is not stored as float64
    # here; the model sections convert it back with pd.to_numeric.
    clu_dist_df = pd.DataFrame(np.column_stack((codes, clu_dists)), columns = ['Code', 'dist'])

    # Create Simulated Flow Data: split each code into its origin / destination label.
    simul = pd.DataFrame(codes, columns = ['Code'])
    simul = simul.assign(Origin = simul.Code.str[:digits+1], Destination = simul.Code.str[digits+1:])

    # Attach X1 of the origin as pop_o ...
    simul = simul.join(clu_df.set_index(clu_df.Name)[['Name','X1']], how = 'left', on = 'Origin')
    simul = simul.drop(['Name'], axis = 1)
    simul = simul.rename(columns={"X1": "pop_o"})

    # ... and X1 of the destination as pop_d.
    simul = simul.join(clu_df.set_index(clu_df.Name)[['Name','X1']], how = 'left', on = 'Destination')
    simul = simul.drop(['Name'], axis = 1)
    simul = simul.rename(columns={"X1": "pop_d"})

    # Column order is now Code, Origin, Destination, pop_o, pop_d, dist.
    clu_simul = simul.assign(dist = clu_dist_df.dist)

    # Terms of the data-generating process. LinearTerm is given column positions
    # 3, 4, 5 (pop_o, pop_d, dist in the frame built above).
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log = True, standard = True)

    sa_pop_sd = SATerm(od_data = clu_simul, dest_data = clu_gdf,
                o_ids = 'Origin', d_ids = 'Destination',
                dest_ids = 'Name', dest_attr = 'X1',
                log = True, standard = True)

    # SA term evaluated at the parameter used to generate the data (-0.5).
    spop_sa_sd = sa_pop_sd.cal(-0.5)

    dio_pop_sd = DIOTerm(od_data = clu_simul, orig_data = clu_gdf,
                  o_ids = 'Origin', d_ids = 'Destination',
                  orig_ids = 'Name', orig_attr = 'X1',
                  log = True, standard = True)

    # DIO term evaluated at the parameter used to generate the data (-2).
    spop_dio_sd = dio_pop_sd.cal(-2)

    # Coefficients applied to the stacked design columns; there is no constant
    # column (assumes LinearTerm contributes three columns — TODO confirm).
    coefs = np.array([[1], [1], [-1], [-1], [-1]])
    clu_simul_X = np.hstack((clu_simul_lin_sd.X, spop_sa_sd, spop_dio_sd))

    # Poisson flows drawn around the exponentiated linear predictor.
    clu_simul_y_mean = np.exp(np.dot(clu_simul_X, coefs)).reshape((-1,1))
    np.random.seed(seed)
    clu_simul_y = np.random.poisson(lam=clu_simul_y_mean).flatten()

    # Numeric (n, 1) column vectors reused by the naive GLM sections below.
    pop_origin = pd.to_numeric(clu_simul.pop_o.values).reshape((-1,1))
    pop_destination = pd.to_numeric(clu_simul.pop_d.values).reshape((-1,1))
    
    # BG model: naive Poisson GLM on standardized log regressors
    # (origin attribute, destination attribute, distance).
    bg_dist = pd.to_numeric(clu_simul.dist.values).reshape((-1,1))
    bg_log = np.log(np.column_stack((pop_origin, pop_destination, bg_dist)))
    bg_z = (bg_log - bg_log.mean(axis=0)) / bg_log.std(axis=0)

    # The constant is appended last (prepend=False), so the fitted parameters
    # come out with the intercept in the final position.
    bg_design = sm.add_constant(pd.DataFrame(bg_z), prepend=False)

    bgm = sm.GLM(clu_simul_y, bg_design, family=sm.families.Poisson()).fit()
    bgm_coefs.append(bgm.params.values)
    
    # Conventional CD model: naive Poisson GLM where the SA term is evaluated
    # once at a fixed parameter of -1 (raw scale; logged/standardized below).
    cd_sa_term = SATerm(
        od_data = clu_simul, dest_data = clu_gdf,
        o_ids = 'Origin', d_ids = 'Destination',
        dest_ids = 'Name', dest_attr = 'X1',
        log = False, standard = False,
    )
    cd_sa = cd_sa_term.cal(-1)

    cd_dist = pd.to_numeric(clu_simul.dist.values).reshape((-1,1))

    # Regressor order: origin attribute, destination attribute, distance, SA term.
    cd_log = np.log(np.concatenate((pop_origin, pop_destination, cd_dist, cd_sa), axis=1))
    cd_z = (cd_log - cd_log.mean(axis=0)) / cd_log.std(axis=0)

    # Intercept column goes last (prepend=False).
    cd_design = sm.add_constant(pd.DataFrame(cd_z), prepend=False)

    ncdm = sm.GLM(clu_simul_y, cd_design, family=sm.families.Poisson()).fit()
    ncdm_coefs.append(ncdm.params.values)

    # Data-driven CD model: the SA term is handed to the GASS model unevaluated,
    # and the fit reports its estimated parameter (`sigmas`) with an interval
    # (`AWCI_sigmas`).
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log = True, standard = True)

    sa_pop_sd = SATerm(od_data = clu_simul, dest_data = clu_gdf,
                   o_ids = 'Origin', d_ids = 'Destination',
                   dest_ids = 'Name', dest_attr = 'X1',
                   log = True, standard = True)

    # The notebook only imports `CDIOM` from `gass`; the previously used name
    # `CDM` is never imported or defined and raises NameError on a fresh kernel.
    # NOTE(review): assumes CDIOM accepts a single structure term — TODO confirm.
    gcdm = CDIOM(clu_simul_y, clu_simul_lin_sd, sa_pop_sd, constant = True)
    gcdm.fit_Poisson(printed = False, verbose = False)
    gcdm.inference_Poisson()
    gcdm.calculate_AWCI_sigmas()

    # Store flattened results for this seed.
    gcdm_coefs.append(gcdm.coefficients.flatten())
    gcdm_sigmas.append(np.array(gcdm.sigmas).flatten())
    gcdm_awci.append(np.array(gcdm.AWCI_sigmas).flatten())
    
    # Conventional IO model: naive Poisson GLM where the DIO term is evaluated
    # once at a fixed parameter of -1 (raw scale; logged/standardized below).
    io_dio_term = DIOTerm(
        od_data = clu_simul, orig_data = clu_gdf,
        o_ids = 'Origin', d_ids = 'Destination',
        orig_ids = 'Name', orig_attr = 'X1',
        log = False, standard = False,
    )
    io_dio = io_dio_term.cal(-1)

    io_dist = pd.to_numeric(clu_simul.dist.values).reshape((-1,1))

    # Regressor order: origin attribute, destination attribute, distance, DIO term.
    io_log = np.log(np.concatenate((pop_origin, pop_destination, io_dist, io_dio), axis=1))
    io_z = (io_log - io_log.mean(axis=0)) / io_log.std(axis=0)

    # Intercept column goes last (prepend=False).
    io_design = sm.add_constant(pd.DataFrame(io_z), prepend=False)

    # Fit the Poisson GLM of IOM
    niom = sm.GLM(clu_simul_y, io_design, family=sm.families.Poisson()).fit()
    niom_coefs.append(niom.params.values)
    
    # Data-driven IO model: the DIO term is handed to the GASS model unevaluated,
    # and the fit reports its estimated parameter (`sigmas`) with an interval
    # (`AWCI_sigmas`).
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log = True, standard = True)

    dio_pop_sd = DIOTerm(od_data = clu_simul, orig_data = clu_gdf,
                  o_ids = 'Origin', d_ids = 'Destination',
                  orig_ids = 'Name', orig_attr = 'X1',
                  log = True, standard = True)

    # The notebook only imports `CDIOM` from `gass`; the previously used name
    # `CDM` is never imported or defined and raises NameError on a fresh kernel.
    # NOTE(review): assumes CDIOM accepts a single structure term — TODO confirm.
    giom = CDIOM(clu_simul_y, clu_simul_lin_sd, dio_pop_sd, constant = True)
    giom.fit_Poisson(printed = False, verbose = False)
    giom.inference_Poisson()
    giom.calculate_AWCI_sigmas()

    # Store flattened results for this seed.
    giom_coefs.append(giom.coefficients.flatten())
    giom_sigmas.append(np.array(giom.sigmas).flatten())
    giom_awci.append(np.array(giom.AWCI_sigmas).flatten())
    
    # Conventional CDIO model: naive Poisson GLM with both structure terms (SA and
    # DIO), each evaluated once at a fixed parameter of -1 on the raw scale.
    sa_pop = SATerm(
        od_data = clu_simul, dest_data = clu_gdf,
        o_ids = 'Origin', d_ids = 'Destination',
        dest_ids = 'Name', dest_attr = 'X1',
        log = False, standard = False,
    )
    spop_sa = sa_pop.cal(-1)

    dio_pop = DIOTerm(
        od_data = clu_simul, orig_data = clu_gdf,
        o_ids = 'Origin', d_ids = 'Destination',
        orig_ids = 'Name', orig_attr = 'X1',
        log = False, standard = False,
    )
    spop_dio = dio_pop.cal(-1)

    # Regressor columns: origin attribute, destination attribute, distance,
    # then the SA and DIO terms side by side.
    vi = pop_origin.copy()
    mj = pop_destination.copy()
    dij = pd.to_numeric(clu_simul.dist.values).reshape((-1,1))
    sij = np.hstack([spop_sa, spop_dio])

    # Log-transform, then z-score each column.
    X = np.concatenate((vi, mj, dij, sij), axis=1)
    logX = np.log(X)
    mean = logX.mean(axis=0)
    std = logX.std(axis=0)
    standardized_logX = (logX - mean) / std

    # Intercept column goes last (prepend=False).
    X_df = sm.add_constant(pd.DataFrame(standardized_logX), prepend=False)

    ncdiom = sm.GLM(clu_simul_y, X_df, family=sm.families.Poisson()).fit()
    ncdiom_coefs.append(ncdiom.params.values)

    # Data-driven CDIO model: both structure terms (SA and DIO) are handed to the
    # GASS model unevaluated; the fit reports one estimated parameter per term
    # (`sigmas`) with an interval for each (`AWCI_sigmas`).
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log = True, standard = True)

    sa_pop_sd = SATerm(od_data = clu_simul, dest_data = clu_gdf,
                   o_ids = 'Origin', d_ids = 'Destination',
                   dest_ids = 'Name', dest_attr = 'X1',
                   log = True, standard = True)

    dio_pop_sd = DIOTerm(od_data = clu_simul, orig_data = clu_gdf,
                  o_ids = 'Origin', d_ids = 'Destination',
                  orig_ids = 'Name', orig_attr = 'X1',
                  log = True, standard = True)

    # The notebook only imports `CDIOM` from `gass`; the previously used name
    # `CDM` is never imported or defined and raises NameError on a fresh kernel.
    gcdiom = CDIOM(clu_simul_y, clu_simul_lin_sd, sa_pop_sd, dio_pop_sd, constant = True)
    gcdiom.fit_Poisson(printed = False, verbose = False)
    gcdiom.inference_Poisson()
    gcdiom.calculate_AWCI_sigmas()

    # Store flattened results for this seed.
    gcdiom_coefs.append(gcdiom.coefficients.flatten())
    gcdiom_sigmas.append(np.array(gcdiom.sigmas).flatten())
    gcdiom_awci.append(np.array(gcdiom.AWCI_sigmas).flatten())

    # Progress indicator: one line per completed replicate.
    print(seed)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [458]:
# Assemble one result table per model (one row per seed).
#
# Naive GLM fits were built with add_constant(..., prepend=False), so their
# parameter vectors end with the constant: label them in fitted order first and
# then move `k` to the front. The data-driven fits are labelled with `k` first.

# --- BG model ---
bgm_coefs_df = pd.DataFrame(bgm_coefs, columns=['rho', 'nu', 'beta', 'k'])
bgm_coefs_df = bgm_coefs_df[['k', 'rho', 'nu', 'beta']]

# --- CD models ---
ncdm_coefs_df = pd.DataFrame(ncdm_coefs, columns=['rho', 'nu', 'beta', 'delta', 'k'])
ncdm_coefs_df = ncdm_coefs_df[['k', 'rho', 'nu', 'beta', 'delta']]

gcdm_sigmas_df = pd.DataFrame(gcdm_sigmas, columns=['sigma'])
gcdm_awci_df = pd.DataFrame(gcdm_awci, columns=['lower', 'upper'])
gcdm_coefs_df = pd.DataFrame(gcdm_coefs, columns=['k', 'rho', 'nu', 'beta', 'delta']).assign(
    sigma=gcdm_sigmas_df.sigma,
    lower=gcdm_awci_df.lower,
    upper=gcdm_awci_df.upper,
)

# --- IO models ---
niom_coefs_df = pd.DataFrame(niom_coefs, columns=['rho', 'nu', 'beta', 'gamma', 'k'])
niom_coefs_df = niom_coefs_df[['k', 'rho', 'nu', 'beta', 'gamma']]

giom_sigmas_df = pd.DataFrame(giom_sigmas, columns=['sigma'])
giom_awci_df = pd.DataFrame(giom_awci, columns=['lower', 'upper'])
giom_coefs_df = pd.DataFrame(giom_coefs, columns=['k', 'rho', 'nu', 'beta', 'gamma']).assign(
    sigma=giom_sigmas_df.sigma,
    lower=giom_awci_df.lower,
    upper=giom_awci_df.upper,
)

# --- CDIO models ---
ncdiom_coefs_df = pd.DataFrame(ncdiom_coefs, columns=['rho', 'nu', 'beta', 'delta', 'gamma', 'k'])
ncdiom_coefs_df = ncdiom_coefs_df[['k', 'rho', 'nu', 'beta', 'delta', 'gamma']]

gcdiom_sigmas_df = pd.DataFrame(gcdiom_sigmas, columns=['sigma_cd', 'sigma_io'])
gcdiom_awci_df = pd.DataFrame(gcdiom_awci, columns=['lower_cd', 'upper_cd', 'lower_io', 'upper_io'])
gcdiom_coefs_df = pd.DataFrame(gcdiom_coefs, columns=['k', 'rho', 'nu', 'beta', 'delta', 'gamma']).assign(
    sigma_cd=gcdiom_sigmas_df.sigma_cd,
    sigma_io=gcdiom_sigmas_df.sigma_io,
    lower_cd=gcdiom_awci_df.lower_cd,
    upper_cd=gcdiom_awci_df.upper_cd,
    lower_io=gcdiom_awci_df.lower_io,
    upper_io=gcdiom_awci_df.upper_io,
)

In [429]:
# Display the BG-model coefficient table (columns k, rho, nu, beta; one row per seed).
# NOTE(review): the saved output shows only 2 rows although the simulation cell
# loops over 100 seeds — it appears stale; re-run to refresh.
bgm_coefs_df

Unnamed: 0,k,rho,nu,beta
0,0.924508,0.826191,0.740535,-0.357313
1,1.250308,0.792361,0.739876,-0.512044


In [430]:
# Display the conventional CD-model coefficient table (k, rho, nu, beta, delta).
# NOTE(review): the saved output shows only 2 rows although the simulation cell
# loops over 100 seeds — it appears stale; re-run to refresh.
ncdm_coefs_df

Unnamed: 0,k,rho,nu,beta,delta
0,0.249358,0.926662,0.849819,-0.909264,-1.221537
1,-0.230329,1.08806,0.773429,-1.255624,-1.667545


In [431]:
# Display the data-driven CD-model table: coefficients plus the estimated sigma
# and its lower/upper interval bounds.
# NOTE(review): the saved output shows only 2 rows although the simulation cell
# loops over 100 seeds — it appears stale; re-run to refresh.
gcdm_coefs_df

Unnamed: 0,k,rho,nu,beta,delta,sigma,lower,upper
0,0.3018,0.912596,0.852546,-0.837701,-1.172826,-0.59,-0.67,-0.5
1,-0.110053,0.998126,0.717908,-1.189399,-1.568226,-0.4,-0.59,-0.22


In [432]:
# Display the conventional IO-model coefficient table (k, rho, nu, beta, gamma).
# NOTE(review): the saved output shows only 2 rows although the simulation cell
# loops over 100 seeds — it appears stale; re-run to refresh.
niom_coefs_df

Unnamed: 0,k,rho,nu,beta,gamma
0,0.325012,0.925436,0.832421,-0.868809,-1.152391
1,-0.130944,0.838867,1.015236,-1.226718,-1.608264


In [433]:
# Display the data-driven IO-model table: coefficients plus the estimated sigma
# and its lower/upper interval bounds.
# NOTE(review): the saved output is stale — it labels the structure coefficient
# 'delta' and has no lower/upper columns, whereas the table is built with
# 'gamma', 'lower' and 'upper'. Re-run to refresh.
giom_coefs_df

Unnamed: 0,k,rho,nu,beta,delta,sigma
0,0.298603,0.914882,0.836202,-0.888298,-1.184164,-1.14
1,-0.176286,0.835544,1.039933,-1.248731,-1.645264,-1.15


In [460]:
# Display the conventional CDIO-model coefficient table
# (k, rho, nu, beta, delta, gamma; one row per seed).
ncdiom_coefs_df

Unnamed: 0,k,rho,nu,beta,delta,gamma
0,0.049144,1.073943,0.995620,-0.975001,-0.978064,-0.909762
1,0.096871,1.040834,0.960696,-0.965868,-0.989600,-0.903527
2,0.103281,0.985106,1.030791,-0.957636,-0.944739,-0.894405
3,0.112615,0.892341,1.054764,-0.970524,-1.026340,-0.841341
4,0.104486,0.994799,1.008680,-0.944470,-1.006244,-0.830402
...,...,...,...,...,...,...
95,0.052162,0.970574,1.050070,-0.976566,-0.869240,-1.012181
96,0.101967,0.956246,1.011686,-0.988827,-0.897148,-0.960085
97,0.124914,0.991428,1.130685,-0.952964,-0.857284,-0.958371
98,0.025647,1.034911,1.119376,-0.970887,-1.013416,-0.879920


In [461]:
# Display the data-driven CDIO-model table: coefficients plus the two estimated
# sigmas (sigma_cd, sigma_io) and the lower/upper interval bounds of each.
gcdiom_coefs_df

Unnamed: 0,k,rho,nu,beta,delta,gamma,sigma_cd,sigma_io,lower_cd,upper_cd,lower_io,upper_io
0,-0.021753,1.011627,0.991157,-1.012002,-1.016991,-0.982523,-0.51,-2.06,-0.54,-0.46,-2.19,-1.93
1,0.000060,1.010201,1.010188,-1.005048,-0.999593,-1.004628,-0.54,-2.02,-0.57,-0.40,-2.13,-1.95
2,-0.005854,1.002135,0.975241,-1.009901,-1.016725,-1.003048,-0.50,-2.02,-0.54,-0.46,-2.13,-1.94
3,0.003896,0.992250,0.995588,-1.008635,-1.004955,-0.995454,-0.50,-2.03,-0.54,-0.47,-2.12,-1.95
4,-0.000059,1.013709,0.992965,-0.985039,-0.998225,-0.996529,-0.49,-1.96,-0.54,-0.45,-2.04,-1.88
...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.022660,1.007838,1.002856,-1.002287,-0.995723,-1.008349,-0.50,-2.02,-0.52,-0.48,-2.15,-1.91
96,-0.011072,1.011703,0.978049,-1.009854,-0.996912,-1.014084,-0.49,-2.08,-0.52,-0.48,-2.18,-1.99
97,0.032685,0.983117,0.990561,-0.992857,-0.996840,-0.989792,-0.49,-1.97,-0.52,-0.46,-2.02,-1.92
98,-0.013636,1.032506,0.990491,-0.998123,-1.003295,-1.006320,-0.51,-2.02,-0.55,-0.48,-2.14,-1.91


In [459]:
# Persist every per-seed result table as CSV under res/ (relative to the current
# working directory). The folder is created first because to_csv does not create
# missing parent directories.
os.makedirs('res', exist_ok=True)

# NOTE(review): the BG file is named 'clu_bgm2s_...' while all others follow
# 'clu_<model>_coefs_df.csv' — confirm the '2s' is intentional before renaming.
bgm_coefs_df.to_csv('res/clu_bgm2s_coefs_df.csv', index=False)
ncdm_coefs_df.to_csv('res/clu_ncdm_coefs_df.csv', index=False)
gcdm_coefs_df.to_csv('res/clu_gcdm_coefs_df.csv', index=False)
niom_coefs_df.to_csv('res/clu_niom_coefs_df.csv', index=False)
giom_coefs_df.to_csv('res/clu_giom_coefs_df.csv', index=False)
ncdiom_coefs_df.to_csv('res/clu_ncdiom_coefs_df.csv', index=False)
gcdiom_coefs_df.to_csv('res/clu_gcdiom_coefs_df.csv', index=False)