In [1]:
# Standard library
import os

# Force GeoPandas onto the Shapely backend instead of PyGEOS.
# GeoPandas reads USE_PYGEOS when it is first imported, so the variable has to be
# set *before* any geopandas import; setting it afterwards has no effect and
# leaves the PyGEOS deprecation warning in the cell output.
os.environ['USE_PYGEOS'] = '0'

# Third-party: plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Third-party: numerics and (geo)data frames
import numpy as np
import pandas as pd
import geopandas as gpd
import geopandas  # keeps the bare `geopandas` name bound, as in the original cell

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# specify working directory
# NOTE(review): '.../replication_codes' looks like a placeholder rather than a real
# path -- replace it with the local location of the replication folder before running.
# Later cells import the local modules `terms` and `gass` and write to the relative
# folder 'res/'; presumably both are expected to resolve against this directory
# (TODO confirm).
os.chdir('.../replication_codes')

In [3]:
# Naive approach
import statsmodels.api as sm

# GASS CDIOM
from terms import ConstantTerm, LinearTerm, SATerm, DIOTerm
from gass import CDIOM

# record running time
import datetime

# Generate clustered point patterns

In [4]:
# for generating point patterns
from pointpats import PoissonClusterPointProcess, as_window
from libpysal.cg import Point, Polygon 

## Repeated simulations

In [6]:
# Monte Carlo experiment on clustered point patterns.
#
# For every replication the cell
#   1. draws a clustered point pattern inside a square window,
#   2. attaches a random attribute X1 to every point,
#   3. builds the table of all ordered origin-destination (OD) pairs with
#      their Euclidean distance,
#   4. simulates Poisson flows from known coefficients, and
#   5. re-estimates the coefficients with three models, storing the estimates.

# --- Simulation design ------------------------------------------------------
N_SIMULATIONS = 100    # number of replications; the replication index is also the RNG seed
N_POINTS = 49          # events requested per point pattern
N_PARENTS = 5          # `parents` argument of the cluster process
CLUSTER_RADIUS = 50    # `radius` argument of the cluster process
WINDOW_SIZE = 500      # side length of the square study window
X1_LOW, X1_HIGH = 500, 500001   # np.random.randint bounds for X1 (upper bound exclusive)

# Zero-padding width of the location ids ('U00', 'U01', ...). The Origin and
# Destination ids are later recovered from the concatenated code by slicing at
# `digits + 1`, so every id must have exactly this width.
digits = len(str(N_POINTS))

# Coefficients used to generate the flows. Their order follows the columns of
# the design matrix built in the loop: the three LinearTerm columns
# (pop_o, pop_d, dist) followed by the DIO term. No intercept is included.
coefs = np.array([[1], [1], [-1], [-1]])

# --- Result containers (one entry per replication) ---------------------------
# The statsmodels fits append the constant LAST (add_constant(..., prepend=False)),
# so their parameter order is rho, nu, beta[, delta], k.
bgm_coefs = []    # rho, nu, beta, k

niom_coefs = []   # rho, nu, beta, delta, k

giom_coefs = []   # k, rho, nu, beta, delta (order assumed by the cell that labels the columns)
giom_sigmas = []
giom_awci = []    # lower, upper

square = Polygon([Point((0, 0)), Point((0, WINDOW_SIZE)),
                  Point((WINDOW_SIZE, WINDOW_SIZE)), Point((WINDOW_SIZE, 0))])
squwin = as_window(square)


def _fit_log_standardized_poisson(y, *columns):
    """Fit a Poisson GLM on log-transformed, z-scored covariates.

    Parameters
    ----------
    y : array-like
        Observed flows, one value per OD pair.
    *columns : 2-D arrays
        Covariate blocks with one row per OD pair; they are stacked
        horizontally in the order given.

    Returns
    -------
    The fitted statsmodels GLM results object. The constant is appended as the
    last column, so it is also the last entry of ``params``.
    """
    logX = np.log(np.hstack(columns))
    standardized_logX = (logX - np.mean(logX, axis=0)) / np.std(logX, axis=0)
    X_df = sm.add_constant(pd.DataFrame(standardized_logX), prepend=False)
    return sm.GLM(y, X_df, family=sm.families.Poisson()).fit()


for seed in np.arange(N_SIMULATIONS):
    # NOTE(review): the generator is re-seeded with the same value before the
    # point pattern, the attribute draw and the Poisson draw, so all three
    # start from the same random stream. Kept as-is so results stay reproducible.
    np.random.seed(seed)

    # Clustered point pattern. Positional arguments after the window are
    # n, parents, radius, samples. Per the pointpats documentation,
    # conditioning=False selects the N-conditioned variant (fixed event count).
    squcsamples = PoissonClusterPointProcess(squwin, N_POINTS, N_PARENTS, CLUSTER_RADIUS, 1,
                                             asPP=False, conditioning=False)
    squpts_cluster = squcsamples.realizations[0]

    # Create DataFrame with ids 'U00', 'U01', ...
    clu_df = pd.DataFrame(squpts_cluster, columns=['Lon', 'Lat'])
    n_pts = clu_df.shape[0]   # number of points actually realized
    clu_df = clu_df.assign(Name=np.arange(n_pts))
    clu_df.Name = 'U' + clu_df.Name.astype(str).str.zfill(digits)

    # Add attributes, X1 (sized from the realized pattern instead of a repeated literal)
    np.random.seed(seed)
    attr1 = np.random.randint(X1_LOW, X1_HIGH, size=n_pts)
    clu_df = clu_df.assign(X1=attr1)

    # Create GeoDataFrame
    clu_gdf = gpd.GeoDataFrame(clu_df.copy(), geometry=gpd.points_from_xy(clu_df.Lon, clu_df.Lat))

    # Calculate Euclidean distance between every pair of points
    clu_lon_lat = np.column_stack((clu_gdf.Lon.to_numpy(), clu_gdf.Lat.to_numpy()))
    clu_dist_mat = np.linalg.norm(clu_lon_lat - clu_lon_lat[:, None], axis=-1)

    # One mask selects the off-diagonal (origin != destination) pairs for both
    # the distances and the OD codes, so the two always stay aligned row by row.
    # Filtering distances with `!= 0` would also drop coincident points.
    off_diag = ~np.eye(n_pts, dtype=bool)
    clu_dists = clu_dist_mat[off_diag].reshape((-1, 1))

    # Create Distance DataFrame: code = origin id + destination id, row-major
    # (origin varies slowest), matching the flattening order of the matrix.
    places = clu_df.Name.values.reshape((-1, 1))
    orig = np.repeat(places, n_pts).reshape((-1, 1))
    dest = np.tile(places, (n_pts, 1))
    codes = (orig + dest)[off_diag.ravel()].reshape((-1, 1))
    # Stacking ids with floats yields a non-numeric column; 'dist' is therefore
    # converted with pd.to_numeric wherever it is used numerically below.
    clu_dist_df = pd.DataFrame(np.column_stack((codes, clu_dists)), columns=['Code', 'dist'])

    # Create Simulated Flow Data.
    # Column order matters: LinearTerm below addresses pop_o, pop_d and dist
    # by position (3, 4, 5).
    simul = pd.DataFrame(codes, columns=['Code'])
    simul = simul.assign(Origin=simul.Code.str[:digits + 1], Destination=simul.Code.str[digits + 1:])

    x1_by_name = clu_df.set_index('Name')['X1']
    simul = simul.assign(pop_o=simul.Origin.map(x1_by_name),
                         pop_d=simul.Destination.map(x1_by_name))

    clu_simul = simul.assign(dist=clu_dist_df.dist)

    # --- Data-generating process ---------------------------------------------
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log=True, standard=True)

    dio_pop_sd = DIOTerm(od_data=clu_simul, orig_data=clu_gdf,
                         o_ids='Origin', d_ids='Destination',
                         orig_ids='Name', orig_attr='X1',
                         log=True, standard=True)

    # presumably -2 is the parameter of the DIO term used to generate the flows -- TODO confirm
    spop_dio_sd = dio_pop_sd.cal(-2)

    clu_simul_X = np.hstack((clu_simul_lin_sd.X, spop_dio_sd))

    clu_simul_y_mean = np.exp(np.dot(clu_simul_X, coefs)).reshape((-1, 1))
    np.random.seed(seed)
    clu_simul_y = np.random.poisson(lam=clu_simul_y_mean).flatten()

    # Raw covariates shared by the two statsmodels fits
    pop_origin = pd.to_numeric(clu_simul.pop_o.values).reshape((-1, 1))
    pop_destination = pd.to_numeric(clu_simul.pop_d.values).reshape((-1, 1))
    dij = pd.to_numeric(clu_simul.dist.values).reshape((-1, 1))

    # --- BG model: origin attribute, destination attribute, distance -----------
    bgm = _fit_log_standardized_poisson(clu_simul_y, pop_origin, pop_destination, dij)
    bgm_coefs.append(bgm.params.values)

    # --- Conventional CD model: BG covariates plus a DIO term evaluated at -1 ---
    dio_pop = DIOTerm(od_data=clu_simul, orig_data=clu_gdf,
                      o_ids='Origin', d_ids='Destination',
                      orig_ids='Name', orig_attr='X1',
                      log=True, standard=False)
    # NOTE(review): the term is built with log=True and logged again inside the
    # GLM helper -- confirm against DIOTerm that this is not a double log.
    spop_dio = dio_pop.cal(-1)

    niom = _fit_log_standardized_poisson(clu_simul_y, pop_origin, pop_destination, dij, spop_dio)
    niom_coefs.append(niom.params.values)

    # --- Data-driven CD model --------------------------------------------------
    # The terms are instantiated again rather than reused, as in the original
    # cell, in case cal() above changed the state of the DIO term (TODO confirm).
    clu_simul_lin_sd = LinearTerm(clu_simul, 3, 4, 5, log=True, standard=True)

    dio_pop_sd = DIOTerm(od_data=clu_simul, orig_data=clu_gdf,
                         o_ids='Origin', d_ids='Destination',
                         orig_ids='Name', orig_attr='X1',
                         log=True, standard=True)

    giom = CDIOM(clu_simul_y, clu_simul_lin_sd, dio_pop_sd, constant=True)
    giom.fit_Poisson(printed=False, verbose=False)
    giom.inference_Poisson()
    giom.calculate_AWCI_sigmas()

    giom_coefs.append(giom.coefficients.flatten())
    giom_sigmas.append(np.array(giom.sigmas).flatten())
    giom_awci.append(np.array(giom.AWCI_sigmas).flatten())

    # progress indicator: one line per finished replication
    print(seed)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [7]:
# Turn the per-replication estimate lists into labelled tables, one row per replication.

# The statsmodels fits append the constant last, so the raw column order is
# rho, nu, beta[, delta], k. Label first, then move k to the front.
bgm_coefs_df = (
    pd.DataFrame(bgm_coefs)
    .set_axis(['rho', 'nu', 'beta', 'k'], axis=1)
    [['k', 'rho', 'nu', 'beta']]
)

niom_coefs_df = (
    pd.DataFrame(niom_coefs)
    .set_axis(['rho', 'nu', 'beta', 'delta', 'k'], axis=1)
    [['k', 'rho', 'nu', 'beta', 'delta']]
)

# The data-driven model's coefficients are labelled k-first as stored.
# The sigma estimate and its interval bounds are then attached as extra columns.
giom_sigmas_df = pd.DataFrame(giom_sigmas).set_axis(['sigma'], axis=1)
giom_awci_df = pd.DataFrame(giom_awci).set_axis(['lower', 'upper'], axis=1)

giom_coefs_df = (
    pd.DataFrame(giom_coefs)
    .set_axis(['k', 'rho', 'nu', 'beta', 'delta'], axis=1)
    .assign(
        sigma=giom_sigmas_df.sigma,
        lower=giom_awci_df.lower,
        upper=giom_awci_df.upper,
    )
)

In [8]:
# Write the three coefficient tables to CSV, without the row index.
# The output folder is created first so that a missing 'res/' directory does
# not make the export fail after the long simulation run has finished.
os.makedirs('res', exist_ok=True)

bgm_coefs_df.to_csv('res/clu_iobgm1s_coefs_df.csv', index=False)
niom_coefs_df.to_csv('res/clu_niom1s_coefs_df.csv', index=False)
giom_coefs_df.to_csv('res/clu_giom1s_coefs_df.csv', index=False)