In [1]:
# GeoPandas reads USE_PYGEOS once, at import time.  The variable therefore has
# to be set BEFORE the first `import geopandas`; setting it afterwards (as this
# cell previously did) has no effect and leaves the PyGEOS deprecation warning
# in the cell output.
import os
os.environ['USE_PYGEOS'] = '0'

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import geopandas as gpd
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# specify working directory
# All relative paths used later in the notebook (e.g. the 'results/' output
# folder) are resolved against this directory.
# NOTE(review): '.../replication_codes' looks like a redacted placeholder --
# replace it with the actual path before running.  The project-local modules
# imported in the next cell (`terms`, `gass`) presumably live in this folder,
# which is why the chdir has to run before those imports -- confirm.
os.chdir('.../replication_codes')

In [3]:
# Naive approach
import statsmodels.api as sm

# GASS CDIOM
from terms import ConstantTerm, LinearTerm, SATerm, DIOTerm
from gass import CDIOM

# record running time
import datetime

# Generate points distributed uniformly

In [4]:
# for generating point patterns
from shapely.geometry import Point, Polygon

## Repeated simulations

In [42]:
# ---------------------------------------------------------------------------
# Simulation settings
# ---------------------------------------------------------------------------
N_SIMULATIONS = 100               # number of replications; the loop index is also the RNG seed
GRID_SIZE = 7                     # locations per axis (7 x 7 grid -> 49 locations)
EXTENT = 500                      # side length of the square study area
X1_LOW, X1_HIGH = 500, 500001     # bounds of the simulated attribute X1 (upper bound exclusive)

# Coefficients used to generate the flows, applied to the columns of
# [LinearTerm.X, spatial-accessibility term]: rho, nu, beta, delta.
TRUE_COEFS = np.array([[1], [1], [-1], [-1]])


def standardized_log_design(X):
    """Return the GLM design matrix for raw predictors `X`.

    Every column of `X` is log-transformed and z-scored (population standard
    deviation, numpy's default `ddof=0`); a constant column is then appended
    as the LAST column (`prepend=False`).

    Parameters
    ----------
    X : numpy.ndarray, shape (n_obs, n_predictors)
        Strictly positive raw predictors.

    Returns
    -------
    pandas.DataFrame
        Standardized log-predictors followed by the constant column.
    """
    logX = np.log(X)
    standardized_logX = (logX - np.mean(logX, axis=0)) / np.std(logX, axis=0)
    return sm.add_constant(pd.DataFrame(standardized_logX), prepend=False)


# ---------------------------------------------------------------------------
# Seed-independent geometry: built once instead of once per replication
# ---------------------------------------------------------------------------
# Locations sit on a regular GRID_SIZE x GRID_SIZE lattice covering the square
# [0, EXTENT] x [0, EXTENT]; only the attribute X1 changes between replications.
axis_coords = np.linspace(0, EXTENT, GRID_SIZE)
x_grid, y_grid = np.meshgrid(axis_coords, axis_coords)
uni_points = np.vstack([x_grid.ravel(), y_grid.ravel()]).T
n_locations = uni_points.shape[0]

# Location names: 'U' + zero-padded running index ('U00', 'U01', ...).
digits = len(str(n_locations))
location_names = np.array(
    ['U{}'.format(str(i).zfill(digits)) for i in range(n_locations)], dtype=object
)
uni_base_df = pd.DataFrame(uni_points, columns=['Lon', 'Lat']).assign(Name=location_names)

# Pairwise Euclidean distances between all locations.
uni_dist_mat = np.linalg.norm(uni_points - uni_points[:, None], axis=-1)

# Every ordered origin-destination pair except i == j, in row-major order.
# One off-diagonal mask drives both the distances and the pair labels, so the
# two can never get out of step (previously distances were filtered with
# `dist != 0` while the labels were filtered with `orig != dest`).
orig_idx, dest_idx = np.nonzero(~np.eye(n_locations, dtype=bool))
pair_dists = uni_dist_mat[orig_idx, dest_idx]
origins = location_names[orig_idx]
destinations = location_names[dest_idx]
od_pairs = pd.DataFrame({
    'Code': origins + destinations,
    'Origin': origins,
    'Destination': destinations,
})

# ---------------------------------------------------------------------------
# Result containers (one entry per replication)
# ---------------------------------------------------------------------------
bgm_coefs = []    # rho, nu, beta, k         (statsmodels order: constant appended last)

ncdm_coefs = []   # rho, nu, beta, delta, k  (statsmodels order: constant appended last)

gcdm_coefs = []   # stored as returned by CDIOM.coefficients
gcdm_sigmas = []
gcdm_awci = []    # lower, upper

for seed in range(N_SIMULATIONS):
    # Seed immediately before the first random draw of the replication.
    # (The original seeded twice before this draw with no RNG use in between.)
    np.random.seed(seed)

    # Add attributes, X1
    attr1 = np.random.randint(X1_LOW, X1_HIGH, size=n_locations)
    uni_df = uni_base_df.assign(X1=attr1)

    # Create GeoDataFrame
    uni_gdf = gpd.GeoDataFrame(uni_df.copy(), geometry=gpd.points_from_xy(uni_df.Lon, uni_df.Lat))

    # Create Simulated Flow Data.
    # Column order matters: LinearTerm below addresses pop_o / pop_d / dist by
    # their positions 3, 4 and 5.  Location i is named by its running index,
    # so X1 can be looked up directly with the pair index arrays.
    uni_simul = od_pairs.assign(
        pop_o=attr1[orig_idx],
        pop_d=attr1[dest_idx],
        dist=pair_dists,
    )

    # --- Data-generating process -------------------------------------------
    uni_simul_lin_sd = LinearTerm(uni_simul, 3, 4, 5, log = True, standard = True)

    sa_pop_sd = SATerm(od_data = uni_simul, dest_data = uni_gdf,
                       o_ids = 'Origin', d_ids = 'Destination',
                       dest_ids = 'Name', dest_attr = 'X1',
                       log = True, standard = True)

    # NOTE(review): the flows are generated with cal(-2) while the conventional
    # CD model below uses cal(-1); presumably the argument is the distance
    # exponent of the accessibility term and the mismatch is the intended
    # misspecification -- confirm against `terms.SATerm`.
    spop_sa_sd = sa_pop_sd.cal(-2)

    uni_simul_X = np.hstack((uni_simul_lin_sd.X, spop_sa_sd))
    uni_simul_y_mean = np.exp(np.dot(uni_simul_X, TRUE_COEFS)).reshape((-1,1))

    # Re-seed so the Poisson draws start from the same RNG state as in the
    # original run, independent of anything the term classes may have consumed.
    np.random.seed(seed)
    uni_simul_y = np.random.poisson(lam=uni_simul_y_mean).flatten()

    pop_origin = uni_simul.pop_o.to_numpy().reshape((-1,1))
    pop_destination = uni_simul.pop_d.to_numpy().reshape((-1,1))
    dij = uni_simul.dist.to_numpy().reshape((-1,1))

    # --- BG model ------------------------------------------------------------
    X_bgm = standardized_log_design(np.hstack((pop_origin, pop_destination, dij)))
    bgm = sm.GLM(uni_simul_y, X_bgm, family=sm.families.Poisson()).fit()
    bgm_coefs.append(bgm.params.values)

    # --- Conventional CD model ----------------------------------------------
    sa_pop = SATerm(od_data = uni_simul, dest_data = uni_gdf,
                    o_ids = 'Origin', d_ids = 'Destination',
                    dest_ids = 'Name', dest_attr = 'X1',
                    log = False, standard = False)
    spop_sa = sa_pop.cal(-1)

    X_ncdm = standardized_log_design(np.hstack((pop_origin, pop_destination, dij, spop_sa)))
    ncdm = sm.GLM(uni_simul_y, X_ncdm, family=sm.families.Poisson()).fit()
    ncdm_coefs.append(ncdm.params.values)

    # --- Data-driven CD model -------------------------------------------------
    # NOTE(review): the terms are rebuilt here instead of reusing the objects
    # from the data-generating step, as in the original.  It is not visible
    # from this notebook whether `cal` / `CDIOM` mutate them -- confirm before
    # deduplicating.
    uni_simul_lin_sd = LinearTerm(uni_simul, 3, 4, 5, log = True, standard = True)

    sa_pop_sd = SATerm(od_data = uni_simul, dest_data = uni_gdf,
                       o_ids = 'Origin', d_ids = 'Destination',
                       dest_ids = 'Name', dest_attr = 'X1',
                       log = True, standard = True)

    gcdm = CDIOM(uni_simul_y, uni_simul_lin_sd, sa_pop_sd, constant = True)
    gcdm.fit_Poisson(printed = False, verbose = False)
    gcdm.inference_Poisson()
    gcdm.calculate_AWCI_sigmas()

    gcdm_coefs.append(gcdm.coefficients.flatten())
    gcdm_sigmas.append(np.array(gcdm.sigmas).flatten())
    gcdm_awci.append(np.array(gcdm.AWCI_sigmas).flatten())

    # progress indicator
    print(seed)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [43]:
# Baseline gravity model: label the columns in the order the coefficients were
# stored, then move the constant `k` to the front.
bgm_coefs_df = pd.DataFrame(bgm_coefs, columns=['rho', 'nu', 'beta', 'k'])
bgm_coefs_df = bgm_coefs_df[['k', 'rho', 'nu', 'beta']]

# Conventional CD model: same treatment, with the extra `delta` column.
ncdm_coefs_df = pd.DataFrame(ncdm_coefs, columns=['rho', 'nu', 'beta', 'delta', 'k'])
ncdm_coefs_df = ncdm_coefs_df[['k', 'rho', 'nu', 'beta', 'delta']]

# Data-driven CD model: coefficients plus the per-run sigma and its interval
# bounds, combined row-by-row (all three tables share the default 0..n-1 index).
gcdm_sigmas_df = pd.DataFrame(gcdm_sigmas, columns=['sigma'])
gcdm_awci_df = pd.DataFrame(gcdm_awci, columns=['lower', 'upper'])
gcdm_coefs_df = pd.DataFrame(
    gcdm_coefs, columns=['k', 'rho', 'nu', 'beta', 'delta']
).assign(
    sigma=gcdm_sigmas_df['sigma'],
    lower=gcdm_awci_df['lower'],
    upper=gcdm_awci_df['upper'],
)

In [44]:
# Write one CSV per model into the 'results' folder (relative to the working
# directory).  `to_csv` does not create missing directories, so create the
# folder first rather than failing after the long simulation has finished.
os.makedirs('results', exist_ok=True)

bgm_coefs_df.to_csv('results/uni_bgm1s_coefs_df.csv', index=False)
ncdm_coefs_df.to_csv('results/uni_ncdm1s_coefs_df.csv', index=False)
gcdm_coefs_df.to_csv('results/uni_gcdm1s_coefs_df.csv', index=False)