In [18]:
import numpy as np
from copy import deepcopy
from spglm.iwls import iwls, _compute_betas_gwr
from spglm.family import *
from mgwr.search import golden_section
import pandas as pd
import statsmodels.api as sm
import geopandas as gp
import matplotlib.pyplot as plt
import libpysal.weights as sw
import time

## Some Issues about calculating Aij

#### 1. `Dja` and `sw.distance.DistanceBand.from_dataframe()` are not corresponding.

That is how we calculate accessibility.

In [None]:
W = sw.distance.DistanceBand.from_dataframe(austria_shp,  threshold=99999, binary=False, alpha=sigma)
W = W.full()[0]
mask = W!=0
Aij = (W * Dja[0:9])
Aij = Aij[mask].reshape((9,8))
Aij = np.tile(np.sum(Aij, axis=1), 9).reshape((9,9))
Aij = Aij[mask].reshape((-1,1))

`Dja` stroes the qualities of destinations, whose rows have the order like below:

In [116]:
austria['Destination'][0:9]

1    AT12
2    AT13
3    AT21
4    AT22
5    AT31
6    AT32
7    AT33
8    AT34
9    AT11
Name: Destination, dtype: object

And when using `sw.distance.DistanceBand.from_dataframe()`, it used `austria_shp`, whose rows have the order like below:

In [9]:
austria_shp['NUTS_ID']

0    AT33
1    AT34
2    AT11
3    AT13
4    AT31
5    AT21
6    AT32
7    AT12
8    AT22
Name: NUTS_ID, dtype: object

So I was wondering in this the command `Aij = (W * Dja[0:9])`, whether we should change either the order of the rows of `austria_shp` nor of `Dja`. So I created `SpatialSmooth2()`, in this method, I re-index the rows of `j_df` which stores the quality and the name of all destinations, followed by the order of the rows of `austria_shp`. 

But when validating the results with `SpatialSmooth2()` and `ComepteDestiantion2()`, the betas of `iwls()` and of `CompeteDestination2()` are not same and the predicted sigma value is not the same as the sigma we set. 

In [15]:
def SpatialSmooth2(m, X_df, pid, geomap, sigma):
    
    Aij = np.zeros_like(X_df.iloc[:,m]).reshape((-1,1))
    X = X_df.copy()
        
    #obtain the quality of all destinations, mk
    j_df = X.drop_duplicates(subset = ['Destination'])
        
    #rearrange the order of rows which will be followed by the order of the rows of geomap
    j_df = j_df.set_index(j_df['Destination'])#.sort_index()
    j_df = j_df.reindex(geomap.iloc[:,pid])
    mk = j_df.iloc[:,m].values.reshape((-1,1))
        
    #obtain all destinations
    all_places = geomap.iloc[:,pid].values.reshape((-1,1)) #j_df['Destination'].values.reshape((-1,1)) 
    num_j = len(all_places)
        
    #create the flow codes
    temp_codes = [['DELETE']]
    for place in all_places:
        temp_places = np.tile(place, (num_j-1,1))
        other_places = all_places[all_places != place].reshape((-1,1))
        r = other_places + temp_places 
        temp_codes = np.vstack((temp_codes,r))
                
    temp_codes = temp_codes[temp_codes != 'DELETE'].reshape((-1,1))

    #calculate the accessibility
    W = sw.distance.DistanceBand.from_dataframe(geomap, threshold=99999, binary=False, alpha=sigma)
    W = W.full()[0]
    mask = W!=0
    temp_Aij = (W * mk)
    temp_Aij = temp_Aij[mask].reshape((num_j,num_j-1))
    Aijs = np.zeros(num_j).reshape((-1,1))

    for i in range(num_j-1):
        temp_aij = np.delete(temp_Aij, i, axis=1)
    
        temp_aijs = np.sum(temp_aij, axis = 1).reshape((-1,1))
        Aijs = np.hstack((Aijs, temp_aijs)) 

    Aijs = np.delete(Aijs, 0, axis =1).reshape((-1,1))

    temp_Aij = pd.DataFrame(np.hstack((temp_codes, Aijs)), columns = ['Code', 'Aij'])
        
    X_Aij_df = pd.merge(X, temp_Aij, how = 'left', on =  ['Code'])

    Aij = pd.to_numeric(X_Aij_df['Aij'].values).reshape((-1,1))


    return Aij

In [18]:
def CompeteDestination2(y, v1, v2, v3, X_df, pid, geomap, is_Spatial=True, verbose=False):
    
    # build X
    def build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial):
        
        Oi = X_df.iloc[:,v1]
        Wj = X_df.iloc[:,v2]
        Dij = X_df.iloc[:,v3]
        X = np.column_stack((Oi, Wj, Dij))
        
        if is_Spatial == True:
            #initialize the accissibility term with sigma set as -1
            init_Aij = SpatialSmooth2(v2, X_df, pid, geomap, -1)
            X = np.hstack((X,init_Aij)) 
        return X
    
    X = build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial)

    s_0 = np.log(np.mean(y))
    eta = s_0.reshape((-1, 1))
    s_old = np.zeros_like(X)
    crit = 9999
    sigma = np.inf
    
    # backfitting - inner loop
    def backfit2(y, X, w, v2, X_df, pid, geomap, is_Spatial, verbose):
        n,k = X.shape
        betas = _compute_betas_gwr(y, np.log(X), w.reshape((-1, 1)))[0]
        #print(betas)
        XB = np.multiply(betas.T, np.log(X))
        yhat = np.dot(np.log(X), betas)
        err = y.reshape((-1, 1)) - yhat
        iters = 0
        scores = []
        delta = 1e6
        tol = 1e-8
        max_iter = 50
        sig = np.inf
        
        if is_Spatial == True:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))

                    if j == k-1:
                        score = lambda x: sm.OLS(temp_y, np.log(SpatialSmooth2(v2, X_df, pid, geomap, x))).fit().aic
                        sig  = golden_section(-6, 6, 0.38197, score, 1e-2, 50)[0]
                        Aij = SpatialSmooth2(v2, X_df, pid, geomap, sig)
                        X[:, j] = Aij.flatten()
                        temp_X = np.log(Aij).reshape((-1, 1))

                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break
                    
        elif is_Spatial == False:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))
                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
                    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break

        return params, X, sig
    
    #local scoring - outer loop 
    while crit > 1e-8:
        mu = np.exp(eta).reshape((-1, 1))
        w = mu.reshape((-1,1))
        z = eta + ((y.reshape((-1, 1)) - mu) / mu)
        betas, X, sigma = backfit2(z, X, w, v2, X_df, pid, geomap,is_Spatial, verbose)
        s_new = np.multiply(betas.T, np.log(X))
        inner = np.sum((s_old - s_new)**2, axis=1)
        num = np.sum(w*inner)
        den = np.sum(w*np.sum((1 + s_old)**2, axis=1).reshape((-1, 1)))
        crit = num / den
        eta = np.sum(s_new, axis=1).reshape((-1, 1))
        s_old = s_new
        
    return betas, eta, sigma
    

In [19]:
Aij2 = SpatialSmooth2(5, X_df, 1, austria_shp, -1.35)
X2 = np.column_stack((Oi, Wj, Dij, Aij2))
w = np.ones(72).reshape((-1,1))
betas2 = iwls(flows.reshape((-1,1)), np.log(X2), family=Poisson(), offset=w.reshape((-1, 1)), y_fix=None, wi=w.reshape((-1, 1)))[0]
f2 = np.exp(np.dot(np.log(X2), betas2) + np.random.normal(0, .0001, (72,1))).reshape((-1,1))
start2 = time.time()
betas_cd2, eta_cd2, sigma_cd2 = CompeteDestination2(f2, 4, 5, 6, X_df, 1, austria_shp, is_Spatial=True, verbose=False)
end2 = time.time()

In [20]:
print('betas_iwls2:', betas2, 'betas_cd2:', betas_cd2,'sigma_cd2:', sigma_cd2, 'Runtime for ComepeteDestination2():', end2 - start2, sep='\n')

betas_iwls2:
[[ 0.62239105]
 [ 1.23615628]
 [-1.06479231]
 [-0.45072998]]
betas_cd2:
[[ 0.63758331]
 [ 1.38446578]
 [-1.04594236]
 [-0.59791932]]
sigma_cd2:
-1.05
Runtime for ComepeteDestination2():
26.739388465881348


#### 2. Which could be considered as an alternative destination for a flow.

So far, when calculating the accessibilty for each flow in the case of Austria, we assume the possible destinations will be all the places except the origin as for a flow. For example, for the flow 'AT11AT12', the target destination is 'AT12', and other possible destinations are 'AT13', 'AT21', 'AT22', 'AT31', 'AT32', 'AT33', 'AT34'.

However, one situation would happen, which is that not every origin will also be a destination. Continuing to take the flow 'AT11AT12' as an example, in fact, the alternative destinations for this flow are those who have the interactions with the origin, 'AT11'. And it is possible that except the origin 'AT11' and the target destination 'AT12', only 'AT13','AT21' and 'AT22' have interactions with 'AT11'.(Of course, in the case of Austria, every place has the interaction with each other, however, this situation is possible). 

So I wrote `SpatialSmooth3()` to consider the situation above and I hope the users could tell whether each origin will also be a destination before modelling. 

In [21]:
def SpatialSmooth3(m, X_df, pid, geomap, sigma, is_Each = False):
    
    '''
    is_Each: whether every destination will also be an origin, by default it is false

    '''
    Aij = np.zeros_like(X_df.iloc[:,m]).reshape((-1,1))
    
    if is_Each == False:
        X = X_df.copy()
        temp_df = pd.DataFrame()
        for temp_or in X['Origin']: 
    
            #find all flows whose origin is temp_or
            j_df = X[X['Origin'] == temp_or]        
            temp_j = j_df['Destination'].values
            #split the geomap and only remain the geometries of the destinations of temp_j 
            temp_shp = geomap.loc[geomap.iloc[:,pid].isin(temp_j)] 
            
            j_df = j_df.set_index(j_df['Destination'])
            j_df = j_df.reindex(temp_shp.iloc[:,pid])
            
            mk = j_df.iloc[:,m].values.reshape((-1,1)) 
            num_j = len(temp_j)
            
            #obtain all destinations
            all_places = j_df['Destination']#temp_shp.iloc[:,pid].values.reshape((-1,1)) 
            
            #create the flow codes
            temp_codes = [['DELETE']]
        
            for place in all_places:
                r = temp_or + place
                temp_codes = np.vstack((temp_codes,r))
                
            temp_codes = temp_codes[temp_codes != 'DELETE'].reshape((-1,1))
            
            #calculate the accessibility
            W = sw.distance.DistanceBand.from_dataframe(temp_shp, threshold=99999, binary=False, alpha=sigma)
            W = W.full()[0]
            mask = W!=0 
            temp_Aij = (W * mk)
            temp_Aij = temp_Aij[mask].reshape((num_j,num_j-1))
            temp_Aij = np.sum(temp_Aij, axis=1).reshape((-1,1))
            temp_Aij = pd.DataFrame(np.hstack((temp_codes, temp_Aij)), columns = ['Code', 'Aij'])
            temp_df = temp_df.append(temp_Aij)
            
        temp_df = temp_df.drop_duplicates(subset = ['Code'])
        temp_df = temp_df.set_index(temp_df['Code'])
        temp_df = temp_df.reindex(X['Code'])
            
        #X_Aij_df = pd.merge(X, temp_df, how = 'left', on =  ['Code'])
        Aij = pd.to_numeric(temp_df['Aij'].values).reshape((-1,1))   

    elif is_Each == True:
        X = X_df.copy()
        
        #obtain the quality of all destinations, mk
        j_df = X.drop_duplicates(subset = ['Destination'])
        j_df = j_df.set_index(j_df['Destination']).sort_index()
        mk = j_df.iloc[:,m].values.reshape((-1,1))
        
        #obtain all destinations
        all_places = geomap.iloc[:,pid].values.reshape((-1,1)) 
        num_j = len(all_places)
        
        #create the flow codes
        temp_codes = [['DELETE']]
        
        for place in all_places:
            temp_places = np.tile(place, (num_j-1,1))
            other_places = all_places[all_places != place].reshape((-1,1))
            r = temp_places + other_places
            temp_codes = np.vstack((temp_codes,r))
                
        temp_codes = temp_codes[temp_codes != 'DELETE'].reshape((-1,1))
     
        #calculate the accessibility
        W = sw.distance.DistanceBand.from_dataframe(geomap, threshold=99999, binary=False, alpha=sigma)
        W = W.full()[0]
        mask = W!=0
        temp_Aij = (W * mk)
        temp_Aij = temp_Aij[mask].reshape((num_j,num_j-1))
        temp_Aij = np.tile(np.sum(temp_Aij, axis=1), num_j).reshape((num_j,num_j))
        temp_Aij = temp_Aij[mask].reshape((-1,1))

        temp_Aij = pd.DataFrame(np.hstack((temp_codes, temp_Aij)), columns = ['Code', 'Aij'])
        X_Aij_df = pd.merge(X, temp_Aij, how = 'left', on =  ['Code'])

        Aij = pd.to_numeric(X_Aij_df['Aij'].values).reshape((-1,1))

    return Aij

In [29]:
Aij1 = SpatialSmooth(5, X_df, 1, austria_shp, -1.35)
Aij[0:5]

array([[18881.77497771],
       [20545.76577865],
       [29964.34847772],
       [ 7553.70956538],
       [30972.31179077]])

In [30]:
Aij2 = SpatialSmooth2(5, X_df, 1, austria_shp, -1.35)
Aij2[0:5]

array([[99220.55946816],
       [89299.76447806],
       [13989.01559767],
       [29308.82630006],
       [27164.76253095]])

In [31]:
Aij3 = SpatialSmooth3(5, X_df, 1, austria_shp, -1.35, is_Each = False)
Aij3[0:5]

array([[99220.55946816],
       [89299.76447806],
       [13989.01559767],
       [29308.82630006],
       [27164.76253095]])

As we could see, the results from `SpatialSmooth2()` and `SpatialSmooth3()` are the same because in the case of Austria, each origin is also a destination and `SpatialSmooth2()` is one special case of `SpatialSmooth3()`. 

In [22]:
def CompeteDestination3(y, v1, v2, v3, X_df, pid, geomap, is_Spatial=True, is_Each =False, verbose=False):
    
    # build X
    def build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial, is_Each):
        
        Oi = X_df.iloc[:,v1]
        Wj = X_df.iloc[:,v2]
        Dij = X_df.iloc[:,v3]
        X = np.column_stack((Oi, Wj, Dij))
        
        if is_Spatial == True:
            #initialize the accissibility term with sigma set as -1
            init_Aij = SpatialSmooth3(v2, X_df, pid, geomap, 1, is_Each)
            X = np.hstack((X,init_Aij)) 
        return X
    
    X = build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial, is_Each)

    s_0 = np.log(np.mean(y))
    eta = s_0.reshape((-1, 1))
    s_old = np.zeros_like(X)
    crit = 9999
    sigma = np.inf
    
    # backfitting - inner loop
    def backfit3(y, X, w, v2, X_df, pid, geomap, is_Spatial, is_Each, verbose):
        n,k = X.shape
        betas = _compute_betas_gwr(y, np.log(X), w.reshape((-1, 1)))[0]
        #print(betas)
        XB = np.multiply(betas.T, np.log(X))
        yhat = np.dot(np.log(X), betas)
        err = y.reshape((-1, 1)) - yhat
        iters = 0
        scores = []
        delta = 1e6
        tol = 1e-8
        max_iter = 50
        sig = np.inf
        
        if is_Spatial == True:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))

                    if j == k-1:
                        score = lambda x: sm.OLS(temp_y, np.log(SpatialSmooth3(v2, X_df, pid, geomap, x, is_Each))).fit().aic
                        sig  = golden_section(-6, 6, 0.38197, score, 1e-2, 50)[0]
                        Aij = SpatialSmooth3(v2, X_df, pid, geomap, sig, is_Each)
                        X[:, j] = Aij.flatten()
                        temp_X = np.log(Aij).reshape((-1, 1))

                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break
                    
        elif is_Spatial == False:
            
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))
                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
                    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break

        return params, X, sig
    
    #local scoring - outer loop 
    while crit > 1e-8:
        
        mu = np.exp(eta).reshape((-1, 1))
        w = mu.reshape((-1,1))
        z = eta + ((y.reshape((-1, 1)) - mu) / mu)
        betas, X, sigma = backfit3(z, X, w, v2, X_df, pid, geomap,is_Spatial, is_Each, verbose)    
        s_new = np.multiply(betas.T, np.log(X))
        inner = np.sum((s_old - s_new)**2, axis=1)
        num = np.sum(w*inner)
        den = np.sum(w*np.sum((1 + s_old)**2, axis=1).reshape((-1, 1)))
        crit = num / den
        eta = np.sum(s_new, axis=1).reshape((-1, 1))
        s_old = s_new
        
    return betas, eta, sigma
    

In [23]:
Aij3 = SpatialSmooth3(5, X_df, 1, austria_shp, -1.35, is_Each =False)
X3 = np.column_stack((Oi, Wj, Dij, Aij3))
w = np.ones(72).reshape((-1,1))
betas3 = iwls(flows.reshape((-1,1)), np.log(X3), family=Poisson(), offset=w.reshape((-1, 1)), y_fix=None, wi=w.reshape((-1, 1)))[0]
f3 = np.exp(np.dot(np.log(X3), betas3) + np.random.normal(0, .0001, (72,1))).reshape((-1,1))
start3 = time.time()
betas_cd3, eta_cd3, sigma_cd3 = CompeteDestination3(f3, 4, 5, 6, X_df, 1, austria_shp, is_Spatial=True, is_Each = False, verbose=False)
end3 = time.time()

In [24]:
print('betas_iwls3:', betas3, 'betas_cd2:', betas_cd2,'sigma_cd2:', sigma_cd2, betas_cd3,'sigma_cd3:', sigma_cd3, 'Runtime for ComepeteDestination2():', end2 - start2, 'Runtime for ComepeteDestination3():', end3 - start3, sep='\n')

betas_iwls3:
[[ 0.62239105]
 [ 1.23615628]
 [-1.06479231]
 [-0.45072998]]
betas_cd2:
[[ 0.63758331]
 [ 1.38446578]
 [-1.04594236]
 [-0.59791932]]
sigma_cd2:
-1.05
[[ 0.66013255]
 [ 0.64521874]
 [-1.11226874]
 [ 0.03381349]]
sigma_cd3:
5.96
Runtime for ComepeteDestination2():
26.739388465881348
Runtime for ComepeteDestination3():
442.38308119773865


## Soulution

1. assume that the dataframe X_df must have those columns: 
origin names/codes; 
destination names/codes;
features of origin and destination

In [113]:
austria.head()

Unnamed: 0.1,Unnamed: 0,Origin,Destination,Data,Oi,Dj,Dij,Code
1,1,AT11,AT12,1131,4016,25741,103.001845,AT11AT12
2,2,AT11,AT13,1887,4016,26980,84.204666,AT11AT13
3,3,AT11,AT21,69,4016,4117,220.811933,AT11AT21
4,4,AT11,AT22,738,4016,8634,132.00748,AT11AT22
5,5,AT11,AT31,98,4016,8193,214.511814,AT11AT31


2. we will create a dataframe setup_df based on `austria`. In setup_df, we will have those columns: Code(e.g.,'AT11AT31'); Origin; Destination; Oi; Wj; Dij; mk; dkj. And we can calculate Aij by doing a matrix element-wise multiplication:
sum(X['mk']* (X['dkj'] ** sigma)

In [114]:
def setup_dist(v1, v2, X_df, pid, geomap):
    '''
    the index of the column 'Oi'
    the index of the column 'Wj'
    the index of the column storing the names/codes of the places in geomap
    geomap: it must have a column storing the names/codes of the origins and destination in X_df. 
            And the name/codes must be same as those in the column 'Origin' and 'Destination' in X_df.
    '''
        
    X_df['Code'] = X_df['Origin'] + X_df['Destination']
    v1_name = X_df.columns[v1]
    v2_name = X_df.columns[v2]
    X = X_df.copy()

    distances = sw.distance.DistanceBand.from_dataframe(geomap, threshold=99999, binary=False, alpha=1)
    distances = distances.full()[0]
    mask = distances!=0
    distances = distances[mask].reshape((-1,1))
    places = geomap.iloc[:,pid].values.reshape((-1,1))
        
    fcodes = [['DELETE']]
    for place in places:
        temp_places = np.tile(place, (len(places)-1,1))
        other_places = places[places!= place].reshape((-1,1))
        temp_fcodes = other_places + temp_places 
        fcodes = np.vstack((fcodes,temp_fcodes))
                
    fcodes = fcodes[fcodes != 'DELETE'].reshape((-1,1))
        
    distances_df = pd.DataFrame(np.hstack((fcodes, distances)), columns = ['Code', 'Distij'])
    X_distances_df = pd.merge(X, distances_df, how = 'left', on =  ['Code'])
    
    X_distances_df = X_distances_df[['Code','Origin', 'Destination', v1_name, v2_name, 'Distij']] 
    return v2_name, X_distances_df

def setup_Aij(v2_name, X_distances_df):
    
    '''
    return:
        a new dataframe with two new columns storing `mk` and `dkj` arrays with respect to each flow
    ------------------------------------------------------------------------------------------------
    We can calculate Aij based on the equation `Aij = sum(mk * (dkj ** sigma))`; 
    The implement is shown below:

        X['Aij'] = ((X['mk']* (X['dkj'] ** sigma)).apply(lambda x: sum(x).item()))
    
    '''
    
    X = X_distances_df.copy()

    for temp_or in X['Origin']: 
    
        #find all flows whose origin is temp_or
        temp_df = X[X['Origin'] == temp_or]
    
        #all destinations whose origin is temp_or
        temp_j = temp_df['Destination'].values
    
        #calculate aij for each destination temp_de
        for temp_de in temp_j:
        
            #each flow's code
            temp_code = temp_or + temp_de
        
            #alternative destination with respect to temp_de
            alters = temp_j[temp_j != temp_de]
        
            #find distances between temp_de and all other alternative destinations and attractiveness of temp_de
            temp_df2 = X[X['Origin'] == temp_de]
            temp_df3 = temp_df2[temp_df2['Destination'].isin(alters)]
        
            mk = temp_df3[v2_name].values.reshape((-1,1))
            dkj = temp_df3['Distij'].values.reshape((-1,1))
            
            # create a list containing mk and dkj repectively
            
            X.loc[X['Code'] == temp_code, 'mk'] = X['Code'].apply(lambda x: mk if x == temp_code else None)
            X.loc[X['Code'] == temp_code, 'dkj'] = X['Code'].apply(lambda x: dkj if x == temp_code else None)
 
    return X

def SpatialSmooth(setup_df, sigma):
    X = setup_df.copy()
    X['Aij'] = ((X['mk']* (X['dkj'] ** sigma)).apply(lambda x: sum(x).item()))
    Aijs = pd.to_numeric(X['Aij'].values).reshape((-1,1))
    
    return Aijs


In [115]:
Dij = setup_dist(v1 = 4, v2 = 5, X_df = austria, pid = 1, geomap = austria_shp)[1]['Distij'].astype(float)
v2_name, setup_df = setup_dist(4, 5, austria, 1, austria_shp)
setup_df = setup_Aij(v2_name, setup_df)
setup_df.head(1)

Unnamed: 0,Code,Origin,Destination,Oi,Dj,Distij,mk,dkj
0,AT11AT12,AT11,AT12,4016,25741,1.060975,"[[26980], [4117], [8634], [8193], [4902], [395...","[[0.6287635503694933], [2.3683513544769976], [..."


In [120]:
def SpatialGAM(y, v1, v2, X_df, pid, geomap, is_Spatial = True, verbose = False):
    
    def setup_dist(v1, v2, X_df, pid, geomap):
        
        X_df['Code'] = X_df['Origin'] + X_df['Destination']
        v1_name = X_df.columns[v1]
        v2_name = X_df.columns[v2]
        X = X_df.copy()

        distances = sw.distance.DistanceBand.from_dataframe(geomap, threshold=99999, binary=False, alpha=1)
        distances = distances.full()[0]
        mask = distances!=0
        distances = distances[mask].reshape((-1,1))
        places = geomap.iloc[:,pid].values.reshape((-1,1))
        
        fcodes = [['DELETE']]
        for place in places:
            temp_places = np.tile(place, (len(places)-1,1))
            other_places = places[places!= place].reshape((-1,1))
            temp_fcodes = other_places + temp_places 
            fcodes = np.vstack((fcodes,temp_fcodes))
                
        fcodes = fcodes[fcodes != 'DELETE'].reshape((-1,1))
        
        distances_df = pd.DataFrame(np.hstack((fcodes, distances)), columns = ['Code', 'Distij'])
        X_distances_df = pd.merge(X, distances_df, how = 'left', on =  ['Code'])
    
        X_distances_df = X_distances_df[['Code','Origin', 'Destination', v1_name, v2_name, 'Distij']] 
        
        return v2_name, X_distances_df

    def setup_Aij(v2_name, X_distances_df):
    
        X = X_distances_df.copy()

        for temp_or in X['Origin']: 
    
            #find all flows whose origin is temp_or
            temp_df = X[X['Origin'] == temp_or]
    
            #all destinations whose origin is temp_or
            temp_j = temp_df['Destination'].values
    
            #calculate aij for each destination temp_de
            for temp_de in temp_j:
        
                #each flow's code
                temp_code = temp_or + temp_de
        
                #alternative destination with respect to temp_de
                alters = temp_j[temp_j != temp_de]
        
                #find distances between temp_de and all other alternative destinations and attractiveness of temp_de
                temp_df2 = X[X['Origin'] == temp_de]
                temp_df3 = temp_df2[temp_df2['Destination'].isin(alters)]
        
                mk = temp_df3[v2_name].values.reshape((-1,1))
                dkj = temp_df3['Distij'].values.reshape((-1,1))
            
                # create a list containing mk and dkj repectively
                X.loc[X['Code'] == temp_code, 'mk'] = X['Code'].apply(lambda x: mk if x == temp_code else None)
                X.loc[X['Code'] == temp_code, 'dkj'] = X['Code'].apply(lambda x: dkj if x == temp_code else None)

        return X #setup_df
    
    def SpatialSmooth(setup_df, sigma):
        X = setup_df.copy()
        X['Aij'] = ((X['mk']* (X['dkj'] ** sigma)).apply(lambda x: sum(x).item()))
        Aijs = pd.to_numeric(X['Aij'].values).reshape((-1,1))
        
        return Aijs

    # build X
    def build_X(v1, v2, X_df, pid, geomap, is_Spatial):
        
        v1_name = X_df.columns[v1]
        v2_name, setup_df = setup_dist(v1, v2, X_df, pid, geomap)
        setup_df = setup_Aij(v2_name, setup_df)
        
        Oi = pd.to_numeric(setup_df[v1_name].values).reshape((-1,1))
        Wj = pd.to_numeric(setup_df[v2_name].values).reshape((-1,1))
        Dij = pd.to_numeric(setup_df['Distij'].values).reshape((-1,1)) #X_df.iloc[:,v3]
        X = np.column_stack((Oi, Wj, Dij))
         
        if is_Spatial == True:
            #initialize the accissibility term with sigma set as -1
            init_Aij = SpatialSmooth(setup_df, -1)
            X = np.column_stack((Oi, Wj, Dij, init_Aij))
        
        return X, setup_df
    
    # backfitting - inner loop
    def backfit(y, X, w, setup_df, is_Spatial, verbose):
        n,k = X.shape
        
        betas = _compute_betas_gwr(y, np.log(X), w.reshape((-1, 1)))[0]
        # sm.OLS(y, np.log(X) * w.reshape((-1,1))).fit().params.reshape((-1,1))
        # print(betas)
        XB = np.multiply(betas.T, np.log(X))
        yhat = np.dot(np.log(X), betas)
        err = y.reshape((-1, 1)) - yhat
        iters = 0
        scores = []
        delta = 1e6
        tol = 1e-8
        max_iter = 50
        sig = np.inf
        
        if is_Spatial == True:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))

                    if j == k-1:
                        score = lambda x: sm.OLS(temp_y, np.log(SpatialSmooth(setup_df, x))).fit().aic
                        sig  = golden_section(-6, 6, 0.38197, score, 1e-2, 50)[0]
                        Aij = SpatialSmooth(setup_df, sig)
                        X[:, j] = Aij.flatten()
                        temp_X = np.log(Aij).reshape((-1,1))

                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #sm.OLS(temp_y, temp_X * w.reshape((-1,1))).fit().params.reshape((-1,1))
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break
                    
        elif is_Spatial == False:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))
                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
                    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break

        return params, X, sig
    

    X, setup_df = build_X(v1, v2, X_df, pid, geomap, is_Spatial)
    s_0 = np.log(np.mean(y))
    eta = s_0.reshape((-1, 1))
    s_old = np.zeros_like(X)
    crit = 9999
    sigma = 9999
    
    #local scoring - outer loop 
    while crit > 1e-8:
        
        mu = np.exp(eta).reshape((-1, 1))
        w = mu.reshape((-1,1))
        z = eta + ((y.reshape((-1, 1)) - mu) / mu)
        betas, X, sigma = backfit(z, X, w, setup_df, is_Spatial, verbose)
        s_new = np.multiply(betas.T, np.log(X))
        inner = np.sum((s_old - s_new)**2, axis=1)
        num = np.sum(w*inner)
        den = np.sum(w*np.sum((1 + s_old)**2, axis=1).reshape((-1, 1)))
        crit = num / den
        eta = np.sum(s_new, axis=1).reshape((-1, 1))
        s_old = s_new
        
    return betas, eta, sigma

In [121]:
Aij4 = SpatialSmooth(setup_df, -1.35)
X4 = np.column_stack((Oi, Wj, Dij, Aij4))
w = np.ones(72).reshape((-1,1))
betas4 = iwls(flows.reshape((-1,1)), np.log(X4), family=Poisson(), offset=w.reshape((-1, 1)), y_fix=None, wi=w.reshape((-1, 1)))[0]
f4 = np.exp(np.dot(np.log(X4), betas) + np.random.normal(0, .0001, (72,1))).reshape((-1,1))
start4 = time.time()
betas_cd4, eta_cd4, sigma_cd4 = SpatialGAM(f4, 4, 5, austria, 1, austria_shp, is_Spatial = True, verbose = False)
end4 = time.time()

In [122]:
print('betas_iwls4:', betas4, 'betas_cd4:', betas_cd4,'sigma_cd4:', sigma_cd4, 'Runtime for SpatialGAM():', end4 - start4,sep='\n')

betas_iwls4:
[[ 0.56357089]
 [ 0.96364052]
 [-0.75495023]
 [-0.62447807]]
betas_cd4:
[[ 0.56358224]
 [ 0.96367122]
 [-0.75493081]
 [-0.62451253]]
sigma_cd4:
-1.35
Runtime for SpatialGAM():
16.595179319381714


## Backup

### Codes below could also be found in `CDM+GAM_new.ipynb`

Codes below worked well. The way we calculate the accessibility in `SpatialSmooth()` and `CompeteDestination()` below is based on the codes `access()` from the file 'PossionGAM.py' and 'PossionGAM.ipynb', which Dr.Taylor Oshan sent to me

## Spatial Smooth

In [7]:
def SpatialSmooth(m, X_df, pid, geomap, sigma):
    
    '''
    m: the index of the column storing the quality of destination
    X_df: the dataframe containing the predictors columns; 
          by default, we assume X_df has the columns with those names:
            `Origin`: the name or code of origin
            `Destination`: the name or code of destination
            `Code`: `Origi` + `Destination` 
            `Oi`, 'Wj', 'Dij': No need to have the same column name
            `Dij`: In the future, users do not need to provide it, the distance between origin ande destination in a flow will be calculate by using geomap automatically
    
    pid: the index of the column storing the name of the place in geomap # NUTS_ID, 1 in austrai_shp
    geomap: the file with coordinates information to calculate the distance
    sigma: sigma in the formula of accessibility in CDM
    '''
    Aij = np.zeros_like(X_df.iloc[:,m]).reshape((-1,1))
    
    X = X_df.copy()
        
    #obtain the quality of all destinations, mk
    j_df = X.drop_duplicates(subset = ['Destination'])
    j_df = j_df.set_index(j_df['Destination']).sort_index()
    mk = j_df.iloc[:,m].values.reshape((-1,1))
        
    #obtain all destinations
    all_places = geomap.iloc[:,pid].values.reshape((-1,1)) 
    num_j = len(all_places)
        
    #create the flow codes
    temp_codes = [['DELETE']]
        
    for place in all_places:
        temp_places = np.tile(place, (num_j-1,1))
        other_places = all_places[all_places != place].reshape((-1,1))
        r = temp_places + other_places
        temp_codes = np.vstack((temp_codes,r))
                
    temp_codes = temp_codes[temp_codes != 'DELETE'].reshape((-1,1))
     
    #calculate the accessibility
    W = sw.distance.DistanceBand.from_dataframe(geomap, threshold=99999, binary=False, alpha=sigma)
    W = W.full()[0]
    mask = W!=0
    temp_Aij = (W * mk)
    temp_Aij = temp_Aij[mask].reshape((num_j,num_j-1))
    temp_Aij = np.tile(np.sum(temp_Aij, axis=1), num_j).reshape((num_j,num_j))
    temp_Aij = temp_Aij[mask].reshape((-1,1))

    temp_Aij = pd.DataFrame(np.hstack((temp_codes, temp_Aij)), columns = ['Code', 'Aij'])

    X_Aij_df = pd.merge(X, temp_Aij, how = 'left', on =  ['Code'])

    Aij = pd.to_numeric(X_Aij_df['Aij'].values).reshape((-1,1))

    return Aij

## CompeteDestination

In [8]:
def CompeteDestination(y, v1, v2, v3, X_df, pid, geomap, is_Spatial=True, verbose=False):
    
    '''
    y: the array of the response variable
    v1: the index of the Oi column
    v2: the index of the Wj column
    v3: the index of the Dij column
    X_df: the dataframe containing the predictors columns; 
          by default, we assume X_df has the columns with those names:
            `Origin`: the name or code of origin
            `Destination`: the name or code of destination
            `Code`: `Origi` + `Destination` 
            `Oi`, 'Wj', 'Dij': No need to have the same column name
            `Dij`: In the future, users do not need to provide it, the distance between origin ande destination in a flow will be calculate by using geomap automatically
            
    geomap: the file with coordinate info of origin and destination
            must have a column storing the name or code of origin and destination
    pid: the index of the must-have column in geomap
    is_Spatial: whether we need to create a spatial smooth, by default it is true
    '''
    
    # build X
    def build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial):
        
        Oi = X_df.iloc[:,v1]
        Wj = X_df.iloc[:,v2]
        Dij = X_df.iloc[:,v3]
        X = np.column_stack((Oi, Wj, Dij))
        
        if is_Spatial == True:
            #initialize the accissibility term with sigma set as -1
            init_Aij = SpatialSmooth(v2, X_df, pid, geomap, -1)
            X = np.hstack((X,init_Aij)) 
        return X
    
    X = build_X(v1, v2, v3, X_df, pid, geomap, is_Spatial)

    s_0 = np.log(np.mean(y))
    eta = s_0.reshape((-1, 1))
    s_old = np.zeros_like(X)
    crit = 9999
    sigma = np.inf
    
    # backfitting - inner loop
    def backfit(y, X, w, v2, X_df, pid, geomap, is_Spatial, verbose):
        n,k = X.shape
        betas = _compute_betas_gwr(y, np.log(X), w.reshape((-1, 1)))[0]
        #print(betas)
        XB = np.multiply(betas.T, np.log(X))
        yhat = np.dot(np.log(X), betas)
        err = y.reshape((-1, 1)) - yhat
        iters = 0
        scores = []
        delta = 1e6
        tol = 1e-8
        max_iter = 50
        sig = 9999
        
        if is_Spatial == True:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))

                    if j == k-1:
                        score = lambda x: sm.OLS(temp_y, np.log(SpatialSmooth(v2, X_df, pid, geomap, x))).fit().aic
                        sig  = golden_section(-6, 6, 0.38197, score, 1e-2, 50)[0]
                        Aij = SpatialSmooth(v2, X_df, pid, geomap, sig)
                        X[:, j] = Aij.flatten()
                        temp_X = np.log(Aij).reshape((-1, 1))

                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break
                    
        elif is_Spatial == False:
            for iters in range(1, max_iter + 1):
                new_XB = np.zeros_like(X)
                params = np.zeros_like(betas)

                for j in range(k):
                    temp_y = XB[:, j].reshape((-1, 1))
                    temp_y = temp_y + err.reshape((-1, 1))
                    temp_X = np.log(X[:, j]).reshape((-1, 1))
                    beta = _compute_betas_gwr(temp_y, temp_X, w.reshape((-1, 1)))[0]
                    #print(beta)
                    yhat = np.dot(temp_X, beta)
                    new_XB[:, j] = yhat.flatten()
                    err = (temp_y - yhat).reshape((-1, 1))
                    params[j, :] = beta[0][0]
                    
                num = np.sum((XB-new_XB)**2)
                den = 1 + np.sum(np.sum(XB, axis=1)**2)
                score = (num / den)
                XB = new_XB
        
                scores.append(deepcopy(score))
                delta = score

                if verbose:
                    print("Current iteration:", iters, ",SOC:", np.round(score, 8))
                if delta < tol:
                    break

        return params, X, sig
    
    #local scoring - outer loop 
    while crit > 1e-8:
        mu = np.exp(eta).reshape((-1, 1))
        w = mu.reshape((-1,1))
        z = eta + ((y.reshape((-1, 1)) - mu) / mu)
        betas, X, sigma = backfit(z, X, w, v2, X_df, pid, geomap,is_Spatial, verbose)
        s_new = np.multiply(betas.T, np.log(X))
        inner = np.sum((s_old - s_new)**2, axis=1)
        num = np.sum(w*inner)
        den = np.sum(w*np.sum((1 + s_old)**2, axis=1).reshape((-1, 1)))
        crit = num / den
        eta = np.sum(s_new, axis=1).reshape((-1, 1))
        s_old = s_new
        
    return betas, eta, sigma

## Test

In [9]:
austria = pd.read_csv('Data/austria.csv')
austria_shp = gp.read_file('Data/austria.shp')
Dja = austria['Dj'].values 

In [10]:
austria = austria[austria['Origin'] != austria['Destination']]
flows = austria['Data'].values 
Oi = austria['Oi'].values 
Wj = austria['Dj'].values 
Dij = austria['Dij'].values 

In [11]:
X_df = austria.copy()
X_df['Code'] = X_df['Origin'] + X_df['Destination']
X_df = X_df.loc[:, ~X_df.columns.str.contains('^Unnamed')]
X_df = X_df[['Code','Origin', 'Destination', 'Data', 'Oi', 'Dj', 'Dij']] 

In [16]:
Aij = SpatialSmooth(5, X_df, 1, austria_shp, -1.35)
X = np.column_stack((Oi, Wj, Dij, Aij))
w = np.ones(72).reshape((-1,1))
betas = iwls(flows.reshape((-1,1)), np.log(X), family=Poisson(), offset=w.reshape((-1, 1)), y_fix=None, wi=w.reshape((-1, 1)))[0]
f = np.exp(np.dot(np.log(X), betas))# + np.random.normal(0, .0001, (72,1))).reshape((-1,1))
betas_cd, eta_cd, sigma_cd = CompeteDestination(f, 4, 5, 6, X_df, 1, austria_shp, is_Spatial=True, verbose=False)

In [17]:
print('betas_iwls:', betas, 'betas_cd:', betas_cd,'sigma_cd:', sigma_cd, sep='\n')

betas_iwls:
[[ 0.71369558]
 [ 0.72409961]
 [-1.0584376 ]
 [-0.08370744]]
betas_cd:
[[ 0.71369558]
 [ 0.72409961]
 [-1.0584376 ]
 [-0.08370744]]
sigma_cd:
-1.35
