In [1]:
import os
import tqdm
import pickle
import gensim
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas.tools import sjoin
from gensim.models.doc2vec import TaggedDocument

In [2]:
# Settings
# Root folder that every data path in this notebook is built from
data_dir = "./LD_data/"

# Flags: when True the cached artefact is loaded from disk instead of being recomputed
sequences_exist = True
model_exist = True
distance_exist = True
adjacency_exist = True

# Seeds
seed = 1
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably does not change hashing in the already running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

## Bike points location data

In [3]:
# Source files for the bike point locations
locs_csv_path = data_dir + 'org_data/bike_point_locations.csv'
locs_shp_path = data_dir + 'intermediate_results/bike_point_locations_saved.shp'

# DataFrame 786 * (index, name, id, lat, lon, capacity)
df_locs = pd.read_csv(locs_csv_path)
# GeoDataFrame 786 * (index, name, id, capacity, geometry) proj='epsg:27700'
gdf_locs = gpd.read_file(locs_shp_path)

## Origin-destination data

In [4]:
# Source files for the origin-destination flows
od_csv_path = data_dir + "org_data/bike_od.csv"
od_mx_path = data_dir + "final_input/bike_od_mx.npy"

# DataFrame 8,115,378 * (time, index_start, index_end, count)
# index_start / index_end refer to the `index` column of df_locs / gdf_locs
df_od = pd.read_csv(od_csv_path)
# Ndarray (786,786) - daily average; row/column position refers to the same `index`
od_mx = np.load(od_mx_path)

## Spatial units

In [5]:
# Spatial units, one row per zone.
# Original schema note: GeoDataFrame 786 * (index, name, id, capacity, geometry)
# NOTE(review): that schema reads like the bike-point one - confirm. Only the
# `index` and `geometry` columns are used by the later cells of this notebook.
bd_shp_path = data_dir + "intermediate_results/gdf_BD.shp"
gdf_BD = gpd.read_file(bd_shp_path)

## POI data

In [6]:
poiclass_csv_path = data_dir + "org_data/POI_CLASSIFICATION.csv"
poi_shp_path = data_dir + "intermediate_results/poi.shp"

# Classification lookup (Group-Category-Class):
# (class_code, group, category, class, group_des, category_des, class_des)
df_poiclass = pd.read_csv(poiclass_csv_path)

# GeoDataFrame 490,626 * (pointx_cla, geometry); pointx_cla -> class_code
gdf_poi = gpd.read_file(poi_shp_path)

# Attach the classification to every POI (left join keeps all POIs) ...
gdf_poi = gdf_poi.merge(df_poiclass, how='left', left_on='pointx_cla', right_on='class_code')
# ... and keep only what is needed later: 490,626 * (class, class_des, geometry)
gdf_poi = gdf_poi[["class", "class_des", "geometry"]]

## Land use

In [7]:
# Land-use area per zone: DataFrame 786 * (cat1, cat2, ..., cat8)
luarea_csv_path = data_dir + "final_input/df_LUarea.csv"
df_LUarea = pd.read_csv(luarea_csv_path)

## Generate sequences

In [8]:
def _greedy_insertion_order(dismat):
    """Order the points of a pairwise-distance matrix by greedy insertion.

    The route is seeded with the two points that are farthest apart. The
    remaining points are taken in random order (``np.random.shuffle``) and each
    one is inserted between the pair of consecutive route points for which the
    summed distance to both is smallest.

    Parameters
    ----------
    dismat : np.ndarray
        Square matrix of pairwise distances.

    Returns
    -------
    list
        Row/column indices of ``dismat`` in visiting order, each exactly once.
    """
    pnt_num = dismat.shape[0]
    start, end = np.unravel_index(np.argmax(dismat, axis=None), dismat.shape)
    # When no off-diagonal distance exceeds zero (all points coincide) argmax
    # lands on the diagonal; seed with one point so it is not visited twice.
    visited = [start] if start == end else [start, end]
    # list of to be visited points
    not_visited = [x for x in range(pnt_num) if x not in visited]
    np.random.shuffle(not_visited)
    while not_visited:
        to_be_visit = not_visited.pop()
        if len(visited) < 2:
            # no pair to insert between yet (degenerate seed above)
            visited.append(to_be_visit)
            continue
        search_bound = zip(visited[0:-1], visited[1:])
        dis = [dismat[to_be_visit, x] + dismat[to_be_visit, y]
               for x, y in search_bound]
        # first minimum wins; +1 puts the point between the pair, never at the ends
        visited.insert(dis.index(min(dis)) + 1, to_be_visit)
    return visited


def get_sequences_by_distancegreedy(gdf_tz, gdf_poi, minpois = 1):
    """Build one spatially ordered sequence of POI classes per zone.

    POIs are assigned to the zone they lie within, then ordered inside each
    zone by ``_greedy_insertion_order`` on their pairwise Euclidean distances.
    The ordering uses ``np.random.shuffle``, so results depend on the global
    NumPy random state.

    Parameters
    ----------
    gdf_tz : GeoDataFrame
        Zones; its index values become the keys of the result.
    gdf_poi : GeoDataFrame
        Point POIs with a ``class`` column.
    minpois : int, default 1
        Zones containing ``minpois`` POIs or fewer are skipped.

    Returns
    -------
    dict
        ``{zone index: np.ndarray of POI "class" values in visiting order}``.
    """
    # Drop the index name so the zone key in the join result is called
    # "index_right". NOTE(review): newer geopandas reportedly keeps a named
    # right index under its own name instead - confirm against installed version.
    zones = gdf_tz.rename_axis(index=None)
    try:
        df_join = sjoin(gdf_poi, zones, how="inner", predicate="within")
    except TypeError:
        # older geopandas only knows the former keyword name `op`
        df_join = sjoin(gdf_poi, zones, how="inner", op="within")

    sequences = {}
    for tz_ind in tqdm.tqdm(df_join["index_right"].unique()):
        # positional 0..n-1 index so that distance-matrix indices can be used with .loc
        tz_pois = df_join[df_join["index_right"] == tz_ind].reset_index(drop=True)
        if tz_pois.shape[0] <= minpois:
            continue
        # x + iy per point: |z_i - z_j| is the Euclidean distance between i and j
        z = np.array([[complex(g.x, g.y) for g in tz_pois.geometry]])
        dismat = abs(z.T - z)
        visited = _greedy_insertion_order(dismat)
        sequences[tz_ind] = tz_pois.loc[visited, "class"].values
    return sequences

# Zone -> POI-class sequences: reuse the cached file or rebuild and cache it
sequences_path = data_dir + "intermediate_results/Sequences_greedy.npy"
if sequences_exist:
    # the dict is stored through np.save, so it is read back with
    # allow_pickle=True and unwrapped with .item()
    sequences = np.load(sequences_path, allow_pickle=True).item()
else:
    zones_by_index = gdf_BD.set_index('index')
    sequences = get_sequences_by_distancegreedy(zones_by_index, gdf_poi)
    np.save(sequences_path, sequences)

## Doc2Vec Embedding

In [9]:
if not model_exist:
    # One tagged document per zone: the words are the POI classes (as strings)
    # in sequence order, the tag is "d<zone index>".
    corpus = [TaggedDocument([str(x) for x in words], [f'd{idx}'])
              for idx, words in sequences.items()]
    model = gensim.models.doc2vec.Doc2Vec(
        dm=1,
        vector_size=72,
        dm_mean=1,
        window=5,
        # NOTE(review): dbow_words is documented as a DBOW-mode (dm=0) option;
        # presumably it has no effect with dm=1 - kept to leave the stored
        # model parameters unchanged.
        dbow_words=1,
        min_count=1,
        epochs=100,
        # use the notebook-wide seed instead of a second hard-coded copy
        seed=seed,
        # single worker thread, as in the original call
        workers=1,
    )
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(data_dir+"intermediate_results/doc2vec_model")
else:
    model = gensim.models.doc2vec.Doc2Vec.load(data_dir+"intermediate_results/doc2vec_model")

In [10]:
if not model_exist:
    # Fetch every zone vector by its document tag ("d<zone index>", the tag
    # format used when the corpus was built) rather than assuming that the
    # rows of model.dv.vectors are stored in the order of `sequences`.
    zone_ids = list(sequences)
    zone_vectors = np.vstack([model.dv[f"d{idx}"] for idx in zone_ids])
    df_zonevec = pd.DataFrame(
        zone_vectors,
        index=pd.Index(zone_ids, name="index"),
        columns=[f"ZoneVec_{i}" for i in range(1, zone_vectors.shape[1] + 1)],
    ).sort_index()
    df_zonevec.to_csv(data_dir+"final_input/df_zonevec.csv")
else:
    # NOTE(review): the file is written with the zone index as its first
    # column, so this frame carries `index` as a regular column while the
    # branch above uses it as the DataFrame index - confirm which layout the
    # downstream cells expect before unifying them.
    df_zonevec = pd.read_csv(data_dir+"final_input/df_zonevec.csv")

## Distance

In [11]:
if not distance_exist:
    # Compute the zone centroids once instead of once per row inside the lambda
    centroids = gdf_BD.geometry.centroid
    # Pairwise centroid-to-centroid distances (in the units of gdf_BD's CRS),
    # rounded before use so the in-memory matrix holds the same values as the
    # cached file that the else-branch loads.
    df_dis = centroids.apply(lambda g: centroids.distance(g)).round(0)
    df_dis.to_csv(data_dir+"final_input/df_dis.csv",index=False)
else:
    # The file is written without its index; read_csv returns the column
    # labels as strings.
    df_dis = pd.read_csv(data_dir+"final_input/df_dis.csv")

## Neighbor

In [12]:
# Adjacency List [[index,index,weight],...]
edge_path = data_dir + "final_input/ls_edge.pkl"
if adjacency_exist:
    # reuse the cached edge list (a pickle written by the branch below)
    with open(edge_path, "rb") as f:
        ls_edge = pickle.load(f)
else:
    ls_edge = []
    for _, zone in gdf_BD.iterrows():
        # every zone whose geometry is not disjoint from this one (includes the zone itself)
        not_disjoint = ~gdf_BD.geometry.disjoint(zone.geometry)
        # keep each undirected pair once (smaller index first); this also drops
        # the self-pair. All edges get weight 1.
        ls_edge.extend(
            [zone['index'], other, 1]
            for other in gdf_BD.loc[not_disjoint, 'index'].tolist()
            if zone['index'] < other
        )
    with open(edge_path, "wb") as f:
        pickle.dump(ls_edge, f)

## Summary (input - interaction - output)

### Ⅰ - input (property - Doc2Vec Embedding)

In [None]:
# 786 * 72
# df_zonevec

### Ⅱ - interaction (flow/distance/adjacency)

In [14]:
# Flow (786,786) Daily Average
# od_mx
# Distance-decay (786,786)
# df_dis
# Adjacency [2150*[index,index,1]]
# ls_edge

### Ⅲ - output (urban function - LU area)

In [None]:
# Land Use (786,8)
# df_LUarea