In [1]:
import os
import pandas as pd
import csv
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import numpy as np
import matplotlib as mpl
import seaborn as sns 
import json
from networkx.drawing.nx_agraph import graphviz_layout
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from shapely.geometry import Polygon
from keplergl import KeplerGl
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statistics 
from scipy.stats import chi2
from statsmodels.stats.multitest import multipletests
from collections import Counter

# Set paths for data

In [4]:
# Root folder of the SafeGraph datasets. Set the SAFEGRAPH_DATA_ROOT environment
# variable to run the notebook on another machine; the default is the original
# author's local folder, so existing behaviour is unchanged when it is not set.
DATA_ROOT = os.environ.get(
    'SAFEGRAPH_DATA_ROOT',
    '/Users/liqingchun/Google Drive/Dissertation/Dataset/Safegraph Data/',
)

# The trailing '' makes os.path.join end each path with a separator; later cells
# build sub-paths by plain string concatenation (e.g. path_network+'seattle/...').
path_harvey = os.path.join(DATA_ROOT, 'SafeGraph Raw Data', '')
path_network = os.path.join(path_harvey, 'OD_network', '')
path_core = os.path.join(DATA_ROOT, 'Safegraph Core data', 'CorePlacesMay2020Release-CORE_POI-2020_04-2020-05-06', '')
path_census = os.path.join(DATA_ROOT, 'safegraph_open_census_data', 'metadata', '')
# Apply matplotlib's 'classic' style sheet (a global pyplot setting).
plt.style.use('classic')

## Read Core Places and Census Pickles

In [5]:
# Load the pickled Core Places table from path_core and keep only the POI id,
# category, brand and coordinate columns.
# NOTE(review): os.chdir changes the working directory for the whole kernel, and
# pickle.load can execute arbitrary code -- only load pickles you created yourself.
os.chdir(path_core)
with open('core_pickle.pkl', 'rb') as file:
    geo_id = pickle.load(file)

geo_id = geo_id[['safegraph_place_id', 'top_category','brands', 'latitude', 'longitude']]

In [6]:
# Load the pickled census table from path_census (it is merged on 'cbg' further down).
# NOTE(review): changes the kernel's working directory; pickle.load is only safe
# on trusted files.
os.chdir(path_census)
with open('census_geo.pkl', 'rb') as file:
    census = pickle.load(file)

## Function

In [7]:
def _aligned_visit_totals(df1, df2, key):
    """Sum ``visits`` per ``key`` in both frames and align them on the sorted union of keys.

    A node that appears in only one frame gets a total of 0 in the other.
    Returns ``(totals_1, totals_2, all_nodes)``: two Series sharing one sorted
    index, and the set of every key seen in either frame.
    """
    totals_1 = df1[[key, 'visits']].groupby(key)['visits'].sum()
    totals_2 = df2[[key, 'visits']].groupby(key)['visits'].sum()
    all_nodes = set(totals_1.index) | set(totals_2.index)
    shared_index = pd.Index(sorted(all_nodes), name=key)
    return (totals_1.reindex(shared_index, fill_value=0),
            totals_2.reindex(shared_index, fill_value=0),
            all_nodes)


def _degree_change_pvalues(totals_1, totals_2):
    """Benjamini-Hochberg adjusted p-values for the per-node change in weighted degree.

    Each node's squared difference is divided by the mean squared difference and
    compared with a chi-square distribution with one degree of freedom.
    """
    squared_change = np.square(totals_1.to_numpy(dtype=float) - totals_2.to_numpy(dtype=float))
    if squared_change.size == 0:
        # No nodes at all: nothing to test.
        return np.empty(0, dtype=float)
    mean_squared_change = squared_change.mean()
    if mean_squared_change == 0:
        # Both snapshots are identical; avoid 0/0 and report "no change" everywhere.
        return np.ones(squared_change.shape, dtype=float)
    # chi2.sf is the survival function (1 - cdf) without the cancellation error
    # that `1 - chi2.cdf(...)` suffers for very small p-values.
    raw_p = chi2.sf(squared_change / mean_squared_change, df=1)
    return np.asarray(multipletests(raw_p, alpha=0.1, method='fdr_bh')[1])


def Degree_var(pick1, pick2, path=path_network):
    """Test which POIs and CBGs changed their weighted degree between two networks.

    Parameters
    ----------
    pick1, pick2 : str
        File names of two pickled edge tables. Each must provide the columns
        'safegraph_place_id', 'cbg' and 'visits'.
    path : str
        Folder containing both pickles. The working directory is switched to it
        (kept from the original implementation, in case callers rely on it).

    Returns
    -------
    tuple
        ``(id_var, cbg_var, total_id, total_cbg)`` where ``id_var`` has columns
        'safegraph_place_id' and 'pValue', ``cbg_var`` has columns 'cbg' and
        'pValue' (both sorted by node), and ``total_id`` / ``total_cbg`` are the
        sets of all POI ids / CBGs present in either network.

    Notes
    -----
    In-degree of a POI is its summed 'visits'; out-degree of a CBG is its summed
    'visits'. p-values are Benjamini-Hochberg adjusted.
    pickle.load can execute arbitrary code; only use it on trusted files.
    """
    os.chdir(path)
    with open(pick1, 'rb') as file:
        df1 = pickle.load(file)
    with open(pick2, 'rb') as file:
        df2 = pickle.load(file)

    # Weighted in-degree per POI and out-degree per CBG, aligned across both moments.
    in_1, in_2, total_id = _aligned_visit_totals(df1, df2, 'safegraph_place_id')
    out_1, out_2, total_cbg = _aligned_visit_totals(df1, df2, 'cbg')

    id_var = pd.DataFrame({
        'safegraph_place_id': in_1.index.to_numpy(),
        'pValue': _degree_change_pvalues(in_1, in_2),
    })
    cbg_var = pd.DataFrame({
        'cbg': out_1.index.to_numpy(),
        'pValue': _degree_change_pvalues(out_1, out_2),
    })

    return (id_var, cbg_var, total_id, total_cbg)

In [29]:
def Save_results(results, sub_path='new york/dataframes/', date='05-01', p_threshold=0.01):
    """Write the POIs and CBGs with a significant degree change to CSV files.

    Parameters
    ----------
    results : tuple
        Output of ``Degree_var``: ``(id_var, cbg_var, total_id, total_cbg)``.
    sub_path : str
        Folder, relative to the notebook-level ``path_network``, that receives
        the files. The working directory is switched to it.
    date : str
        Tag used in the output file names.
    p_threshold : float
        Rows with ``pValue <= p_threshold`` count as significant. Defaults to
        0.01, the value that was previously hard-coded.

    Files written
    -------------
    ``id_data_out_<date>.csv``
        Significant POIs joined with the notebook-level ``geo_id`` table.
    ``<date>_ID_category_with_significant_change.csv``
        Count of significant POIs per ``top_category`` (most frequent first),
        followed by three summary rows.
    ``<date>_cbg_with_significant_change.csv``
        Significant CBG rows, followed by two summary rows.
    """
    significant_ids = results[0][results[0].pValue <= p_threshold]
    significant_cbg = results[1][results[1].pValue <= p_threshold]
    # Default inner join: POIs without an entry in geo_id drop out here and are
    # reported as 'Miss value' below.
    merged_ids = significant_ids.merge(geo_id, on=['safegraph_place_id'])
    os.chdir(path_network+sub_path)
    merged_ids.to_csv('id_data_out_'+date+'.csv', index=False)

    # (category, count) pairs, most frequent category first.
    sorted_category = Counter(list(merged_ids.top_category)).most_common()
    with open(date+'_ID_category_with_significant_change.csv', 'w', newline='') as csvfile:
        list_writer = csv.writer(csvfile)
        list_writer.writerows(sorted_category)
        list_writer.writerow(('Total number of POIs with significant change', len(merged_ids)))
        list_writer.writerow(('Total number of POIs', len(results[2])))
        list_writer.writerow(('Miss value', len(significant_ids)-len(merged_ids)))

    with open(date+'_cbg_with_significant_change.csv', 'w', newline='') as csvfile:
        list_writer = csv.writer(csvfile)
        list_writer.writerows(significant_cbg.to_numpy())
        list_writer.writerow(('Total number of cbgs with significant change', len(significant_cbg)))
        list_writer.writerow(('Total number of cbgs', len(results[3])))

In [175]:
# Compare the Seattle networks of 2020-01-13 and 2020-04-27.
results = Degree_var('2020-01-13-seattle-cbg-home-poi-visits-network.pkl', '2020-04-27-seattle-cbg-home-poi-visits-network.pkl', path=path_network+'seattle/dataframes/')

In [176]:
# Write the significant POIs/CBGs to CSV files tagged '04-27_new' under seattle/dataframes/.
Save_results(results, sub_path='seattle/dataframes/', date='04-27_new')

In [None]:
# Rebuild the filtered frames at notebook scope, using the same steps as
# Save_results, so that this cell and the cells below run on a fresh kernel.
Safegraph_id = results[0][results[0].pValue <= 0.01]
cbg = results[1][results[1].pValue <= 0.01]
Safegraph_id_merge = Safegraph_id.merge(geo_id, on=['safegraph_place_id'])

# Category label of every significant POI that has an entry in geo_id.
category = list(Safegraph_id_merge.top_category)

In [None]:
# Count significant POIs per category and export the table, most frequent first.
# NOTE(review): this repeats what Save_results does, with the folder and date
# fixed to 'new york/dataframes/' and '05-11' -- confirm it is still needed.
cate_dict = Counter(category)
sorted_category = cate_dict.most_common()

os.chdir(path_network+'new york/dataframes/')
with open('05-11_ID_category_with_significant_change.csv', 'w', newline='') as csvfile:
    list_writer = csv.writer(csvfile)
    list_writer.writerows(sorted_category)
    # Summary rows appended after the per-category counts.
    list_writer.writerows([
        ('Total number of POIs with significant change', len(Safegraph_id_merge)),
        ('Total number of POIs', len(results[2])),
        ('Miss value', len(Safegraph_id)-len(Safegraph_id_merge)),
    ])

In [None]:
# Export the significant CBG rows followed by two summary rows.
# The file is written to the current working directory.
with open('05-11_cbg_with_significant_change.csv', 'w', newline='') as csvfile:
    list_writer = csv.writer(csvfile)
    list_writer.writerows(cbg.to_numpy())
    list_writer.writerows([
        ('Total number of cbgs with significant change', len(cbg)),
        ('Total number of cbgs', len(results[3])),
    ])

In [None]:
# Create a kepler.gl map widget (height=400); the bare name on the last line displays it.
map_1 = KeplerGl(height=400)
map_1

In [None]:
# Add the significant POIs to the map as dataset 'data_1'.
# Requires Safegraph_id_merge to exist at notebook scope.
map_1.add_data(data=Safegraph_id_merge, name='data_1')

In [None]:
# Join the significant CBGs with the census table on 'cbg'
# (default inner join: CBGs without a census row are dropped).
cbg_merge = cbg.merge(census, on=['cbg'])

In [None]:
# Add the significant CBGs (with census columns) to the map as dataset 'data_2'.
map_1.add_data(data=cbg_merge, name='data_2')

In [None]:
# Save the map as a standalone HTML file inside path_network.
# NOTE(review): the file is called 'houston_map.html' although the Degree_var
# call visible in this notebook uses Seattle data -- confirm the intended name.
os.chdir(path_network)
map_1.save_to_html(file_name='houston_map.html')

In [None]:
# Number of significant POIs that matched an entry in geo_id.
len(Safegraph_id_merge)

In [None]:
# Number of significant POIs before the merge with geo_id.
len(Safegraph_id)

In [None]:
# Number of significant CBGs that matched a census row.
len(cbg_merge)

In [None]:
# Total number of POIs present in either network (third element returned by Degree_var).
len(results[2])

In [None]:
# Total number of CBGs present in either network (fourth element returned by Degree_var).
len(results[3])

In [132]:
# Load the Seattle network of 2019-12-30.
# NOTE(review): the variable is named *_newyork although the file is a Seattle
# network -- confirm which city this section is meant to analyse.
os.chdir(path_network)
with open('./seattle/dataframes/2019-12-30-seattle-cbg-home-poi-visits-network.pkl', 'rb') as file:
    df1_newyork = pickle.load(file)

In [133]:
# Load the Seattle network of 2020-04-27. The relative path depends on the
# working directory being path_network (set in the previous cell).
# NOTE(review): named *_newyork although the file is a Seattle network.
with open('./seattle/dataframes/2020-04-27-seattle-cbg-home-poi-visits-network.pkl', 'rb') as file:
    df2_newyork = pickle.load(file)

In [134]:
# Keep only the rows of the first network flagged as hotspot destinations.
df1_newyork_hotspots = df1_newyork[df1_newyork.hotspot_destination==True]

In [135]:
# Keep only the rows of the second network flagged as hotspot destinations.
df2_newyork_hotspots = df2_newyork[df2_newyork.hotspot_destination==True]

In [136]:
# Attach the geo_id columns (category, brand, coordinates) to each hotspot row.
# NOTE(review): the result overwrites its own input, so re-running this cell
# merges geo_id a second time.
df1_newyork_hotspots = df1_newyork_hotspots.merge(geo_id, on=['safegraph_place_id'])

In [137]:
# Attach the geo_id columns (category, brand, coordinates) to each hotspot row.
# NOTE(review): the result overwrites its own input, so re-running this cell
# merges geo_id a second time.
df2_newyork_hotspots = df2_newyork_hotspots.merge(geo_id,on=['safegraph_place_id'])

In [138]:
# Number of hotspot rows per top_category in the first network.
df1_cate = Counter(list(df1_newyork_hotspots.top_category))

In [139]:
# Number of hotspot rows per top_category in the second network.
df2_cate = Counter(list(df2_newyork_hotspots.top_category))

In [140]:
# Category counts of the first network, most frequent first.
df1_cate.most_common()

[('Restaurants and Other Eating Places', 1303),
 ('Museums, Historical Sites, and Similar Institutions', 533),
 ('Other Amusement and Recreation Industries', 316),
 ('Grocery Stores', 120),
 ('Sporting Goods, Hobby, and Musical Instrument Stores', 113),
 ('Health and Personal Care Stores', 105),
 ('Traveler Accommodation', 100),
 ('Gasoline Stations', 93),
 ('Other Miscellaneous Store Retailers', 62),
 ('Elementary and Secondary Schools', 58),
 ('Clothing Stores', 57),
 ('Child Day Care Services', 53),
 ('Religious Organizations', 53),
 ('Building Material and Supplies Dealers', 47),
 ('Used Merchandise Stores', 41),
 ('Office Supplies, Stationery, and Gift Stores', 38),
 ('Offices of Other Health Practitioners', 37),
 ('Offices of Physicians', 35),
 ('Lessors of Real Estate', 31),
 ('Book Stores and News Dealers', 30),
 ('Offices of Dentists', 29),
 ('Personal Care Services', 25),
 ('Electronics and Appliance Stores', 24),
 ('Building Equipment Contractors', 24),
 ('Drinking Places (A

In [141]:
# Category counts of the second network, most frequent first.
df2_cate.most_common()

[('Restaurants and Other Eating Places', 446),
 ('Museums, Historical Sites, and Similar Institutions', 335),
 ('Other Amusement and Recreation Industries', 118),
 ('Grocery Stores', 73),
 ('Health and Personal Care Stores', 48),
 ('Gasoline Stations', 47),
 ('Sporting Goods, Hobby, and Musical Instrument Stores', 37),
 ('Offices of Physicians', 28),
 ('Building Material and Supplies Dealers', 27),
 ('Child Day Care Services', 26),
 ('Elementary and Secondary Schools', 25),
 ('Other Miscellaneous Store Retailers', 24),
 ('Lessors of Real Estate', 24),
 ('Traveler Accommodation', 19),
 ('Office Supplies, Stationery, and Gift Stores', 17),
 ('Used Merchandise Stores', 15),
 ('Clothing Stores', 15),
 ('Offices of Other Health Practitioners', 14),
 ('General Medical and Surgical Hospitals', 14),
 ('Other Motor Vehicle Dealers', 12),
 ('Religious Organizations', 12),
 ('Specialty Food Stores', 12),
 ('Personal Care Services', 12),
 ('Florists', 10),
 ('Book Stores and News Dealers', 10),
 (

In [142]:
# Number of hotspot rows in the second network.
len(df2_newyork_hotspots.top_category)

1606

In [143]:
# Number of hotspot rows in the first network.
len(df1_newyork_hotspots.top_category)

3822

In [146]:
# Keep only the POI id, category and visit count of each hotspot row.
df2_newyork_hotspots = df2_newyork_hotspots[['safegraph_place_id', 'top_category', 'visits']]

In [147]:
# Keep only the POI id, category and visit count of each hotspot row.
df1_newyork_hotspots = df1_newyork_hotspots[['safegraph_place_id', 'top_category', 'visits']]

In [148]:
# Total hotspot visits per category in the first network, smallest first.
# 'visits' is selected explicitly: the frame still carries the string column
# safegraph_place_id, which recent pandas versions would otherwise try to sum too.
df1_newyork_hotspots.groupby(['top_category'])[['visits']].sum().reset_index(drop=False).sort_values("visits")

Unnamed: 0,top_category,visits
44,Household Appliances and Electrical and Electr...,5
64,Other Miscellaneous Manufacturing,5
18,Consumer Goods Rental,5
59,Offices of Real Estate Agents and Brokers,5
21,Death Care Services,5
...,...,...
84,"Sporting Goods, Hobby, and Musical Instrument ...",802
38,Grocery Stores,1041
60,Other Amusement and Recreation Industries,2279
53,"Museums, Historical Sites, and Similar Institu...",4677


In [149]:
# Total hotspot visits per category in the second network, smallest first.
# 'visits' is selected explicitly: the frame still carries the string column
# safegraph_place_id, which recent pandas versions would otherwise try to sum too.
df2_newyork_hotspots.groupby(['top_category'])[['visits']].sum().reset_index(drop=False).sort_values("visits")

Unnamed: 0,top_category,visits
58,Other Transit and Ground Passenger Transportation,5
18,"Data Processing, Hosting, and Related Services",5
27,Employment Services,5
16,Commercial and Industrial Machinery and Equipm...,5
64,RV (Recreational Vehicle) Parks and Recreation...,5
...,...,...
36,Health and Personal Care Stores,303
35,Grocery Stores,512
52,Other Amusement and Recreation Industries,725
47,"Museums, Historical Sites, and Similar Institu...",2439


In [130]:
# Total hotspot visits to 'Traveler Accommodation' in the second network.
# 'visits' is selected explicitly so the string id column is never summed.
# NOTE(review): this cell's execution count ([130]) is lower than that of the
# cells that load the data ([132]+), so the saved output may stem from an
# earlier run on other data -- re-run after Restart & Run All.
df2_newyork_hotspots[df2_newyork_hotspots.top_category == 'Traveler Accommodation'].groupby(['top_category'])[['visits']].sum().reset_index(drop=False)

Unnamed: 0,top_category,visits
0,Traveler Accommodation,747


In [131]:
# Total hotspot visits to 'Traveler Accommodation' in the first network.
# 'visits' is selected explicitly so the string id column is never summed.
# NOTE(review): this cell's execution count ([131]) is lower than that of the
# cells that load the data ([132]+), so the saved output may stem from an
# earlier run on other data -- re-run after Restart & Run All.
df1_newyork_hotspots[df1_newyork_hotspots.top_category == 'Traveler Accommodation'].groupby(['top_category'])[['visits']].sum().reset_index(drop=False)

Unnamed: 0,top_category,visits
0,Traveler Accommodation,2855


In [12]:
# Scratch check: np.linalg.norm of a scalar gives its absolute value (3.0 here),
# so the squared norm used in Degree_var is the squared difference.
np.linalg.norm(4-7)

3.0