In [None]:
import numpy as np
import pandas as pd
import networkx as nx


from dateutil import tz, parser
from datetime import date, time, datetime, timedelta
from dateutil.parser import parse

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.patches import Patch
from matplotlib.dates import MonthLocator, DateFormatter, DayLocator, WeekdayLocator
from matplotlib.ticker import NullFormatter
import matplotlib.transforms as transforms
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.lines import Line2D

import seaborn as sns
import geopandas as gpd
# import contextily as cx

plt.rcParams.update({'font.size': 12})
pd.set_option('display.max_columns', None)

import s3fs
s3 = s3fs.S3FileSystem()

from tqdm.notebook import tqdm
import gzip
import pickle
import pdb
from functools import reduce
from itertools import product
from datetime import datetime, timedelta
from scipy import sparse, hstack
from scipy.sparse import csr_matrix
import math

import pdb
import sys
import os
import re
import glob
import multiprocessing as mp
import uuid
import numpy.random as npr
import math
import itertools
import copy
import time
import scipy.stats as sps
from progressbar import ProgressBar
import boto3

# Load data and build the networks

## Foursquare network

In [None]:
files = s3.glob("s3://fsq-phl-data/visits/local_date=*/part*.csv")

# Map every part file to the date encoded in its `local_date=YYYY-MM-DD` path component
date_map = {f: f.split("=")[1][:10] for f in files}


def _read_visits_part(path):
    """Read one visits part file and tag its rows with the source path and date.

    Each file is read exactly once; the path and partition date are attached
    before concatenation, so no second pass over S3 is needed to work out
    which rows came from which file.
    """
    with s3.open(path) as handle:
        part = pd.read_csv(handle)
    part['file_path'] = path
    part['local_date'] = date_map[path]
    return part


# Read all parts and concatenate (the per-part row index is kept, as before)
fsq_visits = pd.concat([_read_visits_part(f) for f in files])

# convert to census tract: drop the last character of the block-group id
fsq_visits['home_cbg'] = fsq_visits['home_cbg'].astype(str).str[:-1]

In [None]:
# Foursquare venue (POI) snapshot, restricted to Philadelphia
venue_files = s3.glob("s3://fsq-full-data/pennu/venues/dt=2022-11-10/part*.csv.gz")

# Every part is a gzip-compressed, tab-separated file; malformed lines are dropped
venue_dfs = []
for venue_path in venue_files:
    venue_dfs.append(
        pd.read_csv(s3.open(venue_path), compression="gzip", error_bad_lines=False, sep="\t")
    )
venue_df = pd.concat(venue_dfs)

in_philadelphia_city = venue_df['city'] == 'Philadelphia'
fsq_phl = venue_df[in_philadelphia_city]

In [None]:
# Keep only visits to Philadelphia venues. The explicit copy makes the result an
# independent frame, so adding a column below does not raise
# SettingWithCopyWarning on a filtered slice.
fsq_visits = fsq_visits[fsq_visits['venue_id'].isin(fsq_phl['venue_id'])].copy()
# dwell / 3600 -- presumably seconds converted to hours (TODO confirm the unit of `dwell`)
fsq_visits['dwell_hours'] = fsq_visits['dwell'] / 3600

In [None]:
# explode time hourly

# def duration_time_daily(start, duration):
#     return [start + timedelta(days=i) for i in range(duration.days + 1)]

def duration_time(start, duration):
    """Return the hourly timestamps spanned by a stay.

    Parameters:
        start: datetime-like start of the stay.
        duration: timedelta-like length of the stay.

    Returns:
        A list ``[start, start + 1h, ..., start + k h]`` where ``k`` is the
        number of whole hours in ``duration``. The list always contains at
        least ``start``.

    ``total_seconds()`` is used instead of ``.seconds`` because ``.seconds``
    only holds the sub-day remainder, which truncated stays of 24 hours or more.
    """
    whole_hours = max(int(duration.total_seconds() // 3600), 0)
    return [start + timedelta(hours=i) for i in range(whole_hours + 1)]

def explode_time(data):
    """Expand each visit into one row per hour it spans.

    Adds ``start``, ``end`` and ``datetime`` columns to ``data`` (the input
    frame is modified), explodes the per-visit list of hourly timestamps into
    separate rows, resets the index and removes duplicate rows.
    """
    visit_start = pd.to_datetime(data['local_date']) + pd.to_timedelta(data['local_hour'], unit='h')
    data['start'] = visit_start
    data['end'] = data['start'] + pd.to_timedelta(data['dwell_hours'], unit='h')

    def _hours_of(row):
        # Hourly timestamps covered by a single visit
        return duration_time(row['start'], pd.to_timedelta(row['dwell_hours'], unit='h'))

    data['datetime'] = data.apply(_hours_of, axis=1)
    hourly = data.explode('datetime').reset_index(drop=True)
    return hourly.drop_duplicates()

# One row per visit-hour. explode_time assigns its helper columns on the frame it
# receives, so fsq_visits itself also gains columns here.
exploded_fsq_visits = explode_time(fsq_visits)
exploded_fsq_visits.tail()

In [None]:
# Derive a date string and an hour of day (0 to 23) from every hourly timestamp
hourly_stamps = exploded_fsq_visits['datetime']
exploded_fsq_visits['date'] = hourly_stamps.apply(lambda stamp: str(stamp.date()))
exploded_fsq_visits['hour'] = hourly_stamps.apply(lambda stamp: stamp.hour)

# Columns that are not needed once visits are expressed per hour
obsolete_columns = ['utc_date', 'utc_hour', 'start', 'end', 'local_date', 'local_hour', 'dwell']
exploded_fsq_visits.drop(columns=obsolete_columns, inplace=True)
exploded_fsq_visits.head()

In [None]:
# Sum the reweighted score per (venue, date, home tract, hour) and call it `count`
visit_keys = ['venue_id', 'date', 'home_cbg', 'hour']
fsq_visits_groupby = (
    exploded_fsq_visits
    .groupby(visit_keys)
    .agg({'full_panel_reweighted_sag_score': sum})
    .reset_index()
    .rename(columns={'full_panel_reweighted_sag_score': 'count'})
)
fsq_visits_groupby.head()

In [None]:
cbg_population = pd.read_csv(
    's3://upenn-seas-wattscovid19lab/paco/acs_vars/safegraph_open_census_data/data/data/cbg_b01.csv',
    dtype={'census_block_group': str},
)

# Tract id = block-group id without its last character; sum B01003e1 per tract
tract_ids = cbg_population['census_block_group'].str[:-1].rename('census_tract')
population = (
    cbg_population['B01003e1']
    .astype(int)
    .rename('population')
    .groupby(tract_ids)
    .sum()
)

# Keep tracts whose id starts with 42101 and whose population exceeds 5
has_phl_prefix = population.index.str.startswith('42101')
population = population[has_phl_prefix & (population > 5)]

full_census_tract_list = list(population.index)

In [None]:
# take 4/1 hour 1 as an example
specific_date = '2020-04-01'
hour = 1
df = fsq_visits_groupby[(fsq_visits_groupby['date']==specific_date) & (fsq_visits_groupby['hour']==hour)]
df = df[df.home_cbg.isin(full_census_tract_list)]

# make into matrix from CTs to POIs
df_matrix = pd.pivot_table(df, values='count', index='home_cbg', columns='venue_id', fill_value=0)

# Add all-zero rows for tracts without visits. Reindexing on the full tract list
# puts the rows in the same order as the labels attached below, so rows and
# labels are aligned by construction rather than by a separate sort.
df_matrix_combined = df_matrix.reindex(index=full_census_tract_list, fill_value=0)
matrix = df_matrix_combined.values

# M * M^T: make into CTs to CTs
result = matrix @ matrix.T

# Add census tract ids as the first column
result_df = pd.DataFrame(result, columns=full_census_tract_list)
result_df.insert(0, 'home_CT', full_census_tract_list)

result_df

In [None]:
# # export matrix on specific date or date range
# def matrix_date(date,optional_date=None):
#     if optional_date:
#         file_name = date+'_to_'+optional_date
#         fsq_visits_groupby['local_date_datetime'] = pd.to_datetime(fsq_visits_groupby['date_str'])
#         date = pd.to_datetime(date)
#         optional_date = pd.to_datetime(optional_date)
#         CBG_POI = fsq_visits_groupby[(fsq_visits_groupby['local_date_datetime'] >= date) & 
#                                      (fsq_visits_groupby['local_date_datetime'] <= optional_date)]
#         CBG_POI = CBG_POI.groupby(['venue_id','home_cbg']).sum().reset_index()
#     else:
#         file_name = date
#         CBG_POI = fsq_visits_groupby[fsq_visits_groupby['date_str']==date]
        
#     CBG_POI_matrix = pd.pivot_table(CBG_POI, values='count', index='home_cbg', columns='venue_id', fill_value=0)

#     # Reindex the matrix to include all CBGs and POIs
#     CBG_POI_matrix = CBG_POI_matrix.reindex(index=all_cbgs, columns=all_pois, fill_value=0)
# #     print(CBG_POI_matrix.shape)

#     CBG_POI_matrix.to_csv('s3://phl-poi-networks/fsq/'+file_name+'.csv')

In [None]:
# # export matrix on specific date or date range
# def two_index_date(date):
#     file_name = date+'_indexed'
#     CBG_POI = fsq_visits_groupby[fsq_visits_groupby['date_str']==date]
#     CBG_POI.set_index(['venue_id','home_cbg'],inplace=True)

#     CBG_POI.to_csv('s3://phl-poi-networks/fsq/'+file_name+'.csv')

In [None]:
# # select the intersection
# for i in range(1,8):
#     date ='2020-04-0' + str(i)
#     df = fsq_visits_groupby[fsq_visits_groupby['date_str']==date]
#     if i == 1:
#         old = set(df.home_cbg)
#     else:
#         new = set(df.home_cbg)
#         old = old & new
        
#     print(len(set(df.home_cbg)))
    
# intersect_CT = list(old)

In [None]:
def CT_to_CT(date, output_prefix='s3://phl-poi-networks/fsq/'):
    """Build, export and return the daily tract-to-tract network for one date.

    For each hour of ``date`` a tract x venue matrix ``M`` of visit counts is
    built from ``fsq_visits_groupby`` and ``M @ M.T`` is added to a daily total.

    Parameters:
        date: date string matching the ``date`` column of ``fsq_visits_groupby``.
        output_prefix: location prefix of the CSV; the file is written to
            ``output_prefix + date + '.csv'``.

    Returns:
        DataFrame with a leading ``home_CT`` column followed by one column per
        tract in ``full_census_tract_list``.
    """
    tracts = list(full_census_tract_list)
    # Size the accumulator from the tract list instead of a fixed 376
    n_tracts = len(tracts)
    result_matrix_daily = np.zeros((n_tracts, n_tracts))

    # Filter by date and tract once instead of once per hour
    day_visits = fsq_visits_groupby[fsq_visits_groupby['date'] == date]
    day_visits = day_visits[day_visits['home_cbg'].isin(tracts)]

    for hour in range(24):
        hour_visits = day_visits[day_visits['hour'] == hour]
        if hour_visits.empty:
            # An hour without visits contributes an all-zero matrix
            continue

        # make into matrix from CTs to POIs
        tract_by_poi = pd.pivot_table(hour_visits, values='count', index='home_cbg',
                                      columns='venue_id', fill_value=0)
        # Zero rows for tracts without visits, in the same order as the labels below
        matrix = tract_by_poi.reindex(index=tracts, fill_value=0).values

        # M * M^T: make into CTs to CTs
        result_matrix_daily += matrix @ matrix.T

    # Halve the diagonal (kept from the original; presumably to avoid
    # double-counting within-tract pairs -- TODO confirm)
    result_matrix_daily[np.diag_indices_from(result_matrix_daily)] /= 2

    # Add census tract ids
    result_df = pd.DataFrame(result_matrix_daily, columns=tracts)
    result_df.insert(0, 'home_CT', tracts)

    result_df.to_csv(output_prefix + date + '.csv')

    print('Finished exporting network: ', date)
    return result_df


    # Save matrix npz to s3
#     s3_client = boto3.client('s3')
#     bucket_name = 'phl-poi-networks'
#     matrix_bytes = matrix.tobytes()

#     s3_client.put_object(Body=matrix_bytes, Bucket=bucket_name, Key=file_name+'.npz')

In [None]:
# result_df holds whatever CT_to_CT returns; verify that it returns the exported
# frame (a function without a return statement yields None).
result_df = CT_to_CT('2020-04-01')
result_df

In [None]:
start = datetime.strptime("04-01-2020", "%m-%d-%Y")
end = datetime.strptime("04-30-2020", "%m-%d-%Y")
# +1 so the end date itself is included (previously the last day was dropped)
date_generated = [start + timedelta(days=offset) for offset in range((end - start).days + 1)]
# Use a name other than `date` so datetime.date is not shadowed
date_interval = [day.strftime("%Y-%m-%d") for day in date_generated]

In [None]:
# Export one network per day. The loop variable is not called `date`, so the
# `date` class imported from datetime is not overwritten at notebook level.
for day_str in date_interval:
    CT_to_CT(day_str)

In [None]:
# import numpy as np

# # Load the NPZ file
# pivot_table = pd.read_csv('s3://phl-poi-networks/fsq/2020-04-01.csv').iloc[:,1:]
# non_pivot_df = pd.melt(pivot_table, id_vars=['home_CT'], var_name='column_name', value_name='value')

# non_pivot_df

In [None]:
# # export the whole month of April and May
# matrix_date('2020-04-01','2020-04-30')
# matrix_date('2020-05-01','2020-05-31')

## Safegraph network

In [None]:
#We need to concatenate and groupby to add by hour, indices might be different
def add_df(x, y):
    """Add two objects indexed by two levels, matching entries by index.

    The inputs are stacked and then summed per (level 0, level 1) pair, so
    entries present in only one input are kept as they are.
    """
    stacked = pd.concat([x, y])
    return stacked.groupby(level=[0, 1]).sum()

#fix cbgs were incorrectly written when starting with 0, probably because they were kept as ints and not strings ¬¬
def fix_cbg(s):
    """Return a 12-character block-group id, restoring a lost leading zero.

    A 12-character string is returned unchanged and an 11-character string
    gets '0' prepended. Any other length stops the program through sys.exit.
    """
    n_chars = len(s)
    if n_chars not in (11, 12):
        sys.exit("Error, FIPS code is wrong")
    return s if n_chars == 12 else '0' + s

In [None]:
# Prefix shared by the network pickle and its CBG / POI id files
f = 'Philadelphia_Camden_Wilmington_PA_NJ_DE_MD'
net_file = f+"_2020-03-01_to_2020-05-02.pkl.gz"
cbgs_file = f+"_cbg_ids.csv"
pois_file = f+"_poi_ids.csv"

# IPython shell escape: copy the pickled network from S3 to ../net_file.pkl.gz
!aws s3 cp s3://stanford-networks/raw_pickles/$net_file ./../net_file.pkl.gz

In [None]:
pois = pd.read_csv('s3://stanford-networks/raw_pickles/'+pois_file)['safegraph_place_id']

# filter those POIs in Philadelphia
safegraph_phl = pd.read_csv("s3://upenn-seas-wattscovid19lab/paco/core_places/places-42101.csv")
# Build the lookup once. The previous version rebuilt a Python list and scanned
# it for every POI, which is quadratic in the number of ids.
phl_place_ids = set(safegraph_phl['safegraph_place_id'])
# Plain list of booleans, one per row of `pois` (also used to slice the sparse nets)
poi_boolean = [poi in phl_place_ids for poi in pois]
sg_poi = pois[poi_boolean]
sum(poi_boolean),len(poi_boolean)

In [None]:
cbg_fips = pd.read_csv('s3://stanford-networks/raw_pickles/'+cbgs_file, dtype={'cbg_id':'str'})
# Restore leading zeros that were lost when the ids were stored
cbg_fips['cbg_id'] = cbg_fips['cbg_id'].map(fix_cbg)
census_block_group = cbg_fips['cbg_id']
census_block_group.name = 'census_block_group'
# Tract id = block-group id without its last character
census_tracts = census_block_group.str[:-1].values

# filter those in Philadelphia
CT_boolean = np.fromiter((tract.startswith('42101') for tract in census_tracts), dtype=bool)
sg_ct = census_tracts[CT_boolean]
sum(CT_boolean),len(CT_boolean)

In [None]:
# np.unique returns the distinct tract ids in sorted order
sg_ct_unique = np.unique(sg_ct).tolist()

In [None]:
# Only the last expression of a cell is displayed, so the tuple on the next line
# is evaluated but not shown.
len(sg_ct),len(set(sg_ct))
len(np.unique(sg_ct))

In [None]:
#we load a list of sparse matrices corresponding to each hour in the time period
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with gzip.open('./../net_file.pkl.gz', "rb") as file:
    nets= pickle.load(file)
# Number of whole days, at 24 hourly matrices per day
T = len(nets)//24

In [None]:
nets[0] #.toarray().shape
# First hourly matrix: rows filtered with the POI mask, columns with the CBG mask
matrix = nets[0][poi_boolean, :][:,CT_boolean]

In [None]:
matrix

In [None]:
def groupby_tanspose(matrix):
    """Collapse block-group columns into tracts and return the tract-by-tract product.

    ``matrix`` is a sparse matrix whose columns are labelled with the
    module-level ``sg_ct`` tract ids. Columns with the same tract id are summed,
    giving a tract x row matrix ``A``; the function returns ``A @ A.T``.
    """
    labelled = pd.DataFrame(matrix.toarray(), columns=sg_ct)
    # Grouping the transposed frame by its index labels merges columns of the same tract
    tract_rows = labelled.T.groupby(level=0).sum().values

    # M * M^T: make into CTs to CTs
    return tract_rows @ tract_rows.T

In [None]:
#t represents each of the days we are going to operate on
backfill = True
first_date = datetime.strptime("2020-03-01",'%Y-%m-%d')
for t in tqdm(range(T)):
    day_net_name = 's3://phl-poi-networks/stanford/'+ (first_date + timedelta(days=t)).isoformat()[:10] + '.csv'

    check_exists = !aws s3 ls $day_net_name
    if (not backfill) and len(check_exists)>0:
        print("Network already exists")
        continue
 
    day_nets = [groupby_tanspose(nets[j][poi_boolean, :][:,CT_boolean]) for j in range(24*t, 24*(t+1))]

    m, n = day_nets[0].shape
    result = csr_matrix((m, n))

    for matrix in day_nets:
        result += matrix
    
#     ct_list = aggregated_df.index.tolist()
    result_df = pd.DataFrame(result, columns=sg_ct_unique)
    result_df['home_CT'] = sg_ct_unique
    home_CT_column = result_df.pop('home_CT')
    result_df.insert(0, 'home_CT', home_CT_column)
#     print(result_df.shape) 
#     break
    
    result_df.to_csv(day_net_name)
    print('Finished exporting network: ', day_net_name)

## Load stored networks

/multiscale_epidemic/stanford_nets.ipynb

In [None]:
# Month-day part of the file name; note this rebinds the notebook-level name `date`
date = '04-01'
fsq_path = 's3://phl-poi-networks/fsq/2020-'+date+'.csv'
sg_path = 's3://phl-poi-networks/stanford/2020-'+date+'.csv'

# NOTE(review): the first CSV column is dropped for Foursquare but kept for
# SafeGraph. Both writers in this notebook call to_csv with the default index,
# so both files presumably start with an unnamed index column -- confirm that
# the asymmetry is intended.
fsq_network = pd.read_csv(fsq_path).iloc[:,1:]
sg_network = pd.read_csv(sg_path)

In [None]:
fsq_network.shape, sg_network.shape

In [None]:
fsq_network.iloc[:5,:5]

In [None]:
sg_network.iloc[:5,:5]

In [None]:
# NOTE(review): the export cells in this notebook name the id column 'home_CT',
# not 'home_cbg' -- confirm which layout the stored Foursquare file has.
fsq_ct = fsq_network['home_cbg'] # integer
fsq_poi = fsq_network.columns[1:]

# NOTE(review): 'Unnamed: 0' is the name pandas gives an unnamed first CSV column.
# If the file was written by the export loop above, that column is the row
# number rather than a tract id -- confirm. This also replaces the earlier sg_ct.
sg_ct = sg_network['Unnamed: 0']

In [None]:
# Same file names and download as in the SafeGraph section; repeated here
f = 'Philadelphia_Camden_Wilmington_PA_NJ_DE_MD'
net_file = f+"_2020-03-01_to_2020-05-02.pkl.gz"
cbgs_file = f+"_cbg_ids.csv"
pois_file = f+"_poi_ids.csv"

# IPython shell escape: copy the pickled network from S3 to ../net_file.pkl.gz
!aws s3 cp s3://stanford-networks/raw_pickles/$net_file ./../net_file.pkl.gz

In [None]:
pois = pd.read_csv('s3://stanford-networks/raw_pickles/'+pois_file)['safegraph_place_id']

# filter those POIs in Philadelphia
safegraph_phl = pd.read_csv("s3://upenn-seas-wattscovid19lab/paco/core_places/places-42101.csv")
# Build the lookup once instead of converting the id column to a list for every POI
phl_place_ids = set(safegraph_phl['safegraph_place_id'])
poi_boolean = [poi in phl_place_ids for poi in pois]
sg_poi = pois[poi_boolean]

In [None]:
# Tract ids present in one source but not the other. Membership is tested
# against sets built once; the previous version converted the other series to a
# list on every iteration.
sg_ct_lookup = set(sg_ct.tolist())
fsq_not_in_sg_lst = [fsq_ct.iloc[i] for i in range(len(fsq_ct))
                     if fsq_ct.iloc[i] not in sg_ct_lookup]
fsq_not_in_sg = len(fsq_not_in_sg_lst)
print('In Foursquare, there are %i census tracts that are not in Safegraph CT list'%(fsq_not_in_sg))

fsq_ct_lookup = set(fsq_ct.tolist())
sg_not_in_fsq_lst = [ct for ct in sg_ct if ct not in fsq_ct_lookup]
sg_not_in_fsq = len(sg_not_in_fsq_lst)
print('In Safegraph, there are %i census tracts that are not in Foursquare CT list'%(sg_not_in_fsq))

# Matrix calculation

In [None]:
# NOTE(review): in the visible notebook these names are first assigned in a later
# cell, so a fresh top-to-bottom run would presumably fail here -- confirm the cell order.
fsq_sparse_matrix, sg_sparse_matrix

In [None]:
fsq_sparse_matrix.nnz, sg_sparse_matrix.nnz

In [None]:
# NOTE(review): fsq_nonzero and sg_nonzero are assigned in the next cell; this
# cell presumably only works after that one has been run -- confirm the cell order.
np.count_nonzero(fsq_nonzero), np.count_nonzero(sg_nonzero)

In [None]:
# Convert the DataFrames to sparse matrices (every column after the first must be numeric)
# NOTE(review): this assumes the first column of each frame is the only
# non-data column -- confirm against how the two networks were loaded.
fsq_sparse_matrix = csr_matrix(fsq_network.iloc[:,1:].values)
sg_sparse_matrix = csr_matrix(sg_network.iloc[:,1:].values)

# Flags over the stored entries: True where the value is larger than 5
fsq_nonzero = fsq_sparse_matrix.data > 5
sg_nonzero = sg_sparse_matrix.data > 5

# Density = share of all cells with a value larger than 5; sparsity is its complement
fsq_density = np.count_nonzero(fsq_nonzero) / np.prod(fsq_sparse_matrix.shape)
sg_density = np.count_nonzero(sg_nonzero) / np.prod(sg_sparse_matrix.shape)
fsq_sparsity = 1.0 - fsq_density
sg_sparsity = 1.0 - sg_density

print("Sparsity Comparison:")
print("fsq Sparsity:", fsq_sparsity)
print("Safegraph Sparsity:", sg_sparsity)

print("Density Comparison:")
print("fsq Density:", fsq_density)
print("Safegraph Density:", sg_density)


print('\n')
# Column sums (in-degree) and row sums (out-degree) of each network
fsq_in_degree = np.asarray(fsq_sparse_matrix.sum(axis=0)).flatten()
fsq_out_degree = np.asarray(fsq_sparse_matrix.sum(axis=1)).flatten()

sg_in_degree = np.asarray(sg_sparse_matrix.sum(axis=0)).flatten()
sg_out_degree = np.asarray(sg_sparse_matrix.sum(axis=1)).flatten()

non_zero_fsq_in_degree = fsq_in_degree[fsq_in_degree > 5]
print('Foursquare in-degree (POIs) density:',len(non_zero_fsq_in_degree),'/',len(fsq_in_degree),'=',len(non_zero_fsq_in_degree)/len(fsq_in_degree))

non_zero_sg_in_degree = sg_in_degree[sg_in_degree > 5]
print('Safegraph in-degree (POIs) density:',len(non_zero_sg_in_degree),'/',len(sg_in_degree),'=',len(non_zero_sg_in_degree)/len(sg_in_degree))

non_zero_fsq_out_degree =fsq_out_degree[fsq_out_degree >5]
print('Foursquare out-degree (CBGs) density:',len(non_zero_fsq_out_degree),'/',len(fsq_out_degree),'=',len(non_zero_fsq_out_degree)/len(fsq_out_degree))

non_zero_sg_out_degree = sg_out_degree[sg_out_degree >5]
print('Safegraph out-degree (CBGs) density:',len(non_zero_sg_out_degree),'/',len(sg_out_degree),'=',len(non_zero_sg_out_degree)/len(sg_out_degree))


# Mean in-degree of each network
fsq_mean_in_degree1 = np.mean(fsq_in_degree)
sg_mean_in_degree2 = np.mean(sg_in_degree)

# Histograms of the in-degree distributions. legend() is called once per axis,
# after the labelled histogram exists; calling it earlier only produced a
# "no artists with labels" warning.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

bin_edges = np.linspace(0, 2000, num=11)  # 11 edges for 10 bins
axs[0].hist(non_zero_fsq_in_degree, bins=bin_edges, alpha=0.5, label='Foursquare')
axs[0].set_xlim((0, 2000))
axs[0].set_xlabel('Total Visits of POIs in Foursquare')
axs[0].set_ylabel('count of POIs')
axs[0].legend()

bin_edges = np.linspace(0, 200, num=11)  # 11 edges for 10 bins
axs[1].hist(non_zero_sg_in_degree, bins=bin_edges, alpha=0.5, label='Safegraph', color='orange')
axs[1].set_xlim((0, 200))
axs[1].set_xlabel('Total Visits of POIs in Safegraph')
axs[1].set_ylabel('count of POIs')
axs[1].legend()

fig.suptitle('POIs visits distribution on 4/1')
plt.tight_layout()
plt.show()

In [None]:
# Print Foursquare tracts with out-degree of at most 5 that are also missing from SafeGraph
low_degree_tracts = fsq_ct[fsq_out_degree <= 5].tolist()
for tract in low_degree_tracts:
    if tract in fsq_not_in_sg_lst:
        print(tract)

In [None]:
type(fsq_not_in_sg_lst[0])

In [None]:
# Column position to inspect; the following cell reads the same `i`
i = 4
# Non-zero entries of that SafeGraph column
sg_network.iloc[:,i][sg_network.iloc[:,i] != 0]

In [None]:
fsq_network.iloc[:,i][fsq_network.iloc[:,i] != 0]

In [None]:
# Histograms of out-degrees larger than 5 for both sources.
# legend() is called after each labelled histogram is drawn.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

bin_edges = np.linspace(0, 10000, num=11)  # 11 edges for 10 bins
axs[0].hist(non_zero_fsq_out_degree, bins=bin_edges, alpha=0.5, label='Foursquare')
axs[0].set_xlim((0, 10000))
axs[0].set_xlabel('Total Visits of CTs in Foursquare')
axs[0].set_ylabel('count of CTs')
axs[0].legend()

axs[1].hist(non_zero_sg_out_degree, alpha=0.5, label='Safegraph', color='orange')
axs[1].set_xlabel('Total Visits of CTs in Safegraph')
axs[1].set_ylabel('count of CTs')
axs[1].legend()

fig.suptitle('CTs outgoing visits distribution on 4/1')
plt.tight_layout()
plt.show()

In [None]:
fsq_sparse_matrix

In [None]:
import networkx as nx
from networkx.algorithms import community

# Treat each matrix as a biadjacency matrix: rows and columns become the two node sets
fsq_bipartite = nx.bipartite.from_biadjacency_matrix(fsq_sparse_matrix)
sg_bipartite = nx.bipartite.from_biadjacency_matrix(sg_sparse_matrix)

# Nodes tagged bipartite == 0 are the ones to project onto
fsq_nodes = {n for n, d in fsq_bipartite.nodes(data=True) if d["bipartite"] == 0}
sg_nodes = {n for n, d in sg_bipartite.nodes(data=True) if d["bipartite"] == 0}

# Project the bipartite graphs onto the specified nodes
fsq_projected = nx.bipartite.projected_graph(fsq_bipartite, fsq_nodes)
sg_projected = nx.bipartite.projected_graph(sg_bipartite, sg_nodes)

# Compute the communities with greedy modularity maximization
# (the function called is greedy_modularity_communities, not a Louvain implementation)
fsq_communities = community.greedy_modularity_communities(fsq_projected)
sg_communities = community.greedy_modularity_communities(sg_projected)

# Convert each community to a plain set
fsq_communities = [set(c) for c in fsq_communities]
sg_communities = [set(c) for c in sg_communities]

# Compute the modularity of the communities
fsq_modularity = community.quality.modularity(fsq_projected, fsq_communities)
sg_modularity = community.quality.modularity(sg_projected, sg_communities)

# Compare the modularity values
if fsq_modularity > sg_modularity:
    print("The modularity of the fsq_network is higher.")
elif fsq_modularity < sg_modularity:
    print("The modularity of the sg_network is higher.")
else:
    print("The modularity values are equal.")


In [None]:
fsq_modularity, sg_modularity

In [None]:
import networkx as nx
from networkx.algorithms import bipartite, community

# Build bipartite graphs from the matrices. from_biadjacency_matrix tags every
# node with a "bipartite" attribute (0 for rows, 1 for columns), which the node
# selection below depends on; a graph built from a plain adjacency matrix has
# no such attribute.
fsq_bipartite = bipartite.from_biadjacency_matrix(fsq_sparse_matrix)
sg_bipartite = bipartite.from_biadjacency_matrix(sg_sparse_matrix)

# Get the set of nodes of the desired type (rows) for each bipartite graph
fsq_nodes = {n for n, d in fsq_bipartite.nodes(data=True) if d["bipartite"] == 0}
sg_nodes = {n for n, d in sg_bipartite.nodes(data=True) if d["bipartite"] == 0}

# Project the bipartite graphs onto the specified nodes
fsq_projected = bipartite.projected_graph(fsq_bipartite, fsq_nodes)
sg_projected = bipartite.projected_graph(sg_bipartite, sg_nodes)

# Detect communities by greedy modularity maximization, the same approach as the
# previous cell (the bipartite module offers no modularity routine).
fsq_communities = [set(c) for c in community.greedy_modularity_communities(fsq_projected)]
sg_communities = [set(c) for c in community.greedy_modularity_communities(sg_projected)]

# Calculate the modularity of each partition
fsq_modularity = community.modularity(fsq_projected, fsq_communities)
sg_modularity = community.modularity(sg_projected, sg_communities)

# Compare the modularity values
if fsq_modularity > sg_modularity:
    print("The modularity of the fsq_network is higher.")
elif fsq_modularity < sg_modularity:
    print("The modularity of the sg_network is higher.")
else:
    print("The modularity values are equal.")


In [None]:
# Advanced Comparisons:

def _graph_from_sparse(adjacency):
    """Build a graph from a square sparse adjacency matrix.

    Uses nx.from_scipy_sparse_array when the installed networkx provides it and
    falls back to nx.from_scipy_sparse_matrix otherwise.
    """
    builder = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
    return builder(adjacency)


def _largest_component(graph):
    """Return the subgraph induced by the largest connected component."""
    return graph.subgraph(max(nx.connected_components(graph), key=len))


# Build each graph once and reuse it for every measure below.
# The SafeGraph matrix is sg_sparse_matrix; results keep their sq_ names.
fsq_graph = _graph_from_sparse(fsq_sparse_matrix)
sg_graph = _graph_from_sparse(sg_sparse_matrix)
fsq_giant = _largest_component(fsq_graph)
sg_giant = _largest_component(sg_graph)

# 1. Clustering Coefficient:
fsq_avg_clustering = nx.average_clustering(fsq_graph)
sq_avg_clustering = nx.average_clustering(sg_graph)

# 2. Assortativity:
fsq_assortativity = nx.degree_assortativity_coefficient(fsq_graph)
sq_assortativity = nx.degree_assortativity_coefficient(sg_graph)

# 3. Network Distance:
# Measured on the largest connected component, because the average shortest
# path length is undefined for a disconnected graph.
fsq_avg_shortest_path = nx.average_shortest_path_length(fsq_giant)
sq_avg_shortest_path = nx.average_shortest_path_length(sg_giant)

# 4. Centrality Measures:
fsq_degree_centrality = nx.degree_centrality(fsq_graph)
sq_degree_centrality = nx.degree_centrality(sg_graph)

# 5. Network Motifs:
# networkx has no generic motif counter; the number of triangles is reported as
# the simplest motif. nx.triangles counts each triangle once per corner.
fsq_motifs = sum(nx.triangles(fsq_graph).values()) // 3
sq_motifs = sum(nx.triangles(sg_graph).values()) // 3

# 6. Community Structure:
fsq_communities = nx.algorithms.community.greedy_modularity_communities(fsq_graph)
sq_communities = nx.algorithms.community.greedy_modularity_communities(sg_graph)

# 7. Network Robustness:
# Number of nodes in the largest connected component (the earlier code reported
# the number of components under this name).
fsq_giant_component_size = fsq_giant.number_of_nodes()
sq_giant_component_size = sg_giant.number_of_nodes()

# 8. Network Evolution:
# Perform temporal analysis to compare networks across different time points or intervals

# Print the results of advanced comparisons:

print("Advanced Comparisons:")
print("Clustering Coefficient Comparison:")
print("Foursquare Average Clustering Coefficient:", fsq_avg_clustering)
print("Safegraph Average Clustering Coefficient:", sq_avg_clustering)

print("Assortativity Comparison:")
print("Foursquare Assortativity Coefficient:", fsq_assortativity)
print("Safegraph Assortativity Coefficient:", sq_assortativity)

print("Network Distance Comparison:")
print("Foursquare Average Shortest Path Length:", fsq_avg_shortest_path)
print("Safegraph Average Shortest Path Length:", sq_avg_shortest_path)

print("Centrality Measures Comparison:")
print("Foursquare Degree Centrality:", fsq_degree_centrality)
print("Safegraph Degree Centrality:", sq_degree_centrality)

print("Network Motifs Comparison:")
print("Foursquare Network Motifs:", fsq_motifs)
print("Safegraph Network Motifs:", sq_motifs)

print("Community Structure Comparison:")
print("Foursquare Communities:", fsq_communities)
print("Safegraph Communities:", sq_communities)

print("Network Robustness Comparison:")
print("Foursquare Giant Component Size:", fsq_giant_component_size)
print("Safegraph Giant Component Size:", sq_giant_component_size)

# Continue with other comparisons and calculations as needed


In [None]:



# Perform other desired comparisons and calculations
# such as modularity, edge variance, central nodes, etc.

# # Example: Create networkx graphs for visualization
# fsq_graph = nx.from_scipy_sparse_matrix(fsq_sparse_matrix)
# sg_graph = nx.from_scipy_sparse_matrix(sg_sparse_matrix)

# # Example: Calculate modularity using networkx
# fsq_modularity = nx.algorithms.community.modularity(fsq_graph, fsq_graph.nodes())
# sg_modularity = nx.algorithms.community.modularity(sg_graph, sg_graph.nodes())

# # Example: Compare modularity
# print("Modularity Comparison:")
# print("fsq Modularity:", fsq_modularity)
# print("Safegraph Modularity:", sg_modularity)

# Example: Compare sparsity and density


# Continue with other comparisons and calculations as needed

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Template of comparison plots.
# NOTE(review): in the visible notebook the SafeGraph objects are named
# sg_out_degree and sg_sparse_matrix; sq_out_degree and sq_sparse_matrix are not
# assigned in any visible cell, so the lines using them presumably raise
# NameError -- confirm and rename.

# Example 1: Degree Distribution Comparison
plt.hist(fsq_out_degree, bins=10, alpha=0.5, label='Foursquare')
plt.hist(sq_out_degree, bins=10, alpha=0.5, label='Safegraph')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.legend()
plt.title('Degree Distribution Comparison')
plt.show()

# Example 2: Network Structure Comparison
fsq_graph = nx.from_scipy_sparse_matrix(fsq_sparse_matrix)
sq_graph = nx.from_scipy_sparse_matrix(sq_sparse_matrix)
plt.figure(figsize=(10, 5))
plt.subplot(121)
nx.draw(fsq_graph, with_labels=True)
plt.title('Foursquare Network')
plt.subplot(122)
nx.draw(sq_graph, with_labels=True)
plt.title('Safegraph Network')
plt.tight_layout()
plt.show()

# Example 3: Community Structure Comparison
# Draws the same two graphs again with a single node colour each; no community
# assignment is used here.
plt.figure(figsize=(10, 5))
plt.subplot(121)
nx.draw(fsq_graph, with_labels=True, node_color='lightblue')
plt.title('Foursquare Network')
plt.subplot(122)
nx.draw(sq_graph, with_labels=True, node_color='lightgreen')
plt.title('Safegraph Network')
plt.tight_layout()
plt.show()

# Example 4: Centrality Comparison
# Relies on the degree-centrality dicts computed in the advanced-comparisons cell
plt.scatter(range(len(fsq_degree_centrality)), list(fsq_degree_centrality.values()), label='Foursquare')
plt.scatter(range(len(sq_degree_centrality)), list(sq_degree_centrality.values()), label='Safegraph')
plt.xlabel('Node Index')
plt.ylabel('Centrality')
plt.legend()
plt.title('Centrality Comparison')
plt.show()

# Example 5: Network Evolution Comparison (using hypothetical data)
# NOTE(review): t1, t2 and t3 are placeholders that no visible cell defines.
time_points = [t1, t2, t3]  # Replace with your actual time points
densities_fsq = [0.8, 0.7, 0.6]  # Replace with your actual density values for Foursquare
densities_sq = [0.9, 0.8, 0.7]  # Replace with your actual density values for Safegraph
plt.plot(time_points, densities_fsq, marker='o', label='Foursquare')
plt.plot(time_points, densities_sq, marker='o', label='Safegraph')
plt.xlabel('Time')
plt.ylabel('Density')
plt.legend()
plt.title('Network Density Evolution')
plt.show()

# Example 6: Heatmap Comparison (using hypothetical edge weight data)
# The plotted values are random numbers, one per stored entry of each matrix.
# NOTE(review): the two arrays have equal length only if both matrices store the
# same number of entries; imshow presumably needs a rectangular input -- confirm.
edge_weights_fsq = np.random.rand(fsq_sparse_matrix.nnz)
edge_weights_sq = np.random.rand(sq_sparse_matrix.nnz)
heatmap_data = [edge_weights_fsq, edge_weights_sq]
plt.imshow(heatmap_data, cmap='hot', aspect='auto')
plt.colorbar(label='Edge Weight')
plt.xticks([])  # Assuming you don't want x-axis tick labels
plt.ylabel('Edge Index')
plt.title('Edge Weight Heatmap Comparison')
plt.show()

# Continue with other plot comparisons as needed


# Epidemic modeling

In [None]:
cbg_counts = pd.read_csv(
    's3://upenn-seas-wattscovid19lab/paco/acs_vars/safegraph_open_census_data/data/data/cbg_b01.csv',
    dtype={'census_block_group': str})

# Keep block groups whose id starts with 42101 and take their B01003e1 values
has_phl_prefix = cbg_counts['census_block_group'].map(lambda geoid: geoid.startswith('42101'))
population = cbg_counts.loc[has_phl_prefix].set_index('census_block_group')['B01003e1']
population = population.rename('population')

# Tract id = block-group id without its last character; sum per tract
population.index = population.index.str[:-1]
population = population.rename_axis('census_tract').astype(int)
population = population.groupby('census_tract').sum().to_frame().reset_index()

total_population = population['population'].sum()
population['census_tract'] = population['census_tract'].astype('str')

In [None]:
population[population['population']>8]

In [None]:
# Population indexed by tract id, with the index named origin_geoid and cast to int
pop_geoid = population.set_index('census_tract').rename_axis('origin_geoid')
pop_geoid.index = pop_geoid.index.astype(int)

In [None]:
init_date = datetime.strptime('2020-04-01', '%Y-%m-%d')

def day_interval(date_start, num_days):
    """Return ``num_days`` consecutive datetimes, one day apart, starting at ``date_start``."""
    day_list = []
    for offset in range(num_days):
        day_list.append(date_start + timedelta(days=offset))
    return day_list

# One week starting at init_date
date_interval = day_interval(init_date, 7)

In [None]:
# One origin/destination edge list per day, keyed by the day's datetime
net_dict = {}
for d in date_interval:
    date = str(d)[:10]
    fsq_path = 's3://phl-poi-networks/fsq/' + date + '_indexed.csv'
    net = (
        pd.read_csv(fsq_path)[['venue_id', 'home_cbg', 'count']]
        .rename(columns={'home_cbg': 'origin_geoid', 'venue_id': 'dest_geoid'})
        .set_index(['origin_geoid', 'dest_geoid'])
    )
    date_obj = datetime.strptime(date, '%Y-%m-%d')
    net_dict[date_obj] = net
    print(date_obj)

In [None]:
# Model parameters.
# NOTE(review): meanings are not defined in this cell. The commented-out
# best_guess_x0 code below uses kappa * E * tau for new cases and (1 - gamma)
# for decay of I; confirm the definitions with the model code.
params = {
    'beta' : 2000,
    'kappa' : 0.22,
    'gamma' : 0.14,
    'tau' : 0.08}

In [None]:
# def best_guess_x0(init_date, params, cases_df, pop_geoid, which='middle'):
#     sum_caseload = np.sum(cases_df[cases_df.index <= init_date])
#     window = [init_date - timedelta(t) for t in range(12)]
#     if which=='middle':
#         #we find good guesses for E and I based on tau, real and kappa
#         #new cases at the beggining is approx kappa*E*tau
#         E = cases_df[init_date]\
#              /(params['kappa']*params['tau'])
#         #look 10 days into the past for new cases, and estimate the 
#         #number still in I using gamma
#         I = [cases*(1-params['gamma'])**(t) for t, cases
#                               in enumerate(cases_df.reindex(window))]
#         I = pd.Series(I).sum()/params['tau'] #to ignore nan
#     elif which == 'low':
#         E = (1-params['kappa'])*real[init_date]\
#              /(params['kappa']*params['tau'])
#         E = E - np.sqrt(E)

#         I = [cases*(1-params['gamma'])**(t+1) for t, cases
#                               in enumerate(cases_df.reindex(window))]
#         I = [x - np.sqrt(x) for x in I]
#         I = pd.Series(I).sum()/params['tau'] #to ignore nan

#     elif which == 'high':
#         E = (1-params['kappa'])*real[init_date-timedelta(days=1)]\
#              /(params['kappa']*params['tau'])
#         E = E + np.sqrt(E)

#         I = [cases*(1-params['gamma'])**(t+1) for t, cases
#                               in enumerate(cases_df.reindex(window))]
#         I = [x + np.sqrt(x) for x in I]
#         I = pd.Series(I).sum()/params['tau'] #to ignore nan

#     #COMPUTE S RESIDUALLY
#     S = np.sum(pop_geoid) - E - sum_caseload/params['tau']
#     return(int(S),int(E),int(I))

# params['S'], params['E'], params['I'] = best_guess_x0(
#         init_date, params, cases_df, pop_geoid, which = 'low')

# Fixed initial values for S, E and I, used in place of the commented-out
# best_guess_x0 estimate above
params['S'], params['E'], params['I'] = 100000, 10000, 1000

In [None]:
def initialize_states_list(initial_cond, i, pop_geoid):
    """
    Build the per-subpopulation state table from row ``i`` of ``initial_cond``.

    With ``n = len(pop_geoid)``, the row is read in consecutive chunks of
    length ``n``: chunk 0 gives S, chunk 2 gives E and chunk 4 gives I
    (chunks 1 and 3 are not read here). Negative entries are floored at 0.

    parameters:
        initial_cond: 2-D array indexable as ``initial_cond[i, :]``
            NOTE(review): assumed to hold at least 5*n columns per row -
            confirm against whatever produces it.
        i: int
            row of ``initial_cond`` to use.
        pop_geoid: pandas Series (assumed)
            total population per subpopulation; its index becomes the index
            of the result.

    returns:
        pandas DataFrame with columns N, S, S/N, E, I, I/N.
    """
    n_units = len(pop_geoid)
    row = initial_cond[i, :]

    def nonneg_chunk(k):
        # k-th length-n slice of the selected row, floored at zero
        return np.maximum(row[k * n_units:(k + 1) * n_units], 0)

    states = pd.DataFrame(index=pop_geoid.index)
    states['N'] = pop_geoid.copy()  # total population

    states['S'] = nonneg_chunk(0)
    states['S/N'] = states['S'] / states['N']

    states['E'] = nonneg_chunk(2)
    states['I'] = nonneg_chunk(4)
    states['I/N'] = states['I'] / states['N']

    return states

def initialize_states(pop_geoid, params, model='SEIR'):
    """
    Randomly spread aggregate initial conditions over the subpopulations.

    The totals ``params['S']``, ``params['E']`` and ``params['I']`` are
    distributed over the entries of ``pop_geoid`` with multinomial draws whose
    probabilities are proportional to each entry's population.

    parameters:
        pop_geoid: pandas Series (assumed)
            total population per subpopulation; its index becomes the index
            of the result.
        params: dict
            must contain 'S', 'E' and 'I' (aggregate counts); 'rho' is also
            read when model == 'SEIIR'.
        model: str
            'SEIR' or 'SEIIR'.

    returns:
        pandas DataFrame with columns N, S, S/N, E, I and then either I/N
        (SEIR) or I_s, I_a, I_s/N, I_a/N (SEIIR, where column I is set to
        None).

    Exits through ``sys.exit`` (SystemExit) when 'S' is missing from params
    or when the model name is not one of the two above.
    """
    if 'S' not in params.keys():
        sys.exit("Initial conditions must be in params to use this method")
    if model not in ('SEIR', 'SEIIR'):
        # fail fast, before any random draws are consumed
        sys.exit('model not programmed')

    states = pd.DataFrame(index=pop_geoid.index)
    states['N'] = pop_geoid.copy() #total population
    probs = states['N']/np.sum(states['N'])

    # Spread the non-susceptible head-count over the subpopulations.
    N_minus_S = np.sum(states['N']) - params['S']
    N_minus_S = npr.multinomial(int(N_minus_S), pvals = probs)

    # A draw can exceed a (small) subpopulation's size, which would make S
    # negative. N - draw can never exceed N, so the bound that matters is the
    # lower one: floor at zero, as initialize_states_list does.
    states['S'] = (states['N'] - N_minus_S).clip(lower=0)
    states['S/N'] = states['S']/states['N']

    # E and I are capped at the subpopulation size.
    states['E'] = npr.multinomial(int(params['E']), pvals = probs)
    states['E'] = states[['N','E']].min(axis=1)

    states['I'] = npr.multinomial(int(params['I']), pvals = probs)
    states['I'] = states[['N','I']].min(axis=1)

    if model == 'SEIIR':
        # split infectives into symptomatic / asymptomatic using rho
        states['I_s'] = (states['I']*params['rho']).round()
        states['I_a'] = (states['I']*(1-params['rho'])).round()
        states['I_s/N'] = states['I_s']/states['N']
        states['I_a/N'] = states['I_a']/states['N']
        states['I'] = None
    else:  # 'SEIR'
        states['I/N'] = states['I']/states['N']
    return(states)

def which_date(date, weekly_dates):
    """
    Select the weekly entry that applies to ``date``.

    Returns the element of ``weekly_dates`` at the highest position whose
    value is <= ``date`` (for a chronologically ordered list this is the most
    recent week start on or before ``date``). When no entry qualifies, the
    first entry is returned. Each entry represents the Monday of the relevant
    week.

    parameters:
        date: datetime.datetime object
        weekly_dates: indexable sequence of dates comparable with ``date``
    """
    last_match = -1
    for position, week_start in enumerate(weekly_dates):
        if date >= week_start:
            last_match = position
    # fall back to the first entry when date precedes every week start
    return weekly_dates[max(last_match, 0)]

def update_states(states, params, net, model='SEIR', log_p=False):
    """
    Takes the states of the model in a given day and simulates the
    compartmental transitions. The only non-static parameter is the
    contact network.

    parameters:
        states: pandas DataFrame
            current compartment counts for every subpopulation; must hold
            'S/N' plus 'I/N' (SEIR) or 'I_a/N' and 'I_s/N' (SEIIR).
        params: dict
            model parameters. 'beta' is read here (and 'alpha' for SEIIR);
            the whole dict is forwarded to state_transitions and
            apply_transitions. Documented meanings:
                alpha: discount factor for contact with an asymptomatic
                    infective.
                beta: rate of infection, the probability of a susceptible
                    becoming infected from exposure to one infective/sq_foot.
                gamma: rate at which infectives recover.
                kappa: rate at which an exposed becomes infective.
                rho: probability of exposed becoming symptomatic.
        net: pandas DataFrame
            contact network for the day, with 'origin_geoid' and
            'destination_geoid' available as columns or index levels.
        model: str
            the type of compartmental model, 'SEIR' or 'SEIIR'.
        log_p: bool
            when True, state_transitions is asked for a second value, which
            is returned as a third element (for likelihood computation).

    returns:
        (states, trans['E_to_I']) or, when log_p is True,
        (states, trans['E_to_I'], p). The same ``states`` object that was
        passed in is returned; apply_transitions is presumably what updates
        it in place - confirm against its definition.

    raises:
        ValueError: if ``model`` is neither 'SEIR' nor 'SEIIR'.
    """
    if model not in ('SEIIR', 'SEIR'):
        # previously this fell through to an UnboundLocalError further down
        raise ValueError(
            "model not programmed: {!r} (expected 'SEIR' or 'SEIIR')".format(model))

    #compute effective infection rates
    # every (origin, destination) row gets the origin's susceptible fraction...
    net = net.join(states['S/N'], on='origin_geoid')

    # ...and the destination's infective fraction(s)
    if model == 'SEIIR':
        net = net.join(
            states[['I_a/N', 'I_s/N']],
            on='destination_geoid')
        effective_rates = compute_rate(
            net,
            states=states,
            alpha=params['alpha'],
            beta=params['beta'])
    else:  # 'SEIR'
        net = net.join(states['I/N'], on='destination_geoid')
        effective_rates = compute_rate_SEIR(
            net,
            beta=params['beta'])

    #sample state transitions
    if log_p:
        trans, p = state_transitions(states, effective_rates, params, model, log_p)
        apply_transitions(states, trans, params, model)
        return(states, trans['E_to_I'], p)

    trans = state_transitions(states, effective_rates, params, model)
    apply_transitions(states, trans, params, model)
    return((states, trans['E_to_I']))
    
def run_model(params,
              net_dict,
              pop_geoid,
              GEOID_type,
              date_interval,
              model='SEIR',
              num_sims=36,
              initial_cond=None,
              verbose=False):
    """
    Run ``num_sims`` independent simulations over ``date_interval``.

    For each simulation the states are initialised, then ``update_states`` is
    called once per date using the contact network selected by ``which_date``;
    the summed second return value of ``update_states`` (the E_to_I
    transitions) is recorded for each date.

    parameters:
        params: dict
            model parameters, forwarded to initialize_states / update_states.
        net_dict: dict
            contact networks keyed by the date each one starts to apply.
        pop_geoid: pandas Series (assumed)
            total population per subpopulation.
        GEOID_type: str
            not used by this function; kept so existing callers keep working.
        date_interval: iterable of dates
            simulated days, in order.
        model: str
            compartmental model, forwarded to initialize_states and
            update_states. Note that the ``initial_cond`` path goes through
            initialize_states_list, which only builds SEIR columns.
        num_sims: int
            number of simulations to run.
        initial_cond: 2-D array or None
            when given, row ``i`` seeds simulation ``i`` via
            initialize_states_list; otherwise initialize_states is used.
        verbose: bool
            print the initial state table of every simulation.

    returns:
        pandas DataFrame indexed by 'date' with one column per simulation
        (labelled 0 .. num_sims-1).
    """
    caseload_df = pd.DataFrame({'date':date_interval})
    # the set of available networks does not change while simulating
    weekly_dates = list(net_dict.keys())

    pbar = ProgressBar()
    for i in pbar(range(num_sims)):
        if initial_cond is not None:
            states = initialize_states_list(
                initial_cond,
                i,
                pop_geoid)
        else:
            states = initialize_states(
                pop_geoid,
                params=params,
                model=model)
        if verbose:
            print('Initialized states:')
            print(states)

        caseload = []
        patterns_date = None
        net = None
        for current_date in date_interval:
            week = which_date(current_date, weekly_dates)
            # only swap the contact network when the governing week changes
            if net is None or week != patterns_date:
                patterns_date = week
                net = net_dict[patterns_date]

            states, new_cases = update_states(
                states=states,
                params=params,
                net=net,
                model=model)
            caseload.append(new_cases.sum())

        caseload_df[i]=caseload
    caseload_df.set_index('date',inplace=True)
    return(caseload_df)

# NOTE: run_model is invoked (and timed) in the following cell, so re-running
# the definitions above does not trigger a full batch of simulations.

In [None]:
# Time one full batch of simulations on the 'CT' network.
stime = time.time()
df = run_model(
    params=params,
    net_dict=net_dict,
    pop_geoid=pop_geoid,
    GEOID_type='CT',
    date_interval=date_interval)
print("Elapsed time: {}".format(time.time()-stime))