In [1]:
import pandas as pd
import numpy as np

import utils

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

pd.set_option('display.float_format', '{:.2f}'.format)


## 1. Data Load with raw dataset
Here are 4 datasets:
1. One dataset of core places in New York City: shape: 
    (71468, 22)
    
2. Three datasets of movement pattern for people live in NYC from:
    ##### a. 2019/12/23 to 2020/01/13: shape: (68911, 34)
    ##### b. 2020/12/21 to 2021/01/11 : shape: (76202, 34)

    ##### c. 2021/12/20 to 2022/01/10 : shape: (120011, 55)


In [2]:
pwd

'D:\\MUSA\\CPLN680\\mobility_covid-19\\scripts'

In [3]:
pattern_1=pd.read_csv('../data/3.2019-4.2019.csv')
print(pattern_1.shape)

pattern_2=pd.read_csv('../data/2.2020-4.2020.csv')
print(pattern_2.shape)


pattern_3=pd.read_csv('../data/3.2021-4.2021.csv')
print(pattern_3.shape)



(73483, 31)
(91881, 31)
(58401, 31)


In [4]:
#verification
tmp=pattern_1.loc[pattern_1['date_range_start']=='2019-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())
tmp=pattern_2.loc[pattern_2['date_range_start']=='2020-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())
tmp=pattern_3.loc[pattern_3['date_range_start']=='2021-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())

raw_visit_counts      13837548
raw_visitor_counts     7657880
dtype: int64
raw_visit_counts      7822390
raw_visitor_counts    4485765
dtype: int64
raw_visit_counts      4742714
raw_visitor_counts    2701775
dtype: int64


In [5]:
core=pd.read_csv('../data/core/core.csv')
print(core.shape)
core.head()

(84717, 22)


Unnamed: 0,placekey,parent_placekey,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags,opened_on,closed_on,tracking_closed_since,geometry_type
0,222-222@627-s4m-vzz,,Just Salad,SG_BRAND_fcc6dd7686cfffc04d325dbe5ab994f1,Just Salad,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,40.78,-73.96,1471 3rd Ave,New York,NY,10028,US,,"{ ""Mon"": [[""10:30"", ""21:00""]], ""Tue"": [[""10:30...","Counter Service,Salad,Lunch,Smoothie & Juice B...",,,2019-07-01,POLYGON
1,222-222@627-s4n-nwk,,Enthaice,,,Restaurants and Other Eating Places,Full-Service Restaurants,722511.0,40.78,-73.95,1598 3rd Ave,New York,NY,10128,US,12122891888.0,,Thai Food,,2020-01-15,2019-07-01,POLYGON
2,222-222@627-s6m-5fz,,Cash Flow Partners,,,"Management, Scientific, and Technical Consulti...",,5416.0,40.85,-73.93,201 Wadsworth Ave,New York,NY,10033,US,12129288600.0,"{ ""Mon"": [[""10:00"", ""18:00""]], ""Tue"": [[""10:00...",,,,2019-07-01,POLYGON
3,222-222@627-s6m-q9f,,Apartments at 134 Haven Ave,,,Lessors of Real Estate,Lessors of Residential Buildings and Dwellings,531110.0,40.85,-73.94,134 Haven Ave,New York,NY,10032,US,,,,,,2019-07-01,POLYGON
4,222-222@627-s6n-7wk,,New Leaf Restaurant,,,Drinking Places (Alcoholic Beverages),Drinking Places (Alcoholic Beverages),722410.0,40.86,-73.93,1 Margaret Corbin Dr,New York,NY,10040,US,12125685323.0,,"Bar or Pub,Late Night,American Food",,2020-01-15,2019-07-01,POLYGON


## 2. Data Prepocess

In [6]:
core_columns=['placekey','location_name','naics_code','latitude',
             'longitude','postal_code','top_category','sub_category']
core_df=core[core_columns]

In [7]:
pattern_column=['placekey','date_range_start','date_range_end','raw_visit_counts',
             'raw_visitor_counts','visits_by_day','visitor_home_cbgs','poi_cbg',
                     'visitor_daytime_cbgs','distance_from_home','bucketed_dwell_times',
                     'related_same_day_brand','device_type','median_dwell','bucketed_dwell_times']



## 3. Data Processing
1. creat new variable:
    ##### a. pop_up_col
    ##### b. date_range_start: tranfer from string to time object
    ##### c. sg_wp__median_dwell_hour: average dwelling time by hour
    ##### d. category: aggravated category with reference from:
    https://www.census.gov/naics/history/docs/cm_2.pdf
    especially, naics 721 is accomodation, 722 is food:
    https://www.naics.com/naics-code-description/?code=72
    

In [36]:
def process_data(raw_data):
    print('raw_data shape',raw_data.shape)
    data_pre=raw_data.merge(core,left_on='placekey',right_on='placekey')
    data = data_pre.dropna(subset=['naics_code'])
    # preprocess data
    data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
                        data['median_dwell'].astype(str) + '；' + 'visit number: ' + data[
                            'raw_visit_counts'].astype(str)
    data["date_range_start"] = pd.to_datetime(data["date_range_start"])

    # aggravate the business type of data by naics code
    data['naics_code'] = data.naics_code.astype(int)

    data['naics_2dig'] = data.naics_code.astype(str).str[:2].astype('int64')
    data['naics_3dig'] = data.naics_code.astype(str).str[:3].astype(int)
    naics_code=pd.read_csv('../data/aggregated_naics.csv')
#     # naics code classification
#     data.loc[data['naics_2dig'] == 11, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 21, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 49, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 42, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 22, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 51, 'category'] = 'Other'
#     data.loc[data['naics_2dig'] == 81, 'category'] = 'Other'
    
#     data.loc[data['naics_2dig'] == 23, 'category'] = 'Goods Production'
#     data.loc[data['naics_2dig'] == 31, 'category'] = 'Goods Production'
#     data.loc[data['naics_2dig'] == 32, 'category'] = 'Goods Production'
#     data.loc[data['naics_2dig'] == 33, 'category'] = 'Goods Production'
    
#     data.loc[data['naics_2dig'] == 42, 'category'] = 'Wholesale and Retail'
#     data.loc[data['naics_2dig'] == 43, 'category'] = 'Wholesale and Retail'
#     data.loc[data['naics_2dig'] == 44, 'category'] = 'Wholesale and Retail'
#     data.loc[data['naics_2dig'] == 45, 'category'] = 'Wholesale and Retail'
    
#     data.loc[data['naics_2dig'] == 54, 'category'] = 'Professional and Business Services'
#     data.loc[data['naics_2dig'] == 55, 'category'] = 'Professional and Business Services'
#     data.loc[data['naics_2dig'] == 56, 'category'] = 'Professional and Business Services'    

#     data.loc[data['naics_2dig'] == 48, 'category'] = 'Transportation'
    
#     data.loc[data['naics_2dig'] == 71, 'category'] = 'Recreation'
    
#     data.loc[data['naics_2dig'] == 62, 'category'] = 'Health Care'
    
#     data.loc[data['naics_2dig'] == 61, 'category'] = 'Education'
    
#     data.loc[data['naics_2dig'] == 52, 'category'] = 'Financial Activities'

#     data.loc[data['naics_2dig'] == 53, 'category'] = 'Real Estate'
    
    # devide 'Accomodation' and 'Food' according to the first 3 digit

#     data.loc[data['naics_3dig'] == 721, 'category'] = 'Accomodation'
#     data.loc[data['naics_3dig'] == 722, 'category'] = 'Food'
    print(data.shape)
    print(naics_code.dtypes)
    new_Data=pd.merge(data,naics_code,left_on='naics_2dig',right_on='code',how='left')
    print(new_Data.shape)
    
    data=data.dropna(subset=["poi_cbg"])

    data['trac_id']=data['poi_cbg'].astype('int64')
    data['tract_id']=data['trac_id'].astype(str).str[:11].astype('int64')
    print('current data shape',data.shape)
    return data


In [37]:
pattern_mar_19=pattern_1[pattern_column].loc[pattern_1['date_range_start']=='2019-03-01T00:00:00-05:00']
df_mar_19=process_data(pattern_mar_19)
df_mar_19

raw_data shape (36695, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["date_range_start"] = pd.to_datetime(data["date_range_start"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['naics_code'] = dat

(36587, 39)
code         int64
category    object
dtype: object
(36681, 41)
current data shape (36587, 41)


Unnamed: 0,placekey,date_range_start,date_range_end,raw_visit_counts,raw_visitor_counts,visits_by_day,visitor_home_cbgs,poi_cbg,visitor_daytime_cbgs,distance_from_home,bucketed_dwell_times,related_same_day_brand,device_type,median_dwell,bucketed_dwell_times.1,parent_placekey,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags,opened_on,closed_on,tracking_closed_since,geometry_type,pop_up_col,naics_2dig,naics_3dig,trac_id,tract_id
0,222-222@627-s8j-zj9,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,62,26,"[2,0,2,0,2,5,2,3,0,0,4,4,3,4,2,0,0,4,6,6,0,0,0...","{""340030172002"":4}",360610095002,"{""360610095002"":5,""360610184002"":4,""3400301810...",9243.00,"{""<5"":1,""5-10"":9,""11-20"":4,""21-60"":10,""61-120""...","{""Starbucks"":7,""Dunkin'"":5,""Planet Fitness"":4,...","{""android"":10,""ios"":16}",107.50,"{""<5"":1,""5-10"":9,""11-20"":4,""21-60"":10,""61-120""...",,DF Discount,,,"General Merchandise Stores, including Warehous...",All Other General Merchandise Stores,452319,40.75,-73.99,210 W 29th St,New York,NY,10001,US,,,,,,2019-07-01,POLYGON,name: DF Discount；type:All Other General Merch...,45,452,360610095002,36061009500
1,222-222@627-s8r-6c5,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,113,87,"[5,3,3,4,4,6,7,7,2,1,2,6,6,2,10,4,2,1,3,3,1,3,...","{""360810991003"":4,""360610131002"":4,""3604705560...",360610131001,"{""360610131001"":16,""360470076001"":4,""420171055...",14968.00,"{""<5"":3,""5-10"":17,""11-20"":14,""21-60"":29,""61-12...","{""Pret A Manger"":9,""Starbucks"":4,""McDonald's"":...","{""android"":52,""ios"":35}",49.00,"{""<5"":3,""5-10"":17,""11-20"":14,""21-60"":29,""61-12...",,Sports Illustrated For Women,,,"Sporting Goods, Hobby, and Musical Instrument ...",Sporting Goods Stores,451110,40.76,-73.98,135 W 50th St,New York,NY,10020,US,,,,,,2019-07-01,POLYGON,name: Sports Illustrated For Women；type:Sporti...,45,451,360610131001,36061013100
2,222-222@627-s8r-rp9,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,35,30,"[1,0,0,0,1,2,0,0,0,0,3,1,0,1,4,0,1,3,0,1,1,1,2...","{""360470698001"":4,""360810045001"":4,""3608701050...",360610108007,"{""360610108007"":6,""360610133003"":4,""3604706980...",13244.00,"{""<5"":2,""5-10"":13,""11-20"":6,""21-60"":7,""61-120""...","{""Food Bazaar Supermarket"":3,""Hugo Boss"":3,""Ol...","{""android"":18,""ios"":12}",14.00,"{""<5"":2,""5-10"":13,""11-20"":6,""21-60"":7,""61-120""...",,Deliteria Deli & Grocery,,,Grocery Stores,Supermarkets and Other Grocery (except Conveni...,445110,40.76,-73.97,1061 2nd Ave,New York,NY,10022,US,12123719944.00,,,,,2019-07-01,POLYGON,name: Deliteria Deli & Grocery；type:Supermarke...,44,445,360610108007,36061010800
3,222-222@627-wbt-r8v,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,411,230,"[8,9,5,8,12,13,18,12,24,18,9,15,16,12,14,22,11...","{""360470021001"":6,""360810554002"":5,""3604705130...",360610099001,"{""360610099001"":11,""360339504004"":6,""060855086...",14288.00,"{""<5"":3,""5-10"":46,""11-20"":40,""21-60"":76,""61-12...","{""Starbucks"":5,""Dunkin'"":3,""SoulCycle"":2,""Chip...","{""android"":74,""ios"":156}",83.00,"{""<5"":3,""5-10"":46,""11-20"":40,""21-60"":76,""61-12...",,INA PINCH Snack Bar,,,Restaurants and Other Eating Places,Snack and Nonalcoholic Beverage Bars,722515,40.75,-74.00,537 W 27th St,New York,NY,10001,US,12122447000.00,,"Salad,Bakery",,,2019-07-01,POLYGON,name: INA PINCH Snack Bar；type:Snack and Nonal...,72,722,360610099001,36061009900
4,222-222@627-wbv-brk,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,200,141,"[6,1,8,5,3,5,6,8,6,7,1,7,5,6,9,6,6,2,7,6,20,11...","{""360610129002"":14,""360050185004"":6,""340297112...",360610129002,"{""360610129002"":10,""360050344002"":6,""360610189...",17738.00,"{""<5"":8,""5-10"":80,""11-20"":29,""21-60"":21,""61-12...","{""Lifetime Fitness"":7,""Starbucks"":6,""Dunkin'"":...","{""android"":51,""ios"":94}",14.00,"{""<5"":8,""5-10"":80,""11-20"":29,""21-60"":21,""61-12...",,McQuaid's Public House,,,Drinking Places (Alcoholic Beverages),Drinking Places (Alcoholic Beverages),722410,40.76,-74.00,589 11th Ave,New York,NY,10036,US,12125826359.00,,"Bar or Pub,Late Night",,2020-01-15,2019-07-01,POLYGON,name: McQuaid's Public House；type:Drinking Pla...,72,722,360610129002,36061012900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36594,zzy-227@627-s4k-ks5,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,431,193,"[14,12,10,12,18,17,31,16,6,3,15,9,16,15,16,12,...","{""360610175005"":10,""360470140001"":6,""360810065...",360610175005,"{""360610175005"":15,""360610109001"":8,""360470296...",6043.00,"{""<5"":3,""5-10"":17,""11-20"":35,""21-60"":164,""61-1...","{""Starbucks"":4,""Walgreens"":2,""Dunkin'"":2,""Barn...","{""android"":81,""ios"":115}",60.00,"{""<5"":3,""5-10"":17,""11-20"":35,""21-60"":164,""61-1...",,The Bar Method Broadway,,,Other Amusement and Recreation Industries,Fitness and Recreational Sports Centers,713940,40.79,-73.98,2387 Broadway,New York,NY,10024,US,,"{ ""Mon"": [[""6:00"", ""21:00""]], ""Tue"": [[""6:00"",...",,,,2019-07-01,POLYGON,name: The Bar Method Broadway；type:Fitness and...,71,713,360610175005,36061017500
36595,zzy-22r@627-s8k-2rk,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,362,299,"[13,11,7,12,10,13,12,12,15,9,11,14,7,11,12,13,...","{""121030272024"":14,""130510001001"":6,""051159515...",360610076001,"{""360610076001"":18,""120950170151"":6,""360610074...",18919.00,"{""<5"":24,""5-10"":108,""11-20"":68,""21-60"":85,""61-...","{""Build A Bear Workshop"":9,""Chipotle Mexican G...","{""android"":171,""ios"":129}",18.00,"{""<5"":24,""5-10"":108,""11-20"":68,""21-60"":85,""61-...",,Starbucks,SG_BRAND_f116acfe9147494063e58da666d1d57e,Starbucks,Restaurants and Other Eating Places,Snack and Nonalcoholic Beverage Bars,722515,40.75,-73.99,350 Fifth Avenue Empire State Building,New York,NY,10118,US,,,"Snacks,Counter Service,Dessert,Tea House,Coffe...",,,2019-07-01,POLYGON,name: Starbucks；type:Snack and Nonalcoholic Be...,72,722,360610076001,36061007600
36596,zzz-222@627-rvs-zfz,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,1376,514,"[45,36,32,35,38,29,31,50,30,26,37,44,33,44,55,...","{""360811483003"":17,""360811479003"":16,""36081147...",360811483001,"{""360811479002"":19,""360811483003"":13,""36081147...",5515.00,"{""<5"":22,""5-10"":238,""11-20"":132,""21-60"":261,""6...","{""7-Eleven"":3,""McDonald's"":3,""Starbucks"":2,""St...","{""android"":217,""ios"":296}",69.50,"{""<5"":22,""5-10"":238,""11-20"":132,""21-60"":261,""6...",,UdallS Park Preserve,,,"Museums, Historical Sites, and Similar Institu...",Nature Parks and Other Similar Institutions,712190,40.78,-73.75,Udalls Park Preserve,New York,NY,11363,US,,,,,,2019-07-01,POLYGON,name: UdallS Park Preserve；type:Nature Parks a...,71,712,360811483001,36081148300
36597,zzz-222@627-s69-kxq,2019-03-01 00:00:00-05:00,2019-04-01T00:00:00-04:00,1418,542,"[45,54,41,35,32,29,49,54,29,47,48,34,49,47,58,...","{""360050201002"":31,""360050211001"":26,""36005020...",360050201002,"{""360050201002"":21,""360050211001"":20,""36005020...",2035.00,"{""<5"":34,""5-10"":239,""11-20"":129,""21-60"":293,""6...","{""Dunkin'"":4,""McDonald's"":3,""Bravo Supermarket...","{""android"":348,""ios"":209}",63.00,"{""<5"":34,""5-10"":239,""11-20"":129,""21-60"":293,""6...",,Mosaic Success Garden,,,"Museums, Historical Sites, and Similar Institu...",Nature Parks and Other Similar Institutions,712190,40.84,-73.93,Mosaic Success Garden,New York,NY,10452,US,,,,,,2019-07-01,POLYGON,name: Mosaic Success Garden；type:Nature Parks ...,71,712,360050201002,36005020100


In [38]:
df_mar_19.dtypes

placekey                                                  object
date_range_start          datetime64[ns, pytz.FixedOffset(-300)]
date_range_end                                            object
raw_visit_counts                                           int64
raw_visitor_counts                                         int64
visits_by_day                                             object
visitor_home_cbgs                                         object
poi_cbg                                                    int64
visitor_daytime_cbgs                                      object
distance_from_home                                       float64
bucketed_dwell_times                                      object
related_same_day_brand                                    object
device_type                                               object
median_dwell                                             float64
bucketed_dwell_times                                      object
parent_placekey          

In [10]:
pattern_mar_20=pattern_2[pattern_column].loc[pattern_2['date_range_start']=='2020-03-01T00:00:00-05:00']
df_mar_20=process_data(pattern_mar_20)

raw_data shape (31512, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["date_range_start"] = pd.to_datetime(data["date_range_start"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['naics_code'] = dat

current data shape (31424, 42)


In [11]:
pattern_mar_21=pattern_3[pattern_column].loc[pattern_3['date_range_start']=='2021-03-01T00:00:00-05:00']
df_mar_21=process_data(pattern_mar_21)

raw_data shape (29162, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["date_range_start"] = pd.to_datetime(data["date_range_start"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['naics_code'] = dat

current data shape (29081, 42)


In [12]:
df_mar_19.to_csv('../data_save/df_mar_19.csv')  
df_mar_20.to_csv('../data_save/df_mar_20.csv')  
df_mar_21.to_csv('../data_save/df_mar_21.csv')  

NA percent <5%, you can ignore the NA values directly