In [1]:
import pandas as pd
import numpy as np

import utils

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

pd.set_option('display.float_format', '{:.2f}'.format)


## 1. Data Load with raw dataset
Here are 4 datasets:
1. One dataset of core places in New York City: shape: 
    (71468, 22)
    
2. Three datasets of movement pattern for people live in NYC from:
    ##### a. 2019/12/23 to 2020/01/13: shape: (68911, 34)
    ##### b. 2020/12/21 to 2021/01/11 : shape: (76202, 34)

    ##### c. 2021/12/20 to 2022/01/10 : shape: (120011, 55)


In [2]:
pwd

'E:\\Xiaoyi\\mobility_covid-19-main\\scripts'

In [6]:
pattern_1=pd.read_csv('../data/3.2019-4.2019.csv')
print(pattern_1.shape)

pattern_2=pd.read_csv('../data/2.2020-4.2020.csv')
print(pattern_2.shape)


pattern_3=pd.read_csv('../data/3.2021-4.2021.csv')
print(pattern_3.shape)



(2, 1)
(2, 1)
(2, 1)


Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:cf8008fd972aaa2c4f52048fc56ef0e2e04...
1,size 198205317


In [4]:
#verification
tmp=pattern_1.loc[pattern_1['date_range_start']=='2019-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())
tmp=pattern_2.loc[pattern_2['date_range_start']=='2020-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())
tmp=pattern_3.loc[pattern_3['date_range_start']=='2021-03-01T00:00:00-05:00']
print(tmp[['raw_visit_counts','raw_visitor_counts']].sum())

KeyError: 'date_range_start'

In [None]:
core=pd.read_csv('../data/core/core.csv')
print(core.shape)
core.head()

## 2. Data Prepocess

In [None]:
core_columns=['placekey','location_name','naics_code','latitude',
             'longitude','postal_code','top_category','sub_category']
core_df=core[core_columns]

In [None]:
pattern_column=['placekey','date_range_start','date_range_end','raw_visit_counts',
             'raw_visitor_counts','visits_by_day','visitor_home_cbgs','poi_cbg',
                     'visitor_daytime_cbgs','distance_from_home','bucketed_dwell_times',
                     'related_same_day_brand','device_type','median_dwell','bucketed_dwell_times']


## 3. Data Processing
1. creat new variable:
    ##### a. pop_up_col
    ##### b. date_range_start: tranfer from string to time object
    ##### c. sg_wp__median_dwell_hour: average dwelling time by hour
    ##### d. category: aggravated category with reference from:
    https://www.census.gov/naics/history/docs/cm_2.pdf
    especially, naics 721 is accomodation, 722 is food:
    https://www.naics.com/naics-code-description/?code=72
    

In [None]:
def process_data(raw_data):
    print('raw_data shape',raw_data.shape)
    data_pre=raw_data.merge(core,left_on='placekey',right_on='placekey')
    data = data_pre.dropna(subset=['naics_code'])
    # preprocess data
    data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
                        data['median_dwell'].astype(str) + '；' + 'visit number: ' + data[
                            'raw_visit_counts'].astype(str)
    data["date_range_start"] = pd.to_datetime(data["date_range_start"])

    # aggravate the business type of data by naics code
    data['naics_code'] = data.naics_code.astype(int)

    data['naics_2dig'] = data.naics_code.astype(str).str[:2].astype('int64')
    data['naics_3dig'] = data.naics_code.astype(str).str[:3].astype(int)
    naics_code=pd.read_csv('../data/aggregated_naics.csv')
    # naics code classification
    data.loc[data['naics_2dig'] == 11, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 21, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 49, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 42, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 22, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 51, 'category'] = 'Other'
    data.loc[data['naics_2dig'] == 81, 'category'] = 'Other'
    
    data.loc[data['naics_2dig'] == 23, 'category'] = 'Goods Production'
    data.loc[data['naics_2dig'] == 31, 'category'] = 'Goods Production'
    data.loc[data['naics_2dig'] == 32, 'category'] = 'Goods Production'
    data.loc[data['naics_2dig'] == 33, 'category'] = 'Goods Production'
    
    data.loc[data['naics_2dig'] == 42, 'category'] = 'Wholesale and Retail'
    data.loc[data['naics_2dig'] == 43, 'category'] = 'Wholesale and Retail'
    data.loc[data['naics_2dig'] == 44, 'category'] = 'Wholesale and Retail'
    data.loc[data['naics_2dig'] == 45, 'category'] = 'Wholesale and Retail'
    
    data.loc[data['naics_2dig'] == 54, 'category'] = 'Professional and Business Services'
    data.loc[data['naics_2dig'] == 55, 'category'] = 'Professional and Business Services'
    data.loc[data['naics_2dig'] == 56, 'category'] = 'Professional and Business Services'    

    data.loc[data['naics_2dig'] == 48, 'category'] = 'Transportation'
    
    data.loc[data['naics_2dig'] == 71, 'category'] = 'Recreation'
    
    data.loc[data['naics_2dig'] == 62, 'category'] = 'Health Care'
    
    data.loc[data['naics_2dig'] == 61, 'category'] = 'Education'
    
    data.loc[data['naics_2dig'] == 52, 'category'] = 'Financial Activities'

    data.loc[data['naics_2dig'] == 53, 'category'] = 'Real Estate'
    
#     devide 'Accomodation' and 'Food' according to the first 3 digit

    data.loc[data['naics_3dig'] == 721, 'category'] = 'Accomodation'
    data.loc[data['naics_3dig'] == 722, 'category'] = 'Food'
#     print(data.shape)
#     print(naics_code.dtypes)
#     new_Data=pd.merge(data,naics_code,left_on='naics_2dig',right_on='code',how='left')
#     print(new_Data.shape)
    
    data=data.dropna(subset=["poi_cbg"])

    data['trac_id']=data['poi_cbg'].astype('int64')
    data['tract_id']=data['trac_id'].astype(str).str[:11].astype('int64')
    print('current data shape',data.shape)
    return data


In [5]:
pattern_mar_19=pattern_1[pattern_column].loc[pattern_1['date_range_start']=='2019-03-01T00:00:00-05:00']
df_mar_19=process_data(pattern_mar_19)
df_mar_19

NameError: name 'pattern_column' is not defined

In [15]:
df_mar_19.dtypes

placekey                                                  object
date_range_start          datetime64[ns, pytz.FixedOffset(-300)]
date_range_end                                            object
raw_visit_counts                                           int64
raw_visitor_counts                                         int64
visits_by_day                                             object
visitor_home_cbgs                                         object
poi_cbg                                                    int64
visitor_daytime_cbgs                                      object
distance_from_home                                       float64
bucketed_dwell_times                                      object
related_same_day_brand                                    object
device_type                                               object
median_dwell                                             float64
bucketed_dwell_times                                      object
parent_placekey          

In [16]:
pattern_mar_20=pattern_2[pattern_column].loc[pattern_2['date_range_start']=='2020-03-01T00:00:00-05:00']
df_mar_20=process_data(pattern_mar_20)

raw_data shape (31512, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["date_range_start"] = pd.to_datetime(data["date_range_start"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['naics_code'] = dat

current data shape (31424, 42)


In [17]:
pattern_mar_21=pattern_3[pattern_column].loc[pattern_3['date_range_start']=='2021-03-01T00:00:00-05:00']
df_mar_21=process_data(pattern_mar_21)

raw_data shape (29162, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pop_up_col'] = 'name: ' + data.location_name + '；' + 'type:' + data.sub_category + '；' + 'stay time: ' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["date_range_start"] = pd.to_datetime(data["date_range_start"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['naics_code'] = dat

current data shape (29081, 42)


In [18]:
df_mar_19.to_csv('../data_save/df_mar_19.csv')  
df_mar_20.to_csv('../data_save/df_mar_20.csv')  
df_mar_21.to_csv('../data_save/df_mar_21.csv')  

NA percent <5%, you can ignore the NA values directly