# Compile patterns data
The original POI pattern data (popularity by hour, etc.) is split across 30 large files. Here we load and combine the relevant data into one file. 

In [None]:
# Base directory; every data path in this notebook is built relative to it.
root = ''
# Directory of the 2022/10 download; the merged output file is written here.
upper_folder = root + 'Data/SafeGraphData/DownloadedData/2022/10/'
# 'PG' subdirectory holding the part files and the compiled CSVs.
folder = upper_folder + 'PG/'


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Load compiled file without patterns data

In [7]:
# Read the previously compiled table; its first CSV column becomes the index.
compiled_path = folder + 'compiled.csv'
df = pd.read_csv(compiled_path, index_col=0)

### Compile patterns data

In [14]:
def _split_fields(series, positions, max_fields=None):
    """Split JSON-like strings on ``{``, ``,``, ``:`` and ``}`` and pick pieces.

    Parameters
    ----------
    series : pandas.Series
        Strings such as ``{"Monday":3,"Tuesday":5}``; missing entries allowed.
    positions : list of int
        Zero-based positions of the pieces to keep from each split row.
    max_fields : int, optional
        Largest number of pieces any row may split into.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(series), len(positions))``. A piece that
        a row does not have is left null.

    Raises
    ------
    ValueError
        If ``max_fields`` is given and some row splits into more pieces.
    """
    # Cast to object so a column with no strings at all (e.g. all-missing
    # floats) still supports the ``.str`` accessor.
    pieces = series.astype(object).str.split('{|,|:|}', expand=True)
    if max_fields is not None and pieces.shape[1] > max_fields:
        raise ValueError(
            f"{series.name!r}: expected at most {max_fields} fields per row, "
            f"found {pieces.shape[1]}"
        )
    # reindex (rather than .loc) so positions beyond the widest row become
    # null columns instead of raising KeyError.
    return pieces.reindex(columns=positions).to_numpy(dtype=object)


def _ints_or_nan(values):
    """Return an object array holding ``int`` where *values* is non-null, NaN elsewhere."""
    result = np.full(len(values), np.nan, dtype=object)
    present = pd.notna(values)
    result[present] = values[present].astype(int)
    return result


def process_pattern_data(tmp2):
    """Flatten the nested pattern columns of one patterns chunk.

    Parameters
    ----------
    tmp2 : pandas.DataFrame
        Must contain the columns read below: ``placekey``,
        ``raw_visit_counts``, ``raw_visitor_counts``, ``distance_from_home``,
        ``median_dwell``, ``normalized_visits_by_state_scaling``,
        ``popularity_by_day``, ``bucketed_dwell_times``,
        ``related_same_day_brand`` and ``popularity_by_hour``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the index of ``tmp2`` and, in this order:

        * the six scalar columns, with the five numeric ones truncated to
          whole numbers where present;
        * ``pop_by_day_<weekday>`` (7 columns) and
          ``dwell_time_bucket_<bucket>`` (7 columns), integers or NaN;
        * ``related_same_day_brand_top1..3``, the first three keys of
          ``related_same_day_brand`` exactly as they appear in the text
          (surrounding double quotes are kept), or NaN;
        * ``popularity_by_hour_<bin>`` (8 columns), each the sum of three
          consecutive hourly values, NaN for rows without 24 hourly values.

    Raises
    ------
    ValueError
        If a ``popularity_by_day`` or ``bucketed_dwell_times`` entry splits
        into more than the 16 pieces a 7-key mapping produces.

    Notes
    -----
    The day and dwell mappings are read positionally, not by key.
    NOTE(review): this assumes the keys always appear in Monday..Sunday and
    shortest-to-longest bucket order -- confirm against the data.
    """
    base_cols = ['placekey', 'raw_visit_counts', 'raw_visitor_counts', 'distance_from_home', 'median_dwell', 'normalized_visits_by_state_scaling']
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dwell_buckets = ["<5", "5-10", "11-20", "21-60", "61-120", "121-240", ">240"]
    hour_names = ['0to3', '3to6', '6to9', '9to12', '12to15', '15to18', '18to21', '21to24']
    # '{"k1":v1,...,"k7":v7}' splits into 16 pieces: '', k1, v1, ..., k7, v7, ''.
    # The seven values therefore sit at the even positions 2..14.
    value_positions = [2, 4, 6, 8, 10, 12, 14]

    tmp3 = tmp2.loc[:, base_cols].copy(deep=True)

    day_values = _split_fields(tmp2['popularity_by_day'], value_positions, max_fields=16)
    for j, day in enumerate(days):
        tmp3['pop_by_day_' + day] = _ints_or_nan(day_values[:, j])

    dwell_values = _split_fields(tmp2['bucketed_dwell_times'], value_positions, max_fields=16)
    for j, bucket in enumerate(dwell_buckets):
        tmp3['dwell_time_bucket_' + bucket] = _ints_or_nan(dwell_values[:, j])

    # Keys sit at the odd positions; the first three keys are positions 1, 3, 5.
    brand_values = _split_fields(tmp2['related_same_day_brand'], [1, 3, 5])
    # With fewer than three brands, the empty piece after the closing '}' lands
    # in the next brand slot; treat it as missing rather than as a brand name.
    brand_values[brand_values == ''] = np.nan
    for j in range(3):
        tmp3['related_same_day_brand_top' + str(j + 1)] = brand_values[:, j]

    # '[h0,h1,...,h23]' -> 24 string columns; shorter or missing rows stay null.
    hour_pieces = (
        tmp2['popularity_by_hour']
        .astype(object)
        .str.strip('[]')
        .str.split(',', expand=True)
        .reindex(columns=range(24))
    )
    complete = hour_pieces.notna().to_numpy().all(axis=1)
    hourly = hour_pieces.to_numpy(dtype=object)[complete].astype(int)
    # Group the 24 hourly values into 8 bins of 3 consecutive hours and sum each.
    binned = hourly.reshape(len(hourly), 8, 3).sum(axis=2)
    for j, label in enumerate(hour_names):
        column = np.full(len(tmp3), np.nan)
        column[complete] = binned[:, j]
        tmp3['popularity_by_hour_' + label] = column

    # Truncate the scalar numeric columns to whole numbers where present.
    for col in base_cols[1:]:
        present = tmp3[col].notna().to_numpy()
        if present.any():
            tmp3.loc[present, col] = tmp3.loc[present, col].astype(int).to_numpy()

    return tmp3


In [None]:
# The description at the top of this notebook says the patterns data is split
# across 30 part files, so read parts 1 through 30 inclusive.
# NOTE(review): assumes the parts are numbered part1..part30 -- confirm
# against the download folder.
n_parts = 30

# Collect the processed chunks and concatenate once at the end; concatenating
# inside the loop would re-copy the accumulated frame on every iteration.
processed_parts = []
for file_number in range(1, n_parts + 1):
    file = 'core_poi-geometry-patterns-part'+str(file_number)+'.csv'
    tmp = pd.read_csv(folder+file)
    # Keep only rows whose country code is 'US'.
    tmp2 = tmp.loc[tmp['iso_country_code']=='US']
    tmp4 = process_pattern_data(tmp2)
    processed_parts.append(tmp4)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
patterns = pd.concat(processed_parts, axis=0, ignore_index=True)

### Save patterns data

In [19]:
# Write the combined patterns table into the same folder as the part files.
patterns_path = folder + 'compiled_patterns.csv'
patterns.to_csv(patterns_path)

### Combine two compiled dfs

In [29]:
# Outer join on 'placekey': rows found in only one of the two tables are kept.
df_full = pd.merge(df, patterns, how='outer', on='placekey')
# The merged table goes one level up, in the 2022/10 directory.
merged_path = upper_folder + 'compiled_plus_patterns.csv'
df_full.to_csv(merged_path)