In [2]:
import pandas as pd
from datetime import time
from database.db_utils import get_connection
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

### The smallest unit for which availability has to be measured is one hour,
### so split the range max(timestamp) - min(timestamp) into 1-hour batches for each day
### and look for available data in each batch.
### If the store was active, mark the whole hour as available.
### If the store was not active, or no data is available, mark the whole hour as not available.

In [3]:
# Open a database handle through the project helper.
engine = get_connection()

# Read each table in full into its own DataFrame. The generator is consumed
# left to right by the unpacking, so the tables are read in the order listed.
store_status_df, time_zone_df, menu_hours_df, reports_df = (
    pd.read_sql_table(table_name, con=engine)
    for table_name in ('store_status', 'time_zone', 'menu_hours', 'reports')
)

In [4]:
# Timezone assumed for stores that have no `timezone_str` after the join.
DEFAULT_TIMEZONE = 'America/Chicago'


def to_local_timestamps(utc_timestamps: pd.Series, timezone_names: pd.Series) -> pd.Series:
    """Convert UTC timestamps to the timezone named on the same row.

    Parameters
    ----------
    utc_timestamps : pd.Series
        Timezone-aware (UTC) datetimes. The index must be unique, because the
        result is re-aligned to it.
    timezone_names : pd.Series
        Timezone names (e.g. ``'America/Chicago'``), aligned with
        ``utc_timestamps``. Rows whose name is missing come back as NaN.

    Returns
    -------
    pd.Series
        Object-dtype Series of tz-aware ``Timestamp`` values in the original
        row order. Object dtype is used because one column may hold several
        different timezones.
    """
    # One vectorised tz_convert per distinct timezone instead of one Python
    # call per row.
    local_chunks = [
        chunk.dt.tz_convert(tz_name).astype(object)
        for tz_name, chunk in utc_timestamps.groupby(timezone_names)
    ]
    if not local_chunks:
        # pd.concat refuses an empty list; return an empty/all-NaN result instead.
        return pd.Series(index=utc_timestamps.index, dtype=object)
    return pd.concat(local_chunks).reindex(utc_timestamps.index)


# Attach menu hours and timezone to every status observation.
# NOTE(review): if `menu_hours` holds several rows per store, this left join
# repeats each observation once per menu-hours row -- confirm that later cells
# really need this fan-out, since it multiplies the frame size.
merged_df = pd.merge(store_status_df, menu_hours_df, on='store_id', how='left')
merged_df = pd.merge(merged_df, time_zone_df, on='store_id', how='left')

# utc=True accepts naive values (treated as UTC) as well as values that already
# carry a timezone; to_datetime followed by tz_localize('UTC') raises a
# TypeError on the latter.
merged_df['timestamp_utc'] = pd.to_datetime(merged_df['timestamp_utc'], utc=True)

merged_df['timezone_str'] = merged_df['timezone_str'].fillna(DEFAULT_TIMEZONE)
merged_df['timestamp_local'] = to_local_timestamps(
    merged_df['timestamp_utc'], merged_df['timezone_str']
)

# First and last local observation time for each store.
grouped = (
    merged_df
    .groupby(by=['store_id'])
    .agg(min_value=('timestamp_local', 'min'), max_value=('timestamp_local', 'max'))
)

print(grouped)

               store_id  day end_time_local start_time_local
0    646862635677872649    0       01:00:00         00:00:00
1    646862635677872649    5       01:00:00         00:00:00
2    646862635677872649    2       01:00:00         00:00:00
3    646862635677872649    6       01:00:00         00:00:00
4    646862635677872649    4       01:00:00         00:00:00
..                  ...  ...            ...              ...
95  5002848469100396071    2       02:00:00         00:00:00
96  5002848469100396071    4       02:00:00         00:00:00
97  5002848469100396071    0       02:00:00         00:00:00
98  5002848469100396071    1       02:00:00         00:00:00
99  5002848469100396071    5       02:00:00         00:00:00

[100 rows x 4 columns]
               store_id  day end_time_local start_time_local
0   5002848469100396071    6       02:00:00         00:00:00
1   5002848469100396071    3       02:00:00         00:00:00
2    422095329085009175    0       02:00:00         00:00:00


AttributeError: 'generator' object has no attribute 'head'