## How to get this data

In [1]:
def get_opening_hours_from_osm_data(path_to_osm_geojson):
    """
        path_to_osm_geojson (geojson): path to the geojson file. We get data from https://download.bbbike.org/osm/bbbike/ 
    """

    f = open(path_to_osm_geojson, "r")
    all_lines = f.readlines()
    print("Number of lines in file:", len(all_lines))

    line_index = 0
    opening_hours_indexes = []
    
    f = open(path_to_osm_geojson, "r")
    for x in f:
        if 'opening_hours' in x:
            opening_hours_indexes.append(line_index)
        line_index += 1
    print("Number of opening_hours entries:", len(opening_hours_indexes))
    return all_lines, opening_hours_indexes


def write_opening_hours_to_geojson(path_to_output_geojson, all_lines, opening_hours_indexes):
    """
        all_lines (list): all lines in the overall file
        
    """
    with open(path_to_output_geojson, "w+") as my_file:
        my_file.write(all_lines[0])
        for line in opening_hours_indexes[:-1]:
            my_file.write(all_lines[line])
        # remove comma from final line
        final_line = all_lines[opening_hours_indexes[-1]].replace(",\n","\n")
        my_file.write(final_line)
        my_file.write(all_lines[-1])
        my_file.close()


In [2]:
# all_lines, opening_hours_indexes = get_opening_hours_from_osm_data(path_to_osm_geojson="../Data/shapes/Montreal.osm.geojson")
# write_opening_hours_to_geojson(all_lines=all_lines, opening_hours_indexes=opening_hours_indexes, path_to_output_geojson="../Data/model_inputs/mtl_opening_hours.geojson")
##then
# COLUMNS_TO_USE = ['name','opening_hours','amenity','tourism', 'shop', 'healthcare', 'leisure', 'sport','craft', 'building','geometry']
# all_opening_hours[COLUMNS_TO_USE].to_csv('../Data/model_inputs/mtl_opening_hours_cleaned.csv',index=False, encoding='utf-8')

# Opening Hour analysis

In [3]:
import geopandas as gpd
import pandas as pd
import shapely
import datetime
import re
import numpy as np

In [4]:
all_opening_hours = gpd.read_file("../../Data/model_inputs/mtl_opening_hours.geojson")

X_all = pd.read_csv('../../Data/model_inputs/gdf_2017_X.csv')
y_all = pd.read_csv('../../Data/model_inputs/gdf_2017_y.csv')


In [5]:
all_opening_hours = all_opening_hours.dropna(subset=['opening_hours'])

In [6]:
all_opening_hours.head()

Unnamed: 0,url,name,phone,name:en,tourism,wikidata,wikipedia,wheelchair,alt_name:en,alt_name:fr,...,u-pick,building:part,fuel:octane_92,fuel:octane_98,layer,lit,service:vehicle:tyres,service:vehicle:suspension,playground:theme,geometry
0,http://www.mbam.qc.ca,Musée des Beaux-Arts de Montréal,+1 514 285 2000,Montreal Museum of Fine Arts,museum,Q860812,en:Montreal Museum of Fine Arts,yes,Museum of Fine Arts,Musée des Beaux-Arts,...,,,,,,,,,,POINT (-73.57940 45.49870)
1,,Noodles Star,+1-514-932-2888,,,,,limited,,,...,,,,,,,,,,POINT (-73.58042 45.49307)
2,,,,,,,,,,,...,,,,,,,,,,POINT (-73.51726 45.49226)
3,,Maxi,+1-450-672-3201,,,,,,,,...,,,,,,,,,,POINT (-73.46781 45.46025)
4,,Ben & Florentine,,,,,,,,,...,,,,,,,,,,POINT (-73.46731 45.46930)


In [7]:
all_opening_hours['amenity'].value_counts()

restaurant                354
cafe                      134
fast_food                  96
pharmacy                   59
bank                       55
bar                        41
library                    31
fuel                       28
pub                        18
bicycle_repair_station     16
clinic                     13
veterinary                 12
atm                        12
post_office                12
dentist                    11
car_rental                 10
ice_cream                  10
social_facility             8
parking                     7
bureau_de_change            6
community_centre            6
charging_station            6
marketplace                 4
shop|clothes                3
childcare                   3
driving_school              3
toilets                     3
nightclub                   3
recycling                   3
school                      2
theatre                     2
car_sharing                 2
food_court                  2
dojo      

In [8]:
for col in all_opening_hours.columns:
    print(col)

url
name
phone
name:en
tourism
wikidata
wikipedia
wheelchair
alt_name:en
alt_name:fr
opening_hours
amenity
cuisine
smoking
capacity
addr:city
addr:street
addr:postcode
addr:province
addr:housenumber
status
barrier
designation
shop
website
atm
ref
brand
brand:wikidata
brand:wikipedia
addr:suburb
internet_access
internet_access:fee
operator
dispensing
drive_through
wikipedia:fr
door
email
entrance
automatic_door
takeaway
outdoor_seating
level
name:fr
alt_name
addr:door
payment:cash
toilets:wheelchair
payment:debit_cards
payment:credit_cards
internet_access:operator
delivery
brand:en
brand:fr
official_name
official_name:en
official_name:fr
source
healthcare
leisure
organic
contact:email
contact:phone
contact:website
payment:visa
payment:interac
payment:mastercard
payment:american_express
ele
addr:country
note
contact:twitter
payment:bitcoin
contact:facebook
internet_access:ssid
description
office
country
name:de
addr:unit
consulate
diplomatic
opening_hours:covid19
air_conditioning
gst_num

In [9]:
all_opening_hours['opening_hours']

0                               We-Su 10:00-17:00; Mo off
1                                       Mo-Su 11:00-23:00
2                                              6:00-22:00
3                                       Mo-Su 08:00-22:00
4                       Mo-Sa 06:00-15:00, Su 07:00-15:00
                              ...                        
2044                 Mo-Fr 09:00-18:00; Sa-Su 09:00-17:00
2045                    Mo-Fr 08:30-18:00; Sa 08:00-13:00
2046    Apr-Nov: Th-Tu 08:00-17:00;Nov-Apr: Su 08:00-1...
2047                                           6:00-22:00
2048                                           6:00-22:00
Name: opening_hours, Length: 2042, dtype: object

In [10]:
all_opening_hours['shop'].value_counts()

supermarket    98
convenience    72
clothes        65
hairdresser    46
bakery         39
               ..
carpet          1
leather         1
car_parts       1
photo           1
tailor          1
Name: shop, Length: 107, dtype: int64

In [11]:
def change_crs_of_X(X, crs_from="EPSG:4236",crs_to="EPSG:3347"):
    """
        Function for translating the data into Canada Lambert projection 
        EPSG: 3347, so that the base unit is 1 m
    """
    new_X = X.copy()
    new_X['geometry'] = new_X.apply(lambda row: shapely.geometry.Point(row['long'],row['lat']),axis=1)
    new_X = gpd.GeoDataFrame(new_X,crs=crs_from)
    new_X = new_X.to_crs(crs_to)
    return new_X

In [12]:
## append end time to trips
start_end_times = gpd.read_file('../../Data/mtl_trajet/mtl_trajet_2017_final.shp')[['id_trip','starttime','endtime']]
X_all = X_all.merge(start_end_times, on='id_trip')

In [13]:
buffer_size_m = 50
geo_X = change_crs_of_X(X_all)

geo_X['buffers'] = geo_X['geometry'].apply(lambda row: row.buffer(buffer_size_m))

# get a geo-dataframe with only the Trip ID,trip end buffer and trip purpose    
only_buffers = geo_X[['id_trip','endtime','buffers']]
only_buffers =  gpd.GeoDataFrame(only_buffers.rename(columns={'buffers':'geometry'}), crs="EPSG:3347")

In [14]:
all_opening_hours = gpd.GeoDataFrame(all_opening_hours, crs='EPSG:4326')
all_opening_hours = all_opening_hours.to_crs('EPSG:3347')

In [15]:
def get_prop_open_hours(buffers, opening_hours):
    joined_data = gpd.sjoin(buffers, opening_hours, op='intersects', how='left')
    return joined_data

In [16]:
joined_data = get_prop_open_hours(only_buffers, all_opening_hours)

In [17]:
joined_data = joined_data.dropna(subset=['opening_hours'])

In [18]:
joined_data['endtime'] = pd.to_datetime(joined_data['endtime'])

In [19]:
IS_OPEN = ["24/7", "All Day, 7/24", "7/24"]
DAYOFWEEK_NAMES = {0:"Mo", 1:"Tu", 2:"We", 3:"Th", 4:"Fr", 5:"Sa", 6:"Su"}
MONTHS = ["Jan","Feb","Mar", "Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
LIST_DAYS = list(DAYOFWEEK_NAMES.values())
# EDGE CASES
# if no day
# Mo off; 
# Sa, Su off
# May-Dec
# 17:00+
# No time
# dim-lun
# until 22h
# 10:00AM to 5:00PM
# Mo-We 11:30-14:30, 17:30-22:30; Th 11:30-14:30, 17:30-23:30; Fr 11:30-14:30, 17:30-02:30; Sa 17:30-02:30; Su 17:30-22:30

In [20]:
datetime.datetime.strptime('03:55', '%H:%M').time() == datetime.datetime.strptime('03:55', '%H:%M').time()

True

In [21]:
def work_out_if_open(end_time, opening_time):
    day_of_week = DAYOFWEEK_NAMES[end_time.dayofweek]
    time_of_day = str(end_time.hour) + ':' + str(end_time.minute)
    
    ## check if , has been used instead of ; for splitting the opening times
    if len(re.findall('[0-9],', opening_time)) != 0:
        ## replace , with ; if so
        opening_time = change_comma_to_semicolon(opening_time)
        
    for opening_time_part in opening_time.split(';'):
        opening_time_part = opening_time_part.strip()
        # check if opening time string contains any days in it (will return True or False)
        all_days_not_in_opening_time = all([day not in opening_time_part for day in LIST_DAYS])
        ## 1. Check if just the time is provided or always open
        if all_days_not_in_opening_time:
            
            if opening_time_part in IS_OPEN:
                return True
            else:
                try:
                    bound_start, bound_end = extract_time(opening_time_part)
                except:
                    print('IGNORED', opening_time_part)
                    continue
                if within_opening_hours(bound_start, bound_end, time_of_day):
                    return True
                
        ## 2. Check day if open during
        all_days_in_opening_times = find_all_days_in_opening_time(opening_time_part)
        if day_of_week in all_days_in_opening_times:
            ## format string:
            opening_time_part = re.sub('[A-Z][a-z]-[A-Z][a-z]','',opening_time_part)
            opening_time_part = re.sub('[A-Z][a-z]','',opening_time_part)
            opening_time_part = opening_time_part.strip()
            try:
                bound_start, bound_end = extract_time(opening_time_part)
                if within_opening_hours(bound_start, bound_end, time_of_day):
                    return True
            except Exception as e:
                print(e, 'IGNORED', opening_time_part)
                continue
        else:
            continue

    return False


def change_comma_to_semicolon(opening_time):
    return re.sub('(?<=[0-9]),', ';', opening_time)
    

def extract_time(opening_time_part):
    split_parts = opening_time_part.split('-')
    ## Check there is two times
    try:
        assert len(split_parts) == 2
    except Exception as e:
        print(e)
    return split_parts



def find_all_days_in_opening_time(opening_time_part):
    all_days_found = []
    ## find all LIKE Mo-Fr and attach days
    part_of_str_to_ignore = []
    for day_range in re.finditer('[F-W][a-u]-[F-W][a-u]', opening_time_part):
        part_of_str_to_ignore.append(day_range.span())
        day_range = day_range.string.strip()
        try:
            start_range = key_from_value(DAYOFWEEK_NAMES, day_range[0:2])
            end_range = key_from_value(DAYOFWEEK_NAMES, day_range[3:5])
            if start_range == end_range:
                return list(DAYOFWEEK_NAMES.values())
            counter = 0
            while DAYOFWEEK_NAMES[(start_range+counter)%7] != DAYOFWEEK_NAMES[end_range]: 
                all_days_found.append(DAYOFWEEK_NAMES[(start_range+counter)%7])
                counter +=1
            all_days_found.append(DAYOFWEEK_NAMES[end_range])
            
        except Exception as e:
            print(e,  opening_time_part)
            continue
        
    ## make new string without day range
    part_of_str_to_ignore.sort()
    opening_time_part = remove_ignored_indexes(opening_time_part, part_of_str_to_ignore)
    all_days_found.extend(re.findall('[F-W][a-u]',opening_time_part))
    
    return all_days_found


def remove_ignored_indexes(orig_str, ignored_parts):
    if len(ignored_parts) == 0:
        return orig_str
    new_str = ""
    for i in range(len(ignored_parts)+1):
        if i == 0:
            new_str += orig_str[0:ignored_parts[i][0]]
        elif i == len(ignored_parts):
            new_str += orig_str[ignored_parts[i-1][1]:]
            break
        else:
            new_str += orig_str[ignored_parts[i-1][1]:ignored_parts[i][0]]
    return new_str
            
            
def key_from_value(the_dict, value):
    key = list(the_dict.keys())[list(the_dict.values()).index(value)]
    return key

def within_opening_hours(bound_start, bound_end, time_of_day):
    # if the opening time is 22:00 - 03:00 for example    
    if bound_start > bound_end:
        is_within = bound_start < time_of_day
    else:
        is_within = bound_start < time_of_day < bound_end
        
    return is_within

In [22]:
test_date = joined_data['endtime'].iloc[1]
test_date

Timestamp('2017-09-18 20:56:40')

In [23]:
test_ot = joined_data['opening_hours'].iloc[0]
test_ot

'Mo-Tu 09:00-18:00;We-Fr 09:00-21:00; Sa 10:00-17:00; Su 11:00-17:00'

In [24]:
joined_data['opened_at_time'] = joined_data.apply(lambda row: work_out_if_open(row['endtime'], row['opening_hours']), axis=1)


IGNORED PH closed

not enough values to unpack (expected 2, got 1) IGNORED off
'Ma' is not in list May-Dec Mo-We 09:00-18:00
'Ja' is not in list Jan-Apr We-Su 10:00-17:00

not enough values to unpack (expected 2, got 1) IGNORED 17:00+
' T' is not in list Su, Tu-Th 11:30-21:00

IGNORED PH closed

IGNORED dim-lun: 11:00 - 20:00 mar-sam: 10:00 - 23:00

too many values to unpack (expected 2) IGNORED -  09:00 - 22:00

too many values to unpack (expected 2) IGNORED -  09:00 - 22:00

IGNORED 7 days

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED until 22h
'Ma' is not in list Ma-We 10:00-18:00

IGNORED dim-lun: 11:00 - 20:00 mar-sam: 10:00 - 23:00

IGNORED dim-lun: 11:00 - 20:00 mar-sam: 10:00 - 23:00

IGNORED PH closed

IGNORED Appointment

IGNORED "seasonal"

not enough values to unpack (expected 2, got 1) IGNORED off

IGNORED 7 days

not enough values to unpack (expected 2, got 1) IGNORED osed

IGNORED PH closed

IGNORED 7 days

too many values to unpac

IGNORED PH closed

IGNORED M-F: 07:30-21:00

too many values to unpack (expected 2) IGNORED t-n 08:00-21:00

IGNORED 7 days

IGNORED until 22h

not enough values to unpack (expected 2, got 1) IGNORED off

IGNORED PH closed

IGNORED 7 days

not enough values to unpack (expected 2, got 1) IGNORED off

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED 7 days

IGNORED PH closed
'Au' is not in list Aug 16 - May 15: Mo-Fr 9:00-24:00

not enough values to unpack (expected 2, got 1) IGNORED closed

IGNORED PH closed

IGNORED PH closed

not enough values to unpack (expected 2, got 1) IGNORED off

IGNORED 7 days

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED PH closed

IGNORED "seasonal"
'No' is not in list No-Su 10:00-23:00

not enough values to unpack (expected 2, got 1) IGNORED 17:00+

IGNORED PH closed

IGNORED 10:00AM to 5:00PM

not enough values to unpack (expected 2, got 1) IGNORED off

IGNORED PH closed

IGNORED PH c

In [25]:
joined_data['opened_at_time'].value_counts()

False    10616
True     10420
Name: opened_at_time, dtype: int64

In [26]:
joined_data.amenity.isna().value_counts()

True     10639
False    10397
Name: amenity, dtype: int64

In [27]:
joined_data['amenity'].value_counts()

restaurant                3711
cafe                      1412
fast_food                  866
library                    619
bar                        595
pharmacy                   553
bank                       470
marketplace                438
ice_cream                  236
atm                        219
pub                        170
bureau_de_change           153
clinic                     119
post_office                 86
arts_centre                 82
nightclub                   64
shop|clothes                55
fuel                        49
dentist                     48
food_court                  48
veterinary                  46
parking                     43
car_rental                  39
bicycle_repair_station      34
school                      31
community_centre            31
hospital                    18
theatre                     16
car_wash                    16
coworking_space             15
childcare                   13
social_facility             12
bicycle_

In [28]:
all_amenities = {"Leisure":['bar', 'nightclub', 'library', 'pub', 'theatre', 'arts_centre', 'dojo'],
                "Food":["restaurant", 'cafe', 'ice_cream', 'food_court', 'pub', 'fast_food'],
                "Shopping":['shop|clothes','marketplace', 'car_wash', 'bicycle_repair_station', 'bicycle_rental'],
                "Education":['school', 'library', 'driving_school', 'hospital', 'language_school'],
                "Health":['clinic', 'doctors', 'dentist', 'hospital', 'pharmacy', 'veterinary', 'audiologist']}
## MIXED 'library' , 'pub', 'hospital' 'arts_centre'
## CONSIDER ADDING 'social_centre' 'community_centre' 'bureau_de_change' 'social_facility' 'car_rental'
## CONSIDER ADDING (cont) 'post_office'
## IGNORED 'vehicle_inspection', 'bicycle_parking'  'car_sharing' 'police', 'water_point' atm', bank  childcare 
## IGNORED (cont) 'internet_cafe' 'animal_boarding' 'coworking_space' parking ''fuel', embassy' animal_shelter

In [157]:
def categorise_amenities(value):
    """
        order is Leisure, Food, Shopping, Education, Health 
    """
    all_cats = []
    for cat in all_amenities.keys():
        if value in all_amenities[cat]:
            all_cats.append(1)
        else:
            all_cats.append(0)
            
    return all_cats


def calc_proportion(row):
    num_rows = len(row)
    total_vals = 0
    for ind in row.index:
        if JOINED_DATA_WITH_AMENTIES.iloc[ind]['opened_at_time']:
            total_vals += row[ind]
    ## calculate proportion
    return total_vals / num_rows

In [159]:
amenity_cols = pd.DataFrame(joined_data["amenity"].apply(categorise_amenities).apply(pd.Series))
amenity_cols.columns = ['Leisure','Food','Shopping', 'Education', 'Health']

In [160]:
## join back to data
JOINED_DATA_WITH_AMENTIES = pd.concat([joined_data, amenity_cols], axis=1).reset_index(drop=True)

In [161]:
opening_time_data = JOINED_DATA_WITH_AMENTIES.groupby(['id_trip']).agg({'Leisure':calc_proportion,
                                                    'Food':calc_proportion,
                                                    'Shopping':calc_proportion,
                                                    'Education':calc_proportion,
                                                    'Health':calc_proportion}).reset_index()

In [170]:
opening_time_data.head()

Unnamed: 0,id_trip,Leisure,Food,Shopping,Education,Health
0,7,0.0,0.0,0.0,0.0,0.0
1,148,0.0,0.0,0.0,0.0,0.0
2,502,0.0,0.0,0.0,0.0,0.0
3,585,0.0,0.0,0.0,0.0,0.0
4,715,0.0,0.0,0.0,0.0,1.0


In [168]:
for col in ['Leisure', 'Food', 'Shopping', 'Education', 'Health']:
    print(opening_time_data[col].value_counts())

0.000000    6190
1.000000     176
0.333333     127
0.500000     122
0.666667      68
0.166667      31
0.083333      30
0.142857      30
0.250000      28
0.117647      18
0.200000      12
0.111111      12
0.125000      11
0.055556      11
0.076923       7
0.058824       7
0.285714       4
0.133333       3
0.100000       2
0.400000       1
0.090909       1
Name: Leisure, dtype: int64
0.000000    4214
1.000000     825
0.500000     532
0.333333     364
0.250000     205
0.200000     118
0.166667      79
0.666667      77
0.111111      50
0.142857      49
0.083333      31
0.125000      28
0.076923      26
0.222222      25
0.294118      21
0.181818      21
0.750000      20
0.428571      18
0.375000      17
0.100000      17
0.285714      17
0.153846      16
0.090909      15
0.272727      13
0.600000      11
0.400000      11
0.266667      10
0.071429       8
0.363636       7
0.555556       5
0.055556       4
0.300000       4
0.571429       4
0.416667       4
0.230769       3
0.800000       3
0.1

In [176]:
## rename columns
opening_time_data.columns = ['id_trip', 'Leisure_opened', 'Food_opened', 'Shopping_opened',\
                             'Education_opened', 'Health_opened']

In [177]:
# opening_time_data.to_csv('../../Data/model_inputs/opening_time_by_purpose.csv', index=False)

In [179]:
opening_time_data.describe()

Unnamed: 0,id_trip,Leisure_opened,Food_opened,Shopping_opened,Education_opened,Health_opened
count,6891.0,6891.0,6891.0,6891.0,6891.0,6891.0
mean,238329.700189,0.051464,0.212509,0.017293,0.017577,0.023069
std,135793.486704,0.185752,0.337131,0.113854,0.114927,0.124706
min,7.0,0.0,0.0,0.0,0.0,0.0
25%,122189.5,0.0,0.0,0.0,0.0,0.0
50%,239644.0,0.0,0.0,0.0,0.0,0.0
75%,355554.5,0.0,0.333333,0.0,0.0,0.0
max,477230.0,1.0,1.0,1.0,1.0,1.0
