# Notebook for preparing necessary data for COM6101 group project

## Introduction

We want to start an italian restaurant in Hong Kong. We want to find the best location for our restaurant. as well as the menu we should offer. (to be added more details)

## Data Preparation

Here are the data we need to prepare for our project:
* 2021 Hong Kong Census data (source: https://data.gov.hk/en-data/dataset/hk-censtatd-census_geo-2021-population-census-by-ssg )
* Hong Kong Restaurant License (source: https://data.gov.hk/en-data/dataset/hk-fehd-fehdlmis-restaurant-licences )
* Hong Kong Pedestrian Network (source: https://opendata.esrichina.hk/datasets/48e295256fd84032a87b27000cea35cd/about)
* Openrice data (source: https://www.openrice.com/zh/hongkong/restaurants )
* Kaggle data (source: https://www.kaggle.com/gsnehaa21/yelp-dataset )
* <span style="background-color: #FF0000">Hong Kong Geocommunity (source: https://data.gov.hk/en-data/dataset/hk-landsd-openmap-development-hkms-digital-geocom/resource/0931b84b-f1c8-409f-9cd4-176a26645db0 ) (May not useful, as the data cannot match the time)</span>

## Data Processing

In this notebook, we will process the data to get the following information:

### HK Census Data

In [32]:
# transform HK Census Data from espg 2326 (HK coordinate system) to 4326 (WGS84)
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon

gpdf = 'data/2021_census/LSUG_21C.json'

class transform_LSUG:
    def __init__(self, gpdf, epsg=2326, output_path='data/2021_census'):
        self.output_path = output_path
        self.gpdf = gpd.read_file(gpdf)
        self.epsg = epsg
    def transform(self):
        self.gpdf = self.gpdf.set_crs(epsg=self.epsg, allow_override=True)
        self.gpdf = self.gpdf.to_crs(epsg=4326)
        self.gpdf['name'] = self.gpdf.index
        self.gpdf = self.gpdf.explode()
        self.gpdf['geometry'] = self.gpdf['geometry'].apply(lambda x: Polygon(x.exterior.coords))
        return self.gpdf
    def export(self):
        self.transform()
        return self.gpdf.to_file(self.output_path + '/LSUG_21C_lat_lng.json', driver='GeoJSON')

transform_LSUG(gpdf).export()

  self.gpdf = self.gpdf.explode()


Transform the HK Census data to preferred attributes:

In [33]:
import geopandas as gpd

LSUG = gpd.read_file('data/2021_census/LSUG_21C_lat_lng.json')
LSUG_copy = LSUG.copy()
LSUG_copy

Unnamed: 0,level_0,level_1,age_1,age_2,age_3,age_4,age_5,born_chi,born_else,born_hk,...,wp_n,wp_o,wp_p,wp_q,wp_r,wp_s,wp_se,wp_t,name,geometry
0,0,0,257,141,472,461,224,259,362,934,...,-,91,63,154,123,141,130,-,0,"POLYGON ((114.12554 22.28403, 114.12554 22.284..."
1,0,1,257,141,472,461,224,259,362,934,...,-,91,63,154,123,141,130,-,0,"POLYGON ((114.11267 22.28621, 114.11267 22.286..."
2,0,2,257,141,472,461,224,259,362,934,...,-,91,63,154,123,141,130,-,0,"POLYGON ((114.11683 22.28633, 114.11683 22.286..."
3,1,0,199,42,527,510,253,321,290,920,...,11,56,284,199,135,153,47,-,1,"POLYGON ((114.12668 22.28306, 114.12654 22.283..."
4,2,0,276,163,760,664,146,470,278,1261,...,62,36,250,187,203,101,50,25,2,"POLYGON ((114.12804 22.28329, 114.12791 22.283..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962,1743,0,153,159,689,686,339,400,108,1518,...,16,56,53,227,340,158,82,14,1743,"POLYGON ((114.04878 22.28376, 114.04878 22.283..."
1963,1743,1,153,159,689,686,339,400,108,1518,...,16,56,53,227,340,158,82,14,1743,"POLYGON ((114.04416 22.28705, 114.04416 22.287..."
1964,1744,0,102,157,636,748,440,343,118,1622,...,45,22,90,284,186,109,131,31,1744,"POLYGON ((114.03342 22.29006, 114.03335 22.290..."
1965,1744,1,102,157,636,748,440,343,118,1622,...,45,22,90,284,186,109,131,31,1744,"POLYGON ((114.04349 22.29217, 114.04346 22.292..."


In [34]:
'''
calculate the mean of age in each subnit group
age_1 represents number of people in age 0-14, age_2 represents number of people in age 15-24,
age_3 represents number of people in age 25-44, age_4 represents number of people in age 45 - 64
age_5 represents number of people in age 65 or above, assume it is 65-84
Method of finding the mean of age in each subunit group:
1. multiply the number of people in each age group by the middle value of the age group
2. sum up the result of each age group
3. divide the sum by the total number of people in the subunit group
'''
# input all '-' to 0 first
LSUG_copy[['age_1', 'age_2', 'age_3', 'age_4', 'age_5']] = LSUG_copy[['age_1', 'age_2', 'age_3', 'age_4', 'age_5']].replace('-', '0')

# change dtype from int to float
LSUG_copy[['age_1', 'age_2', 'age_3', 'age_4', 'age_5']] = LSUG_copy[['age_1', 'age_2', 'age_3', 'age_4', 'age_5']].astype('float')

LSUG_copy['mean_age'] = LSUG_copy['age_1'] * 7 + LSUG_copy['age_2'] * 19.5 + LSUG_copy['age_3'] * 34.5 + LSUG_copy['age_4'] * 54.5 + LSUG_copy['age_5'] * 74.5
# median age is the age of the middle person in the subunit group, find the median age of each subunit group first by dividing the total number of people in the subunit group by 2
LSUG_copy['median_age'] = (LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4'] + LSUG_copy['age_5']) / 2
LSUG_copy['median_age'] = LSUG_copy['median_age'].astype('int')

# look the median is situated in which age group, then find the middle value of the age group
LSUG_copy.loc[LSUG_copy['median_age'] <= LSUG_copy['age_1'], 'median_age'] = 7 # age_1
LSUG_copy.loc[(LSUG_copy['median_age'] > LSUG_copy['age_1']) & (LSUG_copy['median_age'] <= LSUG_copy['age_1'] + LSUG_copy['age_2']), 'median_age'] = 20 # age_2
LSUG_copy.loc[(LSUG_copy['median_age'] > LSUG_copy['age_1'] + LSUG_copy['age_2']) & (LSUG_copy['median_age'] <= LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3']), 'median_age'] = 35 # age_3
LSUG_copy.loc[(LSUG_copy['median_age'] > LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3']) & (LSUG_copy['median_age'] <= LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4']), 'median_age'] = 55 # age_4
LSUG_copy.loc[(LSUG_copy['median_age'] > LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4']) & (LSUG_copy['median_age'] <= LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4'] + LSUG_copy['age_5']), 'median_age'] = 75 # age_5

LSUG_copy['mean_age'] = LSUG_copy['mean_age'] / (LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4'] + LSUG_copy['age_5'])

# reduce the effect of outliers of mean age
LSUG_copy['mean_median_age'] = LSUG_copy['mean_age'] * 0.5 + LSUG_copy['median_age'] * 0.5
LSUG_copy[['mean_age', 'median_age', 'mean_median_age']].describe()

Unnamed: 0,mean_age,median_age,mean_median_age
count,1967.0,1967.0,1967.0
mean,43.942314,45.81088,44.876597
std,3.833918,10.043943,6.564367
min,29.875228,20.0,24.937614
25%,41.319854,35.0,38.249683
50%,43.811919,55.0,48.813066
75%,46.581901,55.0,50.781901
max,65.826733,75.0,70.413366


In [35]:
# total population is the sum of age_1, age_2, age_3, age_4, age_5
LSUG_copy['total_population'] = LSUG_copy['age_1'] + LSUG_copy['age_2'] + LSUG_copy['age_3'] + LSUG_copy['age_4'] + LSUG_copy['age_5']
LSUG_copy['total_population'].describe()

count     1967.000000
mean      4083.156075
std       4687.779911
min       1000.000000
25%       1595.500000
50%       2208.000000
75%       4311.500000
max      39767.000000
Name: total_population, dtype: float64

In [36]:
# median of income is in ma_hh
LSUG_copy['median_hs_income'] = LSUG_copy['ma_hh']
LSUG_copy['median_hs_income'].value_counts()

median_hs_income
63820    57
47850    43
17000    28
47070    19
20040    17
         ..
40560     1
86240     1
89870     1
62140     1
82290     1
Name: count, Length: 1286, dtype: int64

In [37]:
# mean household size: total_population / dh
# convert dh '-' to 1
LSUG_copy['dh'] = LSUG_copy['dh'].replace('-', '1')
LSUG_copy['dh'] = LSUG_copy['dh'].astype('float')
LSUG_copy['mean_hs_size'] = LSUG_copy['total_population'] / LSUG_copy['dh']
# for rows where 'dh' = 1, set mean_hs_size = 1
LSUG_copy.loc[LSUG_copy['dh'] == 1, 'mean_hs_size'] = 1
# find row where 'mean_hs_size' > 10, set mean_hs_size = 10
LSUG_copy.loc[LSUG_copy['mean_hs_size'] > 10, 'mean_hs_size'] = 10
LSUG_copy['mean_hs_size'].describe()

count    1967.000000
mean        2.854699
std         0.592379
min         1.000000
25%         2.484750
50%         2.782135
75%         3.095936
max        10.000000
Name: mean_hs_size, dtype: float64

In [38]:
# % in labour force: t_wp / total_population
# transform '-' to 0
LSUG_copy['t_wp'] = LSUG_copy['t_wp'].replace('-', '0')
# transform dtype from str to float
LSUG_copy['t_wp'] = LSUG_copy['t_wp'].astype('float')
LSUG_copy['labour_force_percent'] = LSUG_copy['t_wp'] / LSUG_copy['total_population']
LSUG_copy['labour_force_percent'].describe()

count    1967.000000
mean        0.510765
std         0.067052
min         0.000000
25%         0.471007
50%         0.515250
75%         0.555184
max         0.785621
Name: labour_force_percent, dtype: float64

In [39]:
# % of Chinese: ethn_chi / total_population
# transform '-' to 0
LSUG_copy['ethn_chi'] = LSUG_copy['ethn_chi'].replace('-', '0')
# transform dtype from str to float
LSUG_copy['ethn_chi'] = LSUG_copy['ethn_chi'].astype('float')
LSUG_copy['chinese_percent'] = LSUG_copy['ethn_chi'] / LSUG_copy['total_population']
LSUG_copy['chinese_percent'].describe()

count    1967.000000
mean        0.863458
std         0.133182
min         0.263777
25%         0.817618
50%         0.915462
75%         0.955576
max         1.000000
Name: chinese_percent, dtype: float64

In [40]:
# % of Male: pop_m / total_population
# transform '-' to 0
LSUG_copy['pop_m'] = LSUG_copy['pop_m'].replace('-', '0')
# transform dtype from str to float
LSUG_copy['pop_m'] = LSUG_copy['pop_m'].astype('float')
LSUG_copy['pop_m_percent'] = LSUG_copy['pop_m'] / LSUG_copy['total_population']
LSUG_copy['pop_m_percent'].describe()

count    1967.000000
mean        0.452532
std         0.044367
min         0.298851
25%         0.428301
50%         0.454599
75%         0.476916
max         0.888457
Name: pop_m_percent, dtype: float64

In [41]:
# % of owner owned property: ten_oc / LSUG_copy['dh']
# transform '-' to 0
LSUG_copy['ten_oc'] = LSUG_copy['ten_oc'].replace('-', '0')
# transform dtype from str to float
LSUG_copy['ten_oc'] = LSUG_copy['ten_oc'].astype('float')
LSUG_copy['ten_oc_percent'] = LSUG_copy['ten_oc'] / LSUG_copy['dh']
LSUG_copy['ten_oc_percent'].describe()

count    1967.000000
mean        0.532857
std         0.246771
min         0.000000
25%         0.410584
50%         0.550868
75%         0.721917
max         1.000000
Name: ten_oc_percent, dtype: float64

In [42]:
# rent to income ratio: dmr_ir
# transform from object to float
import pandas as pd
LSUG_copy['rent_to_income_ratio'] = pd.to_numeric(LSUG_copy['dmr_ir'], errors='coerce')
LSUG_copy.loc[(LSUG_copy['rent_to_income_ratio'] > 100), 'rent_to_income_ratio'] = 100
LSUG_copy['rent_to_income_ratio'] = LSUG_copy['rent_to_income_ratio'] / 100
LSUG_copy['rent_to_income_ratio'].describe()

count    1900.000000
mean        0.283352
std         0.117332
min         0.020000
25%         0.219000
50%         0.289000
75%         0.346000
max         1.000000
Name: rent_to_income_ratio, dtype: float64

In [50]:
# export used columns to json file, rename 'name' to 'census_subnit' first
LSUG_copy = LSUG_copy.rename(columns={'name': 'census_subnit'})
# used columns: 'subunit', 'mean_median_age', 'median_hs_income', 'mean_hs_size', 'labour_force_percent', 'chinese_percent', 'pop_m_percent', 'ten_oc_percent', 'rent_to_income_ratio', 'geometry'
LSUG_copy[['census_subnit', 'mean_median_age', 'median_hs_income', 'mean_hs_size', 'labour_force_percent', 'chinese_percent', 'pop_m_percent', 'ten_oc_percent', 'rent_to_income_ratio', 'geometry']].to_file('data/2021_census/LSUG_21C_cleaned.json', driver='GeoJSON')

In [51]:
# preview the cleaned data
LSUG_copy[['census_subnit', 'mean_median_age', 'median_hs_income', 'mean_hs_size', 'labour_force_percent', 'chinese_percent', 'pop_m_percent', 'ten_oc_percent', 'rent_to_income_ratio', 'geometry']].head()

Unnamed: 0,census_subnit,mean_median_age,median_hs_income,mean_hs_size,labour_force_percent,chinese_percent,pop_m_percent,ten_oc_percent,rent_to_income_ratio,geometry
0,0,37.643087,51370,3.031189,0.506109,0.779421,0.518971,0.670565,0.273,"POLYGON ((114.12554 22.28403, 114.12554 22.284..."
1,0,37.643087,51370,3.031189,0.506109,0.779421,0.518971,0.670565,0.273,"POLYGON ((114.11267 22.28621, 114.11267 22.286..."
2,0,37.643087,51370,3.031189,0.506109,0.779421,0.518971,0.670565,0.273,"POLYGON ((114.11683 22.28633, 114.11683 22.286..."
3,1,39.393207,55680,2.244868,0.640758,0.835402,0.430438,0.478006,0.295,"POLYGON ((114.12668 22.28306, 114.12654 22.283..."
4,2,37.011075,50040,2.272624,0.578895,0.83773,0.446491,0.573529,0.252,"POLYGON ((114.12804 22.28329, 114.12791 22.283..."


### Restaurant License Data

In [44]:
# transfer xml to csv for restaurant licence data
import os
import xml.etree.ElementTree as Xet
import pandas as pd

folder_path = 'data/restaurant_license/xml'
output_folder = 'data/restaurant_license/csv'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

cols = ["TYPE", "DIST", "LICNO", "SS", "ADR", "INFO", "EXPDATE"]

# Iterate through all XML files in the folder_path
for filename in os.listdir(folder_path):
    if filename.endswith(".XML"):
        xml_file = os.path.join(folder_path, filename)
        csv_file = os.path.join(output_folder, os.path.splitext(filename)[0][1:] + ".csv")
        rows = []
        xmlparse = Xet.parse(xml_file)
        # The data are stored in the <LPS> tag
        root = xmlparse.getroot().find("LPS")

        for i in root:
            
            TYPE = i.find("TYPE").text
            DIST = i.find("DIST").text
            LICNO = i.find("LICNO").text
            SS = i.find("SS").text
            ADR = i.find("ADR").text
            INFO = i.find("INFO").text
            EXPDATE = i.find("EXPDATE").text
            rows.append({"TYPE": TYPE, "DIST": DIST, "LICNO": LICNO, 
                         "SS": SS, "ADR": ADR, "INFO": INFO, "EXPDATE": EXPDATE})

        df = pd.DataFrame(rows, columns=cols)
        df.to_csv(csv_file, index=False)
        print(f"Converted {filename} to {os.path.splitext(filename)[0][1:] + '.csv'}")

print("XML to CSV conversion completed.")

Converted 20230101-1033-LP_Restaurants_EN.XML to 0230101-1033-LP_Restaurants_EN.csv
Converted 20240101-1050-LP_Restaurants_EN.XML to 0240101-1050-LP_Restaurants_EN.csv
XML to CSV conversion completed.


In [46]:
# geocoding for restaurant licence data
import os
import pandas as pd
import geocoder

API_KEY = "AAPKae2a40767592460cb54f98756c8579053czHfV2nsyRUZFSbUVLCBS1poOYx5GBSCwaj9nhJL3yjZggs94pMrjDgZtoZhg-0"

folder_path = 'data/restaurant_license/csv'
output_folder = 'data/restaurant_license/geo'

# create output folder if not exist
if not os.path.exists(output_folder):
    os.mkdir(output_folder)

# Iterate through all CSV files in the folder_path
# columns: TYPE, DIST, LICNO, SS, ADR, INFO, EXPDATE
# column ADR is the address

# use old csv file to reference if address already exist
old_file = 'data/restaurant_license/geo/geo_full.csv'

for filename in os.listdir(folder_path):
    # only retrive csv file and file not end with .finished.csv
    if filename.endswith(".csv") and not filename.endswith(".finished.csv"):
        print(f'start converting {filename} to geo location')
        csv_file = os.path.join(folder_path, filename)
        geo_file = os.path.join(output_folder, filename + ".geo.csv")
        # find rows if ADR contains ['KWAI CHUNG"] or ['TSUEN WAN']
        # transform ADR to upper case first
        df = pd.read_csv(csv_file, encoding='utf-8')
        df['ADR'] = df['ADR'].str.upper()
        df = df.loc[lambda df: df['ADR'].str.contains('KWAI CHUNG') | df['ADR'].str.contains('TSUEN WAN')]
        # add new columns for geo location
        lat = []
        lng = []
        df.reset_index(inplace=True, drop=True)
        # check if df['ADR'] == old_df['ADR']
        old_df = pd.read_csv(old_file, encoding='utf-8')
        for i in range(len(df)):
            if df['ADR'][i] in old_df['ADR'].values:
                # if address already exist in old_df, then use the geo location in old_df
                lat.append(old_df.loc[old_df['ADR'] == df['ADR'][i], 'lat'].iloc[0])
                lng.append(old_df.loc[old_df['ADR'] == df['ADR'][i], 'lng'].iloc[0])
                print(f"Converted {df['ADR'][i]} to {old_df.loc[old_df['ADR'] == df['ADR'][i], 'lat'].iloc[0]}, {old_df.loc[old_df['ADR'] == df['ADR'][i], 'lng'].iloc[0]}, by referencing old csv file")
                continue
            g = geocoder.arcgis(df['ADR'][i], key = API_KEY)
            lat.append(g.lat)
            lng.append(g.lng)
            print(f"Converted {df['ADR'][i]} to {g.lat}, {g.lng}")
        df['lat'] = lat
        df['lng'] = lng
        df.to_csv(geo_file, index=False)
        # rename the original csv file to finished
        os.rename(csv_file, os.path.join(folder_path, filename + "_finished.csv"))
        print(f"Converted {filename} to {filename}")
print("All csv files converted to geo location")

start converting 0230101-1033-LP_Restaurants_EN.csv to geo location
Converted 290, G/F., 290-292, M/F, SHA TSUI ROAD, TSUEN WAN, NEW TERRITORIES to 22.36932735534819, 114.117078387233, by referencing old csv file
Converted 15-16, WING KA HOUSE, FUK LOI ESTATE, G/F., TSUEN WAN, NEW TERRITORIES to 22.373866994108013, 114.1124345614167, by referencing old csv file
Converted NO. 9 SAN TSUEN MAIN STREET, G/F & 1/F, SHAM TSENG, TSUEN WAN, NEW TERRITORIES to 22.353245961644333, 114.10051331580638, by referencing old csv file
Converted 13 MAIN STREET, SAN TSUEN, SHAM TSENG, TSUEN WAN, NEW TERRITORIES to 22.37008224800008, 114.05948684100008, by referencing old csv file
Converted YUEN YUEN INSTITUTE, LO WAI, TSUEN WAN, NEW TERRITORIES to 22.383020000000045, 114.12272000000009, by referencing old csv file
Converted NO. 12-14 HOI PA STREET, G/F, TSUEN WAN, NEW TERRITORIES to 22.3738310357434, 114.11179569275669, by referencing old csv file
Converted NO. 294 SHA TSUI ROAD, G/F & M/F, TSUEN WAN, NE

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'data/restaurant_license/csv\\0230101-1033-LP_Restaurants_EN.csv' -> 'data/restaurant_license/csv\\0230101-1033-LP_Restaurants_EN.csv_finished.csv'

In [None]:
# map the restaurant licence data to its relative HK census polygon TPU
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import os

class map_census_polygon_restaurant:
    def __init__(self, census_polygon, restaurant_path, output_folder, restaurant_name):
        self.census_polygon = gpd.read_file(census_polygon)
        self.restaurant_path = restaurant_path
        self.restaurant_name = restaurant_name
        self.output_folder = output_folder
    def map(self):
        restaurant = pd.read_csv(self.restaurant_path + self.restaurant_name)
        # if restaurant['lat'] and restaurant['lng'] are contained in census_polygon['geometry'], then the restaurant is in the census polygon
        try:
            restaurant['census_subnit'] = restaurant.apply(lambda row: self.census_polygon[self.census_polygon['geometry'].contains(Point(row['lng'], row['lat']))]['name'].values[0], axis=1) 
        except:
            restaurant['census_subnit'] = restaurant.apply(lambda row: self.census_polygon[self.census_polygon['geometry'].contains(Point(row['lng'], row['lat']))]['name'].values, axis=1)
        # if restaurant['census_subnit'] is a list, then only keep the first element
        restaurant['census_subnit'] = restaurant['census_subnit'].str[0]
        # export the mapped restaurant to csv, named as the file name of the original restaurant csv file to output_folder
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder, exist_ok=True)
        return restaurant.to_csv(self.output_folder + self.restaurant_name.replace('.csv.geo.csv', '') + '_geo_mapped.csv', index=False), print(f"Restaurant {self.restaurant_name} mapped to census polygon")
    
# loop all the restaurant csv files in folder restaurant-info-hk/geo, and map them to census polygon
census_polygon = 'data/2021_census/LSUG_21C_lat_lng.json'
restaurant_path = 'data/restaurant_license/geo/'
output_folder = 'data/restaurant_license/geo_mapped/'
for restaurant_name in os.listdir(restaurant_path):
    map_census_polygon_restaurant(census_polygon, restaurant_path, output_folder, restaurant_name).map()

Restaurant 0230101-1033-LP_Restaurants_EN.csv.geo.csv mapped to census polygon
Restaurant geo_full.csv mapped to census polygon
Restaurant 0240101-1050-LP_Restaurants_EN.csv.geo.csv mapped to census polygon


In [49]:
# preview the mapped restaurant data
restaurant_mapped = pd.read_csv('data/restaurant_license/geo_mapped/0230101-1033-LP_Restaurants_EN_geo_mapped.csv')
restaurant_mapped.head()

Unnamed: 0,TYPE,DIST,LICNO,SS,ADR,INFO,EXPDATE,lat,lng,census_subnit
0,RL,92,2292000010,東江,"290, G/F., 290-292, M/F, SHA TSUI ROAD, TSUEN ...",,2023-06-30,22.369327,114.117078,1006.0
1,RL,92,2292000047,海蓮,"15-16, WING KA HOUSE, FUK LOI ESTATE, G/F., TS...",,2023-06-30,22.373867,114.112435,989.0
2,RL,92,2292000056,裕記,"NO. 9 SAN TSUEN MAIN STREET, G/F & 1/F, SHAM T...",,2023-06-30,22.353246,114.100513,1072.0
3,RL,92,2292000065,能記飯店,"13 MAIN STREET, SAN TSUEN, SHAM TSENG, TSUEN W...",,2023-06-30,22.370082,114.059487,1061.0
4,RL,92,2292000074,圓玄學院,"YUEN YUEN INSTITUTE, LO WAI, TSUEN WAN, NEW TE...",,2023-06-30,22.38302,114.12272,973.0


## Openrice Data to get the type of restaurant in the restaurant license list

In [None]:
# reference to the web scrapping code

## Machine Learning Model

In [None]:
# as per above logic, build an ANN model with 17 neurons, and 1 output layer with sigmoid function to determine which weight is the best for predicting the survive rate
# use 10 fold cross validation to train the model, and use the average accuracy as the final accuracy
# training datas: C:\Competition\coding-with-git-repo\open-data-hackthon\becky\training-model\prepared-data\training-data
# valid datas: C:\Competition\coding-with-git-repo\open-data-hackthon\becky\training-model\prepared-data\valid-data t
import os
import time
import pandas as pd
import json
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from keras import optimizers
from keras.models import Sequential #used for model building
from keras.layers import Dense #used for creating layers
from scikeras.wrappers import KerasClassifier #used for scikit learn wrapper
from sklearn.model_selection import StratifiedKFold #Stratified K Fold
from sklearn.model_selection import GridSearchCV #Grid Searching
from sklearn.metrics import confusion_matrix #import confusion matrix
from sklearn.metrics import roc_curve, roc_auc_score #import ROC curve tools
from itertools import product
from sklearn.model_selection import KFold


# function to load the test data
def load_test_data(test_data_path):
    test_data = pd.read_csv(test_data_path)
    # return the 7th and the second last column of the test data
    # return the 7th and the second last column of the test data
    test_data = test_data.iloc[:,7:]
    test_data = test_data.iloc[:,:-1]
    test_data = test_data.fillna(test_data.mean())
    # reset index
    test_data = test_data.reset_index()
    return test_data
  
# function to load the desire output training data
def load_training_data(training_data_path):
    training_data = pd.read_csv(training_data_path)
    training_data = training_data.iloc[:,-1]
    # fill the missing data with the mode of the column
    training_data = training_data.fillna(training_data.mode())
    # reset index
    training_data = training_data.reset_index()
    # drop the index column
    training_data = training_data.iloc[:,-1]
    return training_data

# open the training data folder, in my PC path: C:\Competition\coding-with-git-repo\open-data-hackthon\becky\training-model\prepared-data\master-data\RL.csv
# in my cloud path: '/workspaces/open-data-hackthon/becky/training-model/prepared-data/master-data/RL.csv'
import os
training_data_folder = 'C:\\Competition\\coding-with-git-repo\\open-data-hackthon\\becky\\training-model\\prepared-data\\master-data\\RL.csv'


# function to create the model
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
neurons2 = 0
learn_rate2 = 0
decay2 = 0
momentum2 = 0
nesterov2 = True
epochs2 = 0
counterx = 1

def ANN_model(neurons, learn_rate, decay, momentum, nesterov, init, epochs, IV):
    global neurons2, learn_rate2, decay2, momentum2
    global nesterov2, epochs2, counterx
    if((neurons != neurons2) or (learn_rate != learn_rate2) or
        (decay != decay2) or (momentum != momentum2) or
        (nesterov != nesterov2) or (epochs != epochs2)):
        if counterx <= testcomblen:
            print(counterx, "of", testcomblen, '\t',
            "Neurons:", neurons,
            "-- Learn Rate:", learn_rate,
            "-- Decay:", decay,
            "-- Momentum:", momentum,
            "-- Nesterov:", nesterov,
            "-- Epochs:", epochs)
        # create model
        model = Sequential()
        # add hidden layer
        model.add(Dense(neurons,kernel_initializer = init,use_bias = False, activation = 'relu',input_dim=IV))
        # add output layer
        model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
        # compile optimizer
        optimizer = optimizers.SGD(lr=learn_rate, decay=decay, momentum=momentum, nesterov=nesterov)
        # compile model
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        return model

# function to train the model
if __name__ == "__main__":
    tempmean = []
    tempstd = []
    IV = []
    training_data = load_test_data(training_data_folder)
    training_data_output = load_training_data(training_data_folder)
    # directory to save the best parameters for the model is 'model-training-result'
    # if the directory does not exist, then create the directory
    desktop = 'model-training-result'
    if not os.path.exists('model-training-result'):
        os.makedirs('model-training-result')
    # file paths for saving the best parameters for the model
    final_model_params = desktop + "\\final_model_params.json"
    final_model_json = desktop + "\\final_model_json.json"
    final_model_weights = desktop + "\\final_model_weights.h5"
    final_model_complete = desktop + "\\final_model_complete.h5"
    final_model_layer_1_weights = desktop + "\\final_model_layer_1_weights.csv"
    final_model_layer_2_weights = desktop + "\\final_model_layer_2_weights.csv"
    final_model_mean_stdev = desktop + "\\final_model_mean_stdev.csv"
    final_model_probabilities = desktop + "\\final_model_probabilities.csv"
    
    # get column headings
    column_headings = list(training_data.columns.values)
    # get mean and standard deviation of each column
    tempmean.clear()
    tempstd.clear()
    for i in column_headings:
        tempmean.append(training_data[i].mean())
        tempstd.append(training_data[i].std())
    # standardize the data
    d = {'Column Headings': column_headings, 'Mean': tempmean, 'Standard Deviation': tempstd}
    Mean_Stdev = pd.DataFrame(d)
    
    # number of independent variables
    num_X = training_data.shape[1]
    IV.append(num_X)
    
    #get start time
    start_time = time.time()
    
    #Find ideal parameters
    estimator = KerasClassifier(model = ANN_model,
                                init = 'glorot_uniform', nesterov = [True, False], neurons = [24, 12, 25, 49],
                                learn_rate = [0.1, 0.01, 0.001, 0.005], momentum = [0.1, 0.5, 0.9, 0.99],
                                decay = [0.0, 0.1, 0.01, 0.001], epochs = [1, 10, 50, 100, 500], IV = 18, verbose = 0)
    
    init = ['glorot_uniform']
    nesterov = [True, False]
    neurons = [24, 12, 25, 49]
    learn_rate = [0.1, 0.01, 0.001, 0.005]
    momentum = [0.1, 0.5, 0.9, 0.99]
    decay = [0.0, 0.1, 0.01, 0.001]
    epochs = [1, 10, 50, 100, 500]
    
    #Find the number of different combinations
    testcomb = product(neurons, learn_rate, momentum, nesterov, decay, epochs)
    testcomblen = len(list(testcomb))
    
    print("Running Stratified K Fold Testing...")
    param_grid = dict(
    neurons = neurons, 
    decay = decay, 
    learn_rate = learn_rate, 
    momentum = momentum, 
    nesterov = nesterov, 
    epochs = epochs, 
    init = init, 
    IV = IV)
    SKfold= KFold(n_splits=5, shuffle= True)
    
    #cv cross-validation generator to use a 10x StatifiedKFold
    grid = GridSearchCV(
    estimator = estimator, 
    param_grid = param_grid, 
    cv = SKfold
    )
    
    #Find the data to the model
    grid_result = grid.fit(X = training_data, y = training_data_output)
    
    ##Get finished time
    finish_time = time.time()
    total_time = finish_time - start_time
    minutes, seconds = divmod(total_time, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)
    print("\n" + "Finished in %.0f" %days + " days %.0f" %hours + 
    " hours %.0f" %minutes + " minutes %.0f" %seconds + " seconds")
    
    print(
    "\n" + 
    "Best Parameter Accuracy: %.2f%%" %(grid_result.best_score_ * 100)
    )
    bp_dict = grid_result.best_params_
    bp_init = bp_dict["init"]
    bp_neurons = bp_dict["neurons"]
    bp_learn_rate = bp_dict["learn_rate"]
    bp_momentum = bp_dict["momentum"]
    bp_decay = bp_dict["decay"]
    bp_nesterov = bp_dict["nesterov"]
    bp_epochs = bp_dict["epochs"]
    bp_IV = num_X
    
    print("\n" + "Top Parameters For Final Model:")
    print(" Neurons" + '\t' + ": " + str(bp_neurons))
    print(" Learn Rate" + '\t' + ": " + str(bp_learn_rate))
    print(" Momentum" + '\t' + ": " + str(bp_momentum))
    print(" Decay" + '\t' + '\t' + ": " + str(bp_decay))
    print(" Nesterov" + '\t' + ": " + str(bp_nesterov))
    print(" Epochs" + '\t' + ": " + str(bp_epochs))
    
    ##Set final model parameters based on testing and run model
    sk_params = {
    'init': bp_init,
    'epochs': bp_epochs, 
    'neurons': bp_neurons, 
    'learn_rate': bp_learn_rate, 
    'decay': bp_decay, 
    'momentum':bp_momentum, 
    'nesterov':bp_nesterov, 
    'IV':bp_IV
    }
    #Set the parameters based off of the best found
    final_estimator = KerasClassifier(
    build_fn = ANN_model, 
    **sk_params, 
    verbose = 0
    )
    #Fit the final model based off of the training_data and training_data_output
    history = final_estimator.fit(training_data, training_data_output)
    scores = final_estimator.model.evaluate(training_data, training_data_output, verbose = 0)
    ##Get the final weights used
    #place weights in numpy array
    weights_layer_1 = final_estimator.model.layers[0].get_weights()[0] 
    weights_layer_2 = final_estimator.model.layers[1].get_weights()[0]
    #convert numpy array to DataFrame
    weights_layer_1 = pd.DataFrame(weights_layer_1) 
    weights_layer_2 = pd.DataFrame(weights_layer_2)
    ##Get the actual scores of the model and the classes
    #actual predicted scores of the DV based on the IV
    pred_actual = final_estimator.model.predict(training_data) 
    #place predicted results in a DataFrame
    pred_actual = pd.DataFrame(pred_actual, columns=['Predicted Probability']) 
    #classes of the predicted scores of the DV based on the IV
    pred_class = final_estimator.model.predict_classes(training_data) 
    #place predicted results in a DataFrame
    pred_class = pd.DataFrame(pred_class) 
    print("\n" + "Model Evaluation: %.2f%%" % (scores[1]*100))

    ##Save files
    #JSON files
    b_file = open(final_model_param, "w")
    
    json.dump(sk_params, b_file)
    b_file.close()
    final_json = final_estimator.model.to_json()
    with open(final_model_json, "w") as json_file:
        json_file.write(final_json)
    #H5 files
    final_estimator.model.save_weights(final_model_weight)
    final_estimator.model.save(final_model_complete)
    #CSV files
    weights_layer_1.to_csv(final_model_layer_1_weights, index = False)
    weights_layer_2.to_csv(final_model_layer_2_weights, index = False)
    Mean_Stdev.to_csv(final_model_mean_stdev, index = False)
    pred_actual.to_csv(final_model_probabilities, index = False)
    print("\n" + "Saved 8 files." + "\n") 
    ##Calculate AIC & BIC
    n = len(training_data_output)
    k = IV[0] * bp_neurons + bp_neurons
    y2 = pred_actual.T.squeeze()
    LogL = sum(training_data_output * np.log(y2) + (1 - training_data_output) * np.log(1 - y2))
    AIC = -2 * (LogL - k)
    BIC = -2 * LogL + np.log(n) * k
    print("AIC:", AIC)
    print("BIC:", BIC, "\n")
    ##Confusion Matrix
    tn, fp, fn, tp = confusion_matrix(training_data_output, pred_class).ravel()
    print("Confusion Matrix")
    print("\t", "\t", "Predicted")
    print("\t", "\t", "0", "\t", "1", "\t", "Total")
    print("Actual", "\t", "0", "\t", tn, "\t", fp, "\t", tn+fp)
    print("\t", "1", "\t", fn, "\t", tp, "\t", fn+tp)
    print("\t", "Total", "\t", tn+fn, "\t", fp+tp)
    ##Calculate the AUROC score
    auc = roc_auc_score(training_data_output, pred_actual)
    print('\n' + "AUC: %.6f" %auc)
    
    ##Create ROC curve
    FPR, TPR, _ = roc_curve(training_data_output, pred_actual) #get the rates for the curve
    figure(num=1, figsize=(7, 7), edgecolor = 'w')
    plt.plot(FPR, TPR, marker='.', ms=0.05, color='r')
    plt.plot([0,1], [0,1], '--', color='black') #create a diagonal dashed line
    plt.title('Artificial Neural Network - ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

