In [21]:
import pandas as pd
import os
import unicodedata
import geopandas as gpd
import matplotlib.pyplot as plt
import googlemaps
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN
import numpy as np
from shapely.ops import nearest_points
from geopandas import GeoSeries
import folium

In [11]:
# Define constants
FILE_PATHS = {
    'aed': '~/Documents/Datathon files/aed_locations.xlsx',
    'cad9': '~/Documents/Datathon files/cad9.xlsx',
    'ambulance': '~/Documents/Datathon files/ambulance_locations.xlsx',
    'interventions_bxl1': '~/Documents/Datathon files/interventions_bxl.xlsx',
    'interventions_bxl2': '~/Documents/Datathon files/interventions_bxl2.xlsx',
    'interventions1': '~/Documents/Datathon files/interventions1.xlsx',
    'interventions2': '~/Documents/Datathon files/interventions2.xlsx',
    'interventions3': '~/Documents/Datathon files/interventions3.xlsx',
    'mug': '~/Documents/Datathon files/mug_locations.xlsx'
}
GOOGLE_MAPS_API_KEY = 'AIzaSyCHT_UZ49tsasFakgDUVha05snFVsUbq-M'
COST_PER_AED = 1500
INSTALLATION_COST_PER_AED = 250
ANNUAL_MAINTENANCE_COST_PER_AED = 75
MIN_DISTANCE_THRESHOLD = 0.1

In [12]:
# Step 1: Read data
def read_data(file_paths):
    data = {name: pd.read_excel(os.path.expanduser(path)) for name, path in file_paths.items()}
    return data

In [22]:
# Custom transformer for preprocessing AED data
class AEDPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['type'] = X['type'].str.lower()
        X['address'] = X['address'].str.lower()
        X['municipality'] = X['municipality'].str.lower()
        X['province'] = X['province'].str.lower()
        X = X.fillna('unknown').drop_duplicates()
        X['address'] = X['address'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
        X['municipality'] = X['municipality'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
        X['province'] = X['province'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
        X['location_aed'] = X['address'] + ',' + X['postal_code'].astype(str) + ',' + X['municipality'] + ',' + 'Belgium'
        X.drop(columns=['type', 'location', 'public', 'available', 'hours', 'number', 'postal_code', 'municipality', 'address'], inplace=True)
        return X

In [23]:
# Custom transformer for geocoding addresses
class Geocoder(BaseEstimator, TransformerMixin):
    def __init__(self, api_key):
        self.gmaps_client = googlemaps.Client(key=api_key)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def geocode_address(address):
            result = self.gmaps_client.geocode(address)
            if result:
                location = result[0]['geometry']['location']
                return location['lat'], location['lng']
            else:
                return None, None
        X['latitude'], X['longitude'] = zip(*X['location_aed'].apply(geocode_address))
        return X

In [24]:
# Custom transformer for combining and filtering intervention data
class CombineAndFilterInterventions(BaseEstimator, TransformerMixin):
    def __init__(self, data):
        self.data = data

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        interventions_df = pd.concat([self.data['interventions1'], self.data['interventions2'], self.data['interventions3']])
        filtered_cad9 = self.data['cad9'][self.data['cad9']['EventType Trip'].isin(['P003 - HARTSTILSTAND - DOOD - OVERLEDEN', 
                                                                                   'P008 - PATIËNT MET DEFIBRILLATOR OF PACEMAKER', 
                                                                                   'P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST)'])]
        filtered_interventions = interventions_df[interventions_df['EventType Trip'].isin(['P039 - Cardiac problem (other than thoracic pain)', 
                                                                                          'P003 - Cardiac arrest', 
                                                                                          'P008 - Patient with defibrillator - pacemaker'])]
        filtered_interventions_bxl1 = self.data['interventions_bxl1'][self.data['interventions_bxl1']['eventtype_trip'].isin(['P039 - Cardiac problem (other than thoracic pain)',
                                                                                                                            'P003 - Cardiac arrest', 
                                                                                                                            'P008 - Patient with defibrillator - pacemaker'])]
        temp = self.data['interventions_bxl2']['EventType and EventLevel'].str.extract(r'(.*)(N[0-9]{2})(.*)')
        self.data['interventions_bxl2']['EventType'] = temp[0].str.strip() + temp[2]
        self.data['interventions_bxl2']['EventLevel'] = temp[1].str.strip()
        filtered_interventions_bxl2 = self.data['interventions_bxl2'][self.data['interventions_bxl2']['EventType'].isin(['P003 - HARTSTILSTAND - DOOD - OVERLEDEN', 
                                                                                                                       'P008 - PATIËNT MET DEFIBRILLATOR OF PACEMAKER', 
                                                                                                                       'P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST)'])]
        interventions_lat_long = filtered_interventions[['Latitude intervention', 'Longitude intervention']]
        interventionsbxl1_lat_long = filtered_interventions_bxl1[['latitude_intervention', 'longitude_intervention']]
        interventionsbxl2_lat_long = filtered_interventions_bxl2[['Latitude intervention', 'Longitude intervention']]
        cad9_lat_long = filtered_cad9[['Latitude intervention', 'Longitude intervention']]
        lat_long_df = [
            interventions_lat_long.rename(columns={'Latitude intervention': 'latitude', 'Longitude intervention': 'longitude'}),
            interventionsbxl1_lat_long.rename(columns={'latitude_intervention': 'latitude', 'longitude_intervention': 'longitude'}),
            interventionsbxl2_lat_long.rename(columns={'Latitude intervention': 'latitude', 'Longitude intervention': 'longitude'}),
            cad9_lat_long.rename(columns={'Latitude intervention': 'latitude', 'Longitude intervention': 'longitude'})
        ]
        combined_int_lat_long = pd.concat(lat_long_df, ignore_index=True).dropna()
        return combined_int_lat_long

In [25]:
# Custom transformer for DBSCAN clustering
class DBSCANClustering(BaseEstimator, TransformerMixin):
    def __init__(self, eps=0.1, min_samples=5):
        self.eps = eps / 6371.0088  # Convert kilometers to radians
        self.min_samples = min_samples

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        coords = np.radians(X[['latitude', 'longitude']].values)
        db = DBSCAN(eps=self.eps, min_samples=self.min_samples, algorithm='ball_tree', metric='haversine').fit(coords)
        X['cluster'] = db.labels_
        cardiac_arrests_gdf = gpd.GeoDataFrame(X, geometry=gpd.points_from_xy(X.longitude, X.latitude))
        cluster_centroids = cardiac_arrests_gdf.groupby('cluster').geometry.apply(lambda x: x.unary_union.centroid if x.unary_union else None)
        return cluster_centroids

In [26]:
# Custom transformer for finding proposed AED locations
class ProposedAEDLocations(BaseEstimator, TransformerMixin):
    def __init__(self, aed_gdf, min_distance_threshold=0.1):
        self.aed_gdf = aed_gdf
        self.min_distance_threshold = min_distance_threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        proposed_aed_locations = GeoSeries(X, crs='EPSG:4326')
        def distance_to_nearest_aed(proposed_location, existing_aeds):
            nearest_aed_point = nearest_points(proposed_location, existing_aeds.unary_union)[1]
            return proposed_location.distance(nearest_aed_point)
        distances_to_nearest_aed = proposed_aed_locations.apply(lambda x: distance_to_nearest_aed(x, self.aed_gdf.geometry))
        proposed_aed_locations_df = proposed_aed_locations.to_frame(name='geometry')
        proposed_aed_locations_df['distance_to_nearest_aed_km'] = distances_to_nearest_aed * 100
        adequate_coverage_df = proposed_aed_locations_df[proposed_aed_locations_df['distance_to_nearest_aed_km'] > self.min_distance_threshold]
        return adequate_coverage_df

In [27]:
# Read initial data
data = read_data(FILE_PATHS)
aed_df = data['aed']

# Preprocess AED data and geocode addresses
aed_preprocessor = AEDPreprocessor()
geocoder = Geocoder(api_key=GOOGLE_MAPS_API_KEY)
aed_df = aed_preprocessor.fit_transform(aed_df)
aed_df = geocoder.fit_transform(aed_df)

# Convert AED DataFrame to GeoDataFrame
aed_gdf = gpd.GeoDataFrame(aed_df, geometry=gpd.points_from_xy(aed_df.longitude, aed_df.latitude))

In [28]:
# Define and run the pipeline
pipeline = Pipeline([
    ('combine_and_filter_interventions', CombineAndFilterInterventions(data)),
    ('dbscan_clustering', DBSCANClustering()),
    ('proposed_aed_locations', ProposedAEDLocations(aed_gdf))
])

In [29]:
def main():
    adequate_coverage_df = pipeline.fit_transform(aed_df)
    # Calculate total cost for the first year
    total_cost_first_year = (COST_PER_AED + INSTALLATION_COST_PER_AED + ANNUAL_MAINTENANCE_COST_PER_AED) * len(adequate_coverage_df)
    print(f'Total cost for the first year: {total_cost_first_year}')
    # Save new AED locations to Excel
    new_aed_df = pd.DataFrame({
        'longitude': adequate_coverage_df.geometry.x,
        'latitude': adequate_coverage_df.geometry.y
    })
    new_aed_df.to_excel(os.path.expanduser('~/Documents/new_aed_locations_pipeline.xlsx'), index=False)
    # Visualize proposed AED locations on a map
    m = visualize_proposed_locations(aed_gdf, None, adequate_coverage_df)
    m.save(os.path.expanduser('~/Documents/proposed_aed_locations_map.html'))

In [30]:
def visualize_proposed_locations(aed_gdf, cardiac_arrests_gdf, adequate_coverage_df):
    m = folium.Map(location=[50.8503, 4.3517], zoom_start=8)
    for idx, row in adequate_coverage_df.iterrows():
        folium.Marker([row['geometry'].y, row['geometry'].x], icon=folium.Icon(color="green")).add_to(m)
    return m

In [31]:
if __name__ == "__main__":
    main()

Total cost for the first year: 1898000
