# Data Processing
This notebook is used to process the data from the raw datasets to curated final datasets and then loaded into a Knowledge Graph.

## 1. Setup
### 1.1. Importing Libraries
The first step is to import the necessary libraries for data processing and define the paths to the raw datasets and intermediate saved datasets.

In [1]:

import pandas as pd
import os
from tqdm import tqdm

datadir = "../data/original"
savedir = "../data/processed"

### 1.2. Loading Data
The second step is to load the raw datasets into the notebook. Each of them is accessed and stored in a pandas DataFrame.

In [2]:
# Load all data
population_data = pd.read_csv(os.path.join(datadir, "ABS_Population_and_people_by_2021_SA2_(Beta).csv"))

data_distance_all = pd.read_pickle(os.path.join(datadir, "duration_sa2_hospitals.pkl"))
data_distance_shortest = pd.read_pickle(os.path.join(datadir, "duration_sa2_hospital_shortest.pkl"))

hospital_details = pd.read_csv(os.path.join(datadir, "myhospitals-contact-details.csv"), encoding="latin1")


## 2 Preprocessing datasets
### 2.1 Data cleaning

## 3 Merge datasets
### 3.1 Population Distribution and State Area level 2 (SA2) metadata
The SA2 metadata is created by calculating the population distribution for each SA2 area. The population distribution is calculated by dividing the population of each SA2 area by the total population of the state. The age data is binned into 5-year age groups. The metadata is then merged with the SA2 dataset.

In [3]:
# Take a look at the data
population_data.head()

Unnamed: 0,OBJECTID,SA2_CODE_2021,SA2_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021,ERP_P_202021,ERP_212021,ERP_M_202021,ERP_F_202021,ERP_192021,...,CENSUS_302021,CENSUS_312021,CENSUS_392021,CENSUS_332021,ADFS_22021,ADFS_32021,ADFS_42021,ADFS_52021,SHAPE_Length,SHAPE_Area
0,1,101021007,Braidwood,3418.3525,https://linked.data.gov.au/dataset/asgsed3/SA2...,4330.0,1.3,2248.0,2082.0,49.9,...,3.5,7.8,180.0,4.1,20.0,182.0,0.5,5.0,3.913695,0.339397
1,2,101021008,Karabar,6.9825,https://linked.data.gov.au/dataset/asgsed3/SA2...,8546.0,1223.9,4324.0,4222.0,36.3,...,5.2,5.7,1228.0,14.4,109.0,257.0,1.6,3.7,0.138642,0.000693
2,3,101021009,Queanbeyan,4.762,https://linked.data.gov.au/dataset/asgsed3/SA2...,11370.0,2387.7,5788.0,5582.0,36.3,...,14.2,4.9,2615.0,23.1,150.0,354.0,1.6,3.7,0.10715,0.000472
3,4,101021010,Queanbeyan - East,13.0032,https://linked.data.gov.au/dataset/asgsed3/SA2...,5093.0,391.7,2671.0,2422.0,36.2,...,11.9,4.5,1037.0,20.4,152.0,197.0,3.5,4.6,0.189549,0.00129
4,5,101021012,Queanbeyan West - Jerrabomberra,13.6748,https://linked.data.gov.au/dataset/asgsed3/SA2...,12743.0,931.9,6387.0,6356.0,36.6,...,4.1,3.0,1597.0,12.5,585.0,587.0,5.8,5.9,0.193368,0.001356


In [4]:
# Create columns for filtering
columns = [
    "SA2_CODE_2021",
    "SA2_NAME_2021",
    "AREA_ALBERS_SQKM"
]

# Filter the table
filtered_population_data = population_data[columns]

# Rename columns with mapping
mapping = {
    "SA2_CODE_2021": "SA2_5DIG16",
    "SA2_NAME_2021": "SA2_name",
    "AREA_ALBERS_SQKM": "area"
}

# Rename columns
SA2PopulationDistribution = filtered_population_data.rename(columns=mapping)
SA2PopulationDistribution.head()

Unnamed: 0,SA2_5DIG16,SA2_name,area
0,101021007,Braidwood,3418.3525
1,101021008,Karabar,6.9825
2,101021009,Queanbeyan,4.762
3,101021010,Queanbeyan - East,13.0032
4,101021012,Queanbeyan West - Jerrabomberra,13.6748


In [None]:
# Save the data as a csv file
file_path = os.path.join(savedir, "SA2PopulationDistribution.csv")

SA2PopulationDistribution.to_csv(file_path, index=False)

### 3.2 Distances Hospitals - SA2
The next dataset is the hospital distances to SA2 areas. The dataset is created by adding the closest hospital ID to each SA2 area in the `data_distance_shortest` dataset. 

In [5]:
# Take a look at the data
data_distance_all.head()

Unnamed: 0,SA2_5DIG16,time_to_1,time_to_2,time_to_3,time_to_4,time_to_5,time_to_6,time_to_7,time_to_8,time_to_9,...,time_to_1002,time_to_1003,time_to_1004,time_to_1005,time_to_1006,time_to_1007,time_to_1008,time_to_1009,time_to_1010,time_to_1011
0,11007,158980.478696,55545.678696,55606.278696,55190.178696,55395.378696,11772.694738,159285.978696,32900.469892,18879.961833,...,18921.478696,176771.378696,16621.333137,32815.309343,30294.871355,23233.794482,7661.486679,29297.294482,155286.478696,12762.078696
1,11008,155655.689234,52220.889234,52281.489234,51865.389234,52070.589234,12091.796855,155961.189234,29592.889234,15555.389234,...,15596.689234,173446.589234,16932.096855,29498.889234,27762.141889,19910.789234,4287.389234,25974.289234,151961.689234,9437.289234
2,11009,155495.195328,52060.395328,52120.995328,51704.895328,51910.095328,11943.307067,155800.695328,29432.395328,15394.895328,...,15436.195328,173286.095328,16783.607067,29338.395328,27918.731659,19750.295328,4126.895328,25813.795328,151801.195328,9276.795328
3,11010,155499.714501,52064.914501,52125.514501,51709.414501,51914.614501,11855.214501,155805.214501,29436.914501,15399.414501,...,15440.714501,173290.614501,16695.514501,29342.914501,28018.014647,19754.814501,4131.414501,25818.314501,151805.714501,9281.314501
4,11011,156375.773471,52940.973471,53001.573471,52585.473471,52790.673471,12260.254294,156681.273471,30312.68805,16275.473471,...,16316.773471,174166.673471,17100.554294,30218.973471,28761.886848,20630.873471,5017.88844,26694.373471,152681.773471,10157.373471


In [6]:
# Take a look at the data
data_distance_shortest.head()

Unnamed: 0,SA2_5DIG16,shortest_time_sec,shortest_time_min
0,11007,1143.589117,20.0
1,11008,374.559538,7.0
2,11009,202.83564,4.0
3,11010,272.384746,5.0
4,11011,1693.438621,29.0


In [15]:
# Initialize a list to store the results
hospital_IDs = []
hospital_distances = []

# Iterate over the values list
for index, row in tqdm(data_distance_all.iterrows()):
    # Remove SA2_5DIG16 from the row
    filtered_row = row.drop("SA2_5DIG16")
    
    # Remove NaN values
    filtered_row = filtered_row.dropna()
    
    if len(filtered_row) == 0:
        hospital_IDs.append("Not found")
        hospital_distances.append("Not found")
        continue
    
    closests_hospitals = []
    closests_distances = []
    for i in range(0, 5):
        # Find the lowest value in the row
        results = filtered_row.idxmin()
        closests_hospitals.append(results.split("_")[2])
        closests_distances.append(filtered_row[results])
        # Drop the lowest value
        filtered_row = filtered_row.drop(results)
        if len(filtered_row) == 0:
            break
        
    hospital_IDs.append(closests_hospitals)
    hospital_distances.append(closests_distances)

data_distance_shortest["closest_hospital_IDs"] = hospital_IDs
data_distance_shortest["closest_hospital_distances"] = hospital_distances

2310it [00:04, 526.45it/s]


Unnamed: 0,SA2_5DIG16,shortest_time_sec,shortest_time_min,closest_hospital_ID,closest_hospital_IDs,closest_hospital_distances
0,11007,1143.589117,20.0,"[99, 53, 695, 93, 332]","[99, 53, 695, 93, 332]","[1147.601232032854, 4303.012397330595, 4544.50..."
1,11008,374.559538,7.0,"[695, 102, 606, 864, 152]","[695, 102, 606, 864, 152]","[374.55953792185795, 1022.2892343050682, 1037...."
2,11009,202.835640,4.0,"[695, 102, 606, 864, 152]","[695, 102, 606, 864, 152]","[202.83563991523084, 871.2869529162444, 886.18..."
3,11010,272.384746,5.0,"[695, 102, 606, 864, 152]","[695, 102, 606, 864, 152]","[272.3847457627118, 1077.434400502197, 1092.33..."
4,11011,1693.438621,29.0,"[695, 102, 606, 864, 152]","[695, 102, 606, 864, 152]","[1759.8287599881621, 2328.1674341521157, 2343...."
...,...,...,...,...,...,...
2305,91002,3184.000000,54.0,"[288, 158, 645, 425, 722]","[288, 158, 645, 425, 722]","[3184.0, 18355.2, 19860.8, 25902.4, 27019.2]"
2306,91003,2301.064796,39.0,"[639, 755, 552, 918, 231]","[639, 755, 552, 918, 231]","[2301.0647959183675, 2489.2647959183673, 3135...."
2307,91004,,,Not found,Not found,Not found
2308,99499,,,Not found,Not found,Not found


In [42]:
# Drop shortest_time_minutes column
data_distance_shortest = data_distance_shortest.drop(columns=["shortest_time_min"])

# Rename SA2_5DIG16 to SA2_5DIG
data_distance_shortest = data_distance_shortest.rename(columns={"SA2_5DIG16": "SA2_5DIG"})

# Save the data as csv
file_path = os.path.join(savedir, "HospitalDistance.csv")
data_distance_shortest.to_csv(file_path, index=False)

### 3.3 Hospitals Metadata
The final dataset is the hospitals metadata. The dataset is created by renaming the columns of the `hospital_details` dataset.

In [43]:
# Inspect the data
hospital_details.head()

Unnamed: 0,Hospital name,Phone number,Street address,Suburb,Postcode,State,Local Hospital Network (LHN),Primary Health Network area (PHN),Website,Description,Sector,Beds,Latitude,Longitude,Hospital_ID
0,Abbotsford Private Hospital,08 9200 6282,61 Cambridge Street,West Leederville,6007,WA,,,www.abbotsfordhospital.com.au,,Private,<50,-31.940992,115.837344,1
1,Adelaide Clinic,08 8269 8100,33 Park Terrace,Gilberton,5081,SA,,,www.adelaideclinic.com.au/,,Private,50-99,-34.90658,138.613167,2
2,Adelaide Day Surgery Pty Ltd,08 8239 4900,18 North Terrace,Adelaide,5000,SA,,,http://www.curagroup.com.au/adelaide-day-surgery,,Private,<50,-34.922409,138.588817,3
3,Adelaide Eye & Laser Centre,08 8274 7000,215 Greenhill Road,Eastwood,5063,SA,,,www.aelc.com.au,,Private,,-34.94042,138.6215,4
4,Adelaide Surgicentre,08 8211 0000,89 King William Street,Kent Town,5067,SA,,,www.asec.net.au,,Private,<50,-34.917691,138.621022,5


In [45]:
# Create mapping
mapping = {
    "Hospital ID": "hospital_ID",
    "Hospital name": "hospital_name",
    "Phone number": "phone_number",
    "Street address": "address",
    "Suburb": "suburb",
    "Postcode": "postcode",
    "State": "state",
    "Local Hospital Network (LHN)": "local_hospital_network",
    "Primary Health Network (PHN)": "primary_health_network",
    "Website": "website",
    "Description": "description",
    "Sector": "sector",
    "Beds": "beds",
    "Lattitude": "latitude",
    "Longitude": "longitude"
}

# Apply mapping
HospitalMetadata = hospital_details.rename(columns=mapping)
HospitalMetadata.head()

Unnamed: 0,hospital_name,phone_number,address,suburb,postcode,state,local_hospital_network,Primary Health Network area (PHN),website,description,sector,beds,Latitude,longitude,Hospital_ID
0,Abbotsford Private Hospital,08 9200 6282,61 Cambridge Street,West Leederville,6007,WA,,,www.abbotsfordhospital.com.au,,Private,<50,-31.940992,115.837344,1
1,Adelaide Clinic,08 8269 8100,33 Park Terrace,Gilberton,5081,SA,,,www.adelaideclinic.com.au/,,Private,50-99,-34.90658,138.613167,2
2,Adelaide Day Surgery Pty Ltd,08 8239 4900,18 North Terrace,Adelaide,5000,SA,,,http://www.curagroup.com.au/adelaide-day-surgery,,Private,<50,-34.922409,138.588817,3
3,Adelaide Eye & Laser Centre,08 8274 7000,215 Greenhill Road,Eastwood,5063,SA,,,www.aelc.com.au,,Private,,-34.94042,138.6215,4
4,Adelaide Surgicentre,08 8211 0000,89 King William Street,Kent Town,5067,SA,,,www.asec.net.au,,Private,<50,-34.917691,138.621022,5


In [46]:
# Save the data as csv
file_path = os.path.join(savedir, "HospitalMetadata.csv")
HospitalMetadata.to_csv(file_path, index=False)

## 4 Knowledge Graph
### 4.1 Libraries and Functions

In [None]:
from GraphDB import GraphDB

URI = "bolt://localhost:7687"
USER = "neo4j"
PASSWORD = "healthcare"  # Change this to your Neo4j database password
DBNAME = "maingraph"
  
graph_handler = GraphDB(uri=URI, user=USER, pwd=PASSWORD, dbname=DBNAME)

### 4.2 Creating the Knowledge Graph
#### 4.2.1 Creating the Nodes
State areas (level 2)

In [None]:
file_path = os.path.join(savedir, "SA2PopulationDistribution.csv")
SA2PopulationDistribution = pd.read_csv(file_path)

# Use the GraphDB class to create nodes for all rows in the dataframe
for index, row in tqdm(SA2PopulationDistribution.iterrows()):
    graph_handler.create_sa2(
        sa2_5dig=row["SA2_5DIG16"], 
        sa2_name=row["SA2_name"], 
        area=row["area"])

Hospitals

In [None]:
file_path = os.path.join(savedir, "HospitalMetadata.csv")
HospitalMetadata = pd.read_csv(file_path)

#### 4.2.2 Creating the Edges

#### 4.2.3 Cleaning up

In [None]:
graph_handler.close()