In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
import folium
from branca.colormap import linear

# Configuration - Define proximity threshold
NEARBY_RADIUS_KM = 10  # Consider facilities within 10 km as "nearby"

# Step 1: Load Data
# Justification: Loading the datasets is the first step to access the information required for analysis.
# Explanation: Reads the CSV files into pandas DataFrames for manipulation.
print("Step 1: Loading data...")
try:
    imd_df = pd.read_csv('IMD_2019.csv')
    hospitals_df = pd.read_csv('Hospital.csv')
    lsoa_boundaries_df = pd.read_csv('Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BSC_V4_3901388190129020682.csv')
    nihr_projects_df = pd.read_csv('nihr-infrastructure-supported-projects.csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure all CSV files are in the same directory.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")
    exit()

# Step 2: Prepare IMD and LSOA Geographic Data
# Justification: Merging IMD data with LSOA geographical coordinates provides a complete picture for each LSOA.
# Explanation: Selects relevant columns from IMD and LSOA boundaries, renames columns for consistency, and merges them on the LSOA code.
print("Step 2: Preparing IMD and LSOA geographic data...")
# Select and rename columns from IMD data, using their exact names including the suffix
imd_data = imd_df[['lsoa11cd,C,80', 'lsoa11nm,C,80', 'IMD_Decile,N,10,0', 'TotPop,N,10,0']].copy()
imd_data.rename(columns={
    'lsoa11cd,C,80': 'LSOA_Code',
    'lsoa11nm,C,80': 'LSOA_Name',
    'IMD_Decile,N,10,0': 'IMD_Decile',
    'TotPop,N,10,0': 'TotPop'
}, inplace=True)

# Select and rename columns from LSOA boundaries data
lsoa_geo_data = lsoa_boundaries_df[['LSOA21CD', 'LAT', 'LONG']].copy()
lsoa_geo_data.rename(columns={'LSOA21CD': 'LSOA_Code'}, inplace=True)

# Merge IMD data with LSOA geographic data
# Note: Assuming 'lsoa11cd' and 'LSOA21CD' are compatible for merging.
lsoa_combined_df = pd.merge(imd_data, lsoa_geo_data, on='LSOA_Code', how='inner')

# Ensure population column is numeric and handle potential missing values
lsoa_combined_df['TotPop'] = pd.to_numeric(lsoa_combined_df['TotPop'], errors='coerce')
# Updated line to avoid FutureWarning
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].fillna(lsoa_combined_df['TotPop'].median())
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].astype(int) # Convert to integer

# Drop rows where Latitude or Longitude might be missing after the merge
lsoa_combined_df.dropna(subset=['LAT', 'LONG'], inplace=True)

print(f"Combined LSOA data shape: {lsoa_combined_df.shape}")

# Step 3: Prepare Hospital Data
# Justification: Extracts necessary hospital information, especially location, for distance calculations.
# Explanation: Selects hospital name, postcode, latitude, and longitude. Ensures coordinates are numeric.
print("Step 3: Preparing hospital data...")
hospitals_data = hospitals_df[['OrganisationName', 'Postcode', 'Latitude', 'Longitude']].copy()
hospitals_data['Latitude'] = pd.to_numeric(hospitals_data['Latitude'], errors='coerce')
hospitals_data['Longitude'] = pd.to_numeric(hospitals_data['Longitude'], errors='coerce')
hospitals_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove hospitals without valid coordinates
print(f"Prepared hospital data shape: {hospitals_data.shape}")

# Step 4: Prepare NIHR Projects Data
# Justification: Extracts necessary NIHR project information for identifying existing research infrastructure.
# Explanation: Selects centre name, postcode, latitude, longitude, and research theme. Ensures coordinates are numeric.
print("Step 4: Preparing NIHR projects data...")
nihr_projects_data = nihr_projects_df[['Centre', 'Centre Postcode', 'Latitude', 'Longitude', 'Research Theme']].copy()
nihr_projects_data.rename(columns={'Centre Postcode': 'Postcode'}, inplace=True)
nihr_projects_data['Latitude'] = pd.to_numeric(nihr_projects_data['Latitude'], errors='coerce')
nihr_projects_data['Longitude'] = pd.to_numeric(nihr_projects_data['Longitude'], errors='coerce')
nihr_projects_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove projects without valid coordinates
print(f"Prepared NIHR projects data shape: {nihr_projects_data.shape}")

# Step 5: Define Haversine Distance Function
# Justification: This function accurately calculates the shortest distance between two points on the surface of a sphere (Earth).
# Explanation: Converts latitudes and longitudes to radians and applies the Haversine formula to compute distance in kilometers.
print("Step 5: Defining Haversine distance function...")
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of Earth in kilometers

    # Convert degrees to radians
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = map(np.radians, [lat1, lon1, lat2, lon2])

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))

    distance = R * c
    return distance

print("Haversine distance function defined.")

# Step 6: Calculate Proximity to Hospitals and NIHR Projects for each LSOA (OPTIMIZED)
# Justification: Vectorizing distance calculations with NumPy is significantly faster than Python loops for large datasets.
# Explanation: Converts coordinates to NumPy arrays and uses broadcasting to calculate all pairwise distances,
# then efficiently finds the minimum distance for each LSOA to hospitals and NIHR projects.
print("Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...")

# Convert LSOA coordinates to NumPy arrays
lsoa_lats = lsoa_combined_df['LAT'].to_numpy()
lsoa_lons = lsoa_combined_df['LONG'].to_numpy()

# Convert Hospital coordinates to NumPy arrays
hospital_lats = hospitals_data['Latitude'].to_numpy()
hospital_lons = hospitals_data['Longitude'].to_numpy()

# Convert NIHR Project coordinates to NumPy arrays
nihr_lats = nihr_projects_data['Latitude'].to_numpy()
nihr_lons = nihr_projects_data['Longitude'].to_numpy()

# Calculate distances to Hospitals
if hospital_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    lsoa_lats_reshaped = lsoa_lats[:, np.newaxis]
    lsoa_lons_reshaped = lsoa_lons[:, np.newaxis]

    # Calculate all pairwise distances between LSOAs and hospitals
    all_distances_to_hospitals = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        hospital_lats, hospital_lons
    )
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.min(all_distances_to_hospitals, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.nan # No hospitals or LSOAs to calculate distance to

# Calculate distances to NIHR Projects
if nihr_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    # Reusing lsoa_lats_reshaped and lsoa_lons_reshaped from above
    all_distances_to_nihr = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        nihr_lats, nihr_lons
    )
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.min(all_distances_to_nihr, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.nan # No NIHR projects or LSOAs to calculate distance to

print("Proximity calculations complete.")

# Step 7: Calculate a Composite Score for Research Site Allocation
# Justification: A composite score provides a single metric to rank LSOAs based on multiple relevant factors.
# Explanation: Combines IMD decile (lower is more deprived), proximity to hospitals, and proximity to NIHR projects.
# A higher score indicates a more suitable research site.
print("Step 7: Calculating composite score...")

# Assign points based on IMD Decile (1 to 10, 1 being most deprived, so higher points for lower deciles)
# We multiply by TotPop to prioritize areas with more people in deprivation.
lsoa_combined_df['IMD_Score_Weighted'] = (11 - lsoa_combined_df['IMD_Decile']) * lsoa_combined_df['TotPop']

# Assign points based on proximity to hospitals (higher points for closer proximity)
# Capped at NEARBY_RADIUS_KM to ensure only truly "nearby" locations contribute significantly.
lsoa_combined_df['Hospital_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_Hospital_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_Hospital_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['Hospital_Proximity_Score'].fillna(0, inplace=True) # Handle cases where no hospital was found

# Assign points based on proximity to NIHR projects (higher points for closer proximity)
lsoa_combined_df['NIHR_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_NIHR_Project_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['NIHR_Proximity_Score'].fillna(0, inplace=True) # Handle cases where no NIHR project was found

# Combine scores (you can adjust weights if certain factors are more important)
# Example weights: IMD (50%), Hospitals (25%), NIHR (25%)
lsoa_combined_df['Total_Research_Site_Score'] = (
    lsoa_combined_df['IMD_Score_Weighted'] * 0.5 +
    lsoa_combined_df['Hospital_Proximity_Score'] * 0.25 +
    lsoa_combined_df['NIHR_Proximity_Score'] * 0.25
)
print("Composite score calculation complete.")

# Step 8: Rank and Display Top Research Sites
# Justification: Presenting the top-ranked LSOAs clearly identifies the most suitable locations based on the defined criteria.
# Explanation: Sorts the LSOAs by their total score in descending order and displays the top 10.
print("Step 8: Ranking and displaying top research sites...")
top_research_sites = lsoa_combined_df.sort_values(by='Total_Research_Site_Score', ascending=False)
print("\nTop Potential Research Sites (Ranked by Score):")
print(top_research_sites[['LSOA_Name', 'IMD_Decile', 'TotPop', 'Min_Dist_To_Hospital_km', 'Min_Dist_To_NIHR_Project_km', 'Total_Research_Site_Score']].head(10).to_markdown(index=False))

# Step 9: Visualize Results on an Interactive Map
# Justification: A map provides an intuitive visual representation of the LSOAs, hospitals, and NIHR projects,
# allowing for easy interpretation of the spatial distribution of potential research sites.
# Explanation: Uses Folium to create an HTML map with colored LSOA markers (based on score), and markers for hospitals and NIHR projects.
print("\nStep 9: Generating interactive map visualization...")

# Determine map center (average of LSOA coordinates)
map_center_lat = lsoa_combined_df['LAT'].mean()
map_center_lon = lsoa_combined_df['LONG'].mean()

m = folium.Map(location=[map_center_lat, map_center_lon], zoom_start=7) # Adjusted zoom for better overview of UK

# Create a colormap for LSOA scores
max_score = lsoa_combined_df['Total_Research_Site_Score'].max()
min_score = lsoa_combined_df['Total_Research_Site_Score'].min()
colormap = linear.YlOrRd_09.scale(vmin=min_score, vmax=max_score)

# Add LSOA markers, colored by Total_Research_Site_Score
lsoa_layer = folium.FeatureGroup(name='LSOAs by Research Site Suitability Score')
# Limit to a reasonable number of LSOAs for performance on map, e.g., top 5000 by score or random sample
# For the purpose of demonstrating the map, let's take a sample or limit the count to prevent very large HTML files
# For accurate representation, ideally all would be plotted, but this can make map files very large.
# Let's plot all LSOAs for now, as the core issue was distance calculation, not plotting speed.
for idx, row in lsoa_combined_df.iterrows():
    if pd.notna(row['LAT']) and pd.notna(row['LONG']):
        folium.CircleMarker(
            location=[row['LAT'], row['LONG']],
            radius=2, # Smaller radius for dense LSOA distribution
            color=colormap(row['Total_Research_Site_Score']),
            fill=True,
            fill_color=colormap(row['Total_Research_Site_Score']),
            fill_opacity=0.7,
            tooltip=f"LSOA: {row['LSOA_Name']}<br>IMD Decile: {row['IMD_Decile']}<br>Population: {row['TotPop']}<br>Score: {row['Total_Research_Site_Score']:.2f}<br>Dist to Hosp: {row['Min_Dist_To_Hospital_km']:.1f} km<br>Dist to NIHR: {row['Min_Dist_To_NIHR_Project_km']:.1f} km"
        ).add_to(lsoa_layer)
lsoa_layer.add_to(m)

# Add Hospital markers
hospital_layer = folium.FeatureGroup(name='Hospitals')
for idx, row in hospitals_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"Hospital: {row['OrganisationName']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='blue', icon='hospital', prefix='fa')
        ).add_to(hospital_layer)
hospital_layer.add_to(m)

# Add NIHR Project markers
nihr_layer = folium.FeatureGroup(name='Existing NIHR Projects')
for idx, row in nihr_projects_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"NIHR Project: {row['Centre']}<br>Theme: {row['Research Theme']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='green', icon='flask', prefix='fa')
        ).add_to(nihr_layer)
nihr_layer.add_to(m)

# Add Layer Control to toggle layers
folium.LayerControl().add_to(m)

# Add colormap to the map for reference
colormap.add_to(m)
colormap.caption = 'LSOA Research Site Suitability Score (Higher is Better)'

# Save the map to an HTML file
map_output_path = 'optimized_research_site_allocation_map.html'
m.save(map_output_path)
print(f"\nInteractive map saved to '{map_output_path}'")
print("\nOptimization tool execution complete.")

Step 1: Loading data...
Data loaded successfully.
Step 2: Preparing IMD and LSOA geographic data...
Combined LSOA data shape: (31810, 6)
Step 3: Preparing hospital data...
Prepared hospital data shape: (1209, 4)
Step 4: Preparing NIHR projects data...
Prepared NIHR projects data shape: (27345, 5)
Step 5: Defining Haversine distance function...
Haversine distance function defined.
Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...
Proximity calculations complete.
Step 7: Calculating composite score...
Composite score calculation complete.
Step 8: Ranking and displaying top research sites...

Top Potential Research Sites (Ranked by Score):
| LSOA_Name       |   IMD_Decile |   TotPop |   Min_Dist_To_Hospital_km |   Min_Dist_To_NIHR_Project_km |   Total_Research_Site_Score |
|:----------------|-------------:|---------:|--------------------------:|------------------------------:|----------------------------:|
| Oxford 008A     |            7 |     8888

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  lsoa_combined_df['Hospital_Proximity_Score'].fillna(0, inplace=True) # Handle cases where no hospital was found
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  lsoa_combined_df['NIHR_Proximity_Score'].fillna(0, inplace=True) # Handle cases where no NIHR project was found



Interactive map saved to 'optimized_research_site_allocation_map.html'

Optimization tool execution complete.


Great! The code is now running successfully. The output looks correct - you can see it has:

✅ Successfully loaded all data (31,810 LSOAs, 1,209 hospitals, 27,345 NIHR projects)

✅ Calculated proximity scores for all locations

✅ Generated the top 10 research sites with their scores and distances

✅ Created an interactive map saved as HTML

The results show promising locations like Oxford, Liverpool, Birmingham, and Manchester - areas that typically have good healthcare infrastructure and research facilities.
However, there are two minor warnings you should fix for cleaner code:

In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
import folium
from branca.colormap import linear

# Configuration - Define proximity threshold
NEARBY_RADIUS_KM = 10  # Consider facilities within 10 km as "nearby"

# Step 1: Load Data
# Justification: Loading the datasets is the first step to access the information required for analysis.
# Explanation: Reads the CSV files into pandas DataFrames for manipulation.
print("Step 1: Loading data...")
try:
    imd_df = pd.read_csv('IMD_2019.csv')
    hospitals_df = pd.read_csv('Hospital.csv')
    lsoa_boundaries_df = pd.read_csv('Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BSC_V4_3901388190129020682.csv')
    nihr_projects_df = pd.read_csv('nihr-infrastructure-supported-projects.csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure all CSV files are in the same directory.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")
    exit()

# Step 2: Prepare IMD and LSOA Geographic Data
# Justification: Merging IMD data with LSOA geographical coordinates provides a complete picture for each LSOA.
# Explanation: Selects relevant columns from IMD and LSOA boundaries, renames columns for consistency, and merges them on the LSOA code.
print("Step 2: Preparing IMD and LSOA geographic data...")
# Select and rename columns from IMD data, using their exact names including the suffix
imd_data = imd_df[['lsoa11cd,C,80', 'lsoa11nm,C,80', 'IMD_Decile,N,10,0', 'TotPop,N,10,0']].copy()
imd_data.rename(columns={
    'lsoa11cd,C,80': 'LSOA_Code',
    'lsoa11nm,C,80': 'LSOA_Name',
    'IMD_Decile,N,10,0': 'IMD_Decile',
    'TotPop,N,10,0': 'TotPop'
}, inplace=True)

# Select and rename columns from LSOA boundaries data
lsoa_geo_data = lsoa_boundaries_df[['LSOA21CD', 'LAT', 'LONG']].copy()
lsoa_geo_data.rename(columns={'LSOA21CD': 'LSOA_Code'}, inplace=True)

# Merge IMD data with LSOA geographic data
# Note: Assuming 'lsoa11cd' and 'LSOA21CD' are compatible for merging.
lsoa_combined_df = pd.merge(imd_data, lsoa_geo_data, on='LSOA_Code', how='inner')

# Ensure population column is numeric and handle potential missing values
lsoa_combined_df['TotPop'] = pd.to_numeric(lsoa_combined_df['TotPop'], errors='coerce')
# Updated line to avoid FutureWarning
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].fillna(lsoa_combined_df['TotPop'].median())
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].astype(int) # Convert to integer

# Drop rows where Latitude or Longitude might be missing after the merge
lsoa_combined_df.dropna(subset=['LAT', 'LONG'], inplace=True)

print(f"Combined LSOA data shape: {lsoa_combined_df.shape}")

# Step 3: Prepare Hospital Data
# Justification: Extracts necessary hospital information, especially location, for distance calculations.
# Explanation: Selects hospital name, postcode, latitude, and longitude. Ensures coordinates are numeric.
print("Step 3: Preparing hospital data...")
hospitals_data = hospitals_df[['OrganisationName', 'Postcode', 'Latitude', 'Longitude']].copy()
hospitals_data['Latitude'] = pd.to_numeric(hospitals_data['Latitude'], errors='coerce')
hospitals_data['Longitude'] = pd.to_numeric(hospitals_data['Longitude'], errors='coerce')
hospitals_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove hospitals without valid coordinates
print(f"Prepared hospital data shape: {hospitals_data.shape}")

# Step 4: Prepare NIHR Projects Data
# Justification: Extracts necessary NIHR project information for identifying existing research infrastructure.
# Explanation: Selects centre name, postcode, latitude, longitude, and research theme. Ensures coordinates are numeric.
print("Step 4: Preparing NIHR projects data...")
nihr_projects_data = nihr_projects_df[['Centre', 'Centre Postcode', 'Latitude', 'Longitude', 'Research Theme']].copy()
nihr_projects_data.rename(columns={'Centre Postcode': 'Postcode'}, inplace=True)
nihr_projects_data['Latitude'] = pd.to_numeric(nihr_projects_data['Latitude'], errors='coerce')
nihr_projects_data['Longitude'] = pd.to_numeric(nihr_projects_data['Longitude'], errors='coerce')
nihr_projects_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove projects without valid coordinates
print(f"Prepared NIHR projects data shape: {nihr_projects_data.shape}")

# Step 5: Define Haversine Distance Function
# Justification: This function accurately calculates the shortest distance between two points on the surface of a sphere (Earth).
# Explanation: Converts latitudes and longitudes to radians and applies the Haversine formula to compute distance in kilometers.
print("Step 5: Defining Haversine distance function...")
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of Earth in kilometers

    # Convert degrees to radians
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = map(np.radians, [lat1, lon1, lat2, lon2])

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))

    distance = R * c
    return distance

print("Haversine distance function defined.")

# Step 6: Calculate Proximity to Hospitals and NIHR Projects for each LSOA (OPTIMIZED)
# Justification: Vectorizing distance calculations with NumPy is significantly faster than Python loops for large datasets.
# Explanation: Converts coordinates to NumPy arrays and uses broadcasting to calculate all pairwise distances,
# then efficiently finds the minimum distance for each LSOA to hospitals and NIHR projects.
print("Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...")

# Convert LSOA coordinates to NumPy arrays
lsoa_lats = lsoa_combined_df['LAT'].to_numpy()
lsoa_lons = lsoa_combined_df['LONG'].to_numpy()

# Convert Hospital coordinates to NumPy arrays
hospital_lats = hospitals_data['Latitude'].to_numpy()
hospital_lons = hospitals_data['Longitude'].to_numpy()

# Convert NIHR Project coordinates to NumPy arrays
nihr_lats = nihr_projects_data['Latitude'].to_numpy()
nihr_lons = nihr_projects_data['Longitude'].to_numpy()

# Calculate distances to Hospitals
if hospital_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    lsoa_lats_reshaped = lsoa_lats[:, np.newaxis]
    lsoa_lons_reshaped = lsoa_lons[:, np.newaxis]

    # Calculate all pairwise distances between LSOAs and hospitals
    all_distances_to_hospitals = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        hospital_lats, hospital_lons
    )
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.min(all_distances_to_hospitals, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.nan # No hospitals or LSOAs to calculate distance to

# Calculate distances to NIHR Projects
if nihr_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    # Reusing lsoa_lats_reshaped and lsoa_lons_reshaped from above
    all_distances_to_nihr = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        nihr_lats, nihr_lons
    )
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.min(all_distances_to_nihr, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.nan # No NIHR projects or LSOAs to calculate distance to

print("Proximity calculations complete.")

# Step 7: Calculate a Composite Score for Research Site Allocation
# Justification: A composite score provides a single metric to rank LSOAs based on multiple relevant factors.
# Explanation: Combines IMD decile (lower is more deprived), proximity to hospitals, and proximity to NIHR projects.
# A higher score indicates a more suitable research site.
print("Step 7: Calculating composite score...")

# Assign points based on IMD Decile (1 to 10, 1 being most deprived, so higher points for lower deciles)
# We multiply by TotPop to prioritize areas with more people in deprivation.
lsoa_combined_df['IMD_Score_Weighted'] = (11 - lsoa_combined_df['IMD_Decile']) * lsoa_combined_df['TotPop']

# Assign points based on proximity to hospitals (higher points for closer proximity)
# Capped at NEARBY_RADIUS_KM to ensure only truly "nearby" locations contribute significantly.
lsoa_combined_df['Hospital_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_Hospital_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_Hospital_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['Hospital_Proximity_Score'] = lsoa_combined_df['Hospital_Proximity_Score'].fillna(0) # Handle cases where no hospital was found

# Assign points based on proximity to NIHR projects (higher points for closer proximity)
lsoa_combined_df['NIHR_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_NIHR_Project_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['NIHR_Proximity_Score'] = lsoa_combined_df['NIHR_Proximity_Score'].fillna(0) # Handle cases where no NIHR project was found

# Combine scores (you can adjust weights if certain factors are more important)
# Example weights: IMD (50%), Hospitals (25%), NIHR (25%)
lsoa_combined_df['Total_Research_Site_Score'] = (
    lsoa_combined_df['IMD_Score_Weighted'] * 0.5 +
    lsoa_combined_df['Hospital_Proximity_Score'] * 0.25 +
    lsoa_combined_df['NIHR_Proximity_Score'] * 0.25
)
print("Composite score calculation complete.")

# Step 8: Rank and Display Top Research Sites
# Justification: Presenting the top-ranked LSOAs clearly identifies the most suitable locations based on the defined criteria.
# Explanation: Sorts the LSOAs by their total score in descending order and displays the top 10.
print("Step 8: Ranking and displaying top research sites...")
top_research_sites = lsoa_combined_df.sort_values(by='Total_Research_Site_Score', ascending=False)
print("\nTop Potential Research Sites (Ranked by Score):")
print(top_research_sites[['LSOA_Name', 'IMD_Decile', 'TotPop', 'Min_Dist_To_Hospital_km', 'Min_Dist_To_NIHR_Project_km', 'Total_Research_Site_Score']].head(10).to_markdown(index=False))

# Step 9: Visualize Results on an Interactive Map
# Justification: A map provides an intuitive visual representation of the LSOAs, hospitals, and NIHR projects,
# allowing for easy interpretation of the spatial distribution of potential research sites.
# Explanation: Uses Folium to create an HTML map with colored LSOA markers (based on score), and markers for hospitals and NIHR projects.
print("\nStep 9: Generating interactive map visualization...")

# Determine map center (average of LSOA coordinates)
map_center_lat = lsoa_combined_df['LAT'].mean()
map_center_lon = lsoa_combined_df['LONG'].mean()

m = folium.Map(location=[map_center_lat, map_center_lon], zoom_start=7) # Adjusted zoom for better overview of UK

# Create a colormap for LSOA scores
max_score = lsoa_combined_df['Total_Research_Site_Score'].max()
min_score = lsoa_combined_df['Total_Research_Site_Score'].min()
colormap = linear.YlOrRd_09.scale(vmin=min_score, vmax=max_score)

# Add LSOA markers, colored by Total_Research_Site_Score
lsoa_layer = folium.FeatureGroup(name='LSOAs by Research Site Suitability Score')
# Limit to a reasonable number of LSOAs for performance on map, e.g., top 5000 by score or random sample
# For the purpose of demonstrating the map, let's take a sample or limit the count to prevent very large HTML files
# For accurate representation, ideally all would be plotted, but this can make map files very large.
# Let's plot all LSOAs for now, as the core issue was distance calculation, not plotting speed.
for idx, row in lsoa_combined_df.iterrows():
    if pd.notna(row['LAT']) and pd.notna(row['LONG']):
        folium.CircleMarker(
            location=[row['LAT'], row['LONG']],
            radius=2, # Smaller radius for dense LSOA distribution
            color=colormap(row['Total_Research_Site_Score']),
            fill=True,
            fill_color=colormap(row['Total_Research_Site_Score']),
            fill_opacity=0.7,
            tooltip=f"LSOA: {row['LSOA_Name']}<br>IMD Decile: {row['IMD_Decile']}<br>Population: {row['TotPop']}<br>Score: {row['Total_Research_Site_Score']:.2f}<br>Dist to Hosp: {row['Min_Dist_To_Hospital_km']:.1f} km<br>Dist to NIHR: {row['Min_Dist_To_NIHR_Project_km']:.1f} km"
        ).add_to(lsoa_layer)
lsoa_layer.add_to(m)

# Add Hospital markers
hospital_layer = folium.FeatureGroup(name='Hospitals')
for idx, row in hospitals_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"Hospital: {row['OrganisationName']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='blue', icon='hospital', prefix='fa')
        ).add_to(hospital_layer)
hospital_layer.add_to(m)

# Add NIHR Project markers
nihr_layer = folium.FeatureGroup(name='Existing NIHR Projects')
for idx, row in nihr_projects_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"NIHR Project: {row['Centre']}<br>Theme: {row['Research Theme']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='green', icon='flask', prefix='fa')
        ).add_to(nihr_layer)
nihr_layer.add_to(m)

# Add Layer Control to toggle layers
folium.LayerControl().add_to(m)

# Add colormap to the map for reference
colormap.add_to(m)
colormap.caption = 'LSOA Research Site Suitability Score (Higher is Better)'

# Save the map to an HTML file
map_output_path = 'optimized_research_site_allocation_map.html'
m.save(map_output_path)
print(f"\nInteractive map saved to '{map_output_path}'")
print("\nOptimization tool execution complete.")

Step 1: Loading data...
Data loaded successfully.
Step 2: Preparing IMD and LSOA geographic data...
Combined LSOA data shape: (31810, 6)
Step 3: Preparing hospital data...
Prepared hospital data shape: (1209, 4)
Step 4: Preparing NIHR projects data...
Prepared NIHR projects data shape: (27345, 5)
Step 5: Defining Haversine distance function...
Haversine distance function defined.
Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...
Proximity calculations complete.
Step 7: Calculating composite score...
Composite score calculation complete.
Step 8: Ranking and displaying top research sites...

Top Potential Research Sites (Ranked by Score):
| LSOA_Name       |   IMD_Decile |   TotPop |   Min_Dist_To_Hospital_km |   Min_Dist_To_NIHR_Project_km |   Total_Research_Site_Score |
|:----------------|-------------:|---------:|--------------------------:|------------------------------:|----------------------------:|
| Oxford 008A     |            7 |     8888

Now the map will display directly in your Jupyter notebook output! Here's what the updated code does:

Imports display from IPython.display for inline notebook display
Still saves the HTML file for external use
Displays the map inline using display(m)

When you run this code in your Jupyter notebook, you'll see:

The same text output as before
Plus an interactive map displayed directly in the notebook cell output
You can zoom, pan, and click on markers directly in the notebook
The map will show:

LSOAs as colored circles (red = higher suitability score)
Hospitals as blue hospital icons
NIHR Projects as green flask icons
Layer controls to toggle different elements on/off
Color scale showing what the colors mean



This gives you the best of both worlds - immediate visualization in your notebook and a saved HTML file for sharing or presenting!

In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
import folium
from branca.colormap import linear
from IPython.display import display

# Configuration - Define proximity threshold
NEARBY_RADIUS_KM = 10  # Consider facilities within 10 km as "nearby"

# Step 1: Load Data
# Justification: Loading the datasets is the first step to access the information required for analysis.
# Explanation: Reads the CSV files into pandas DataFrames for manipulation.
print("Step 1: Loading data...")
try:
    imd_df = pd.read_csv('IMD_2019.csv')
    hospitals_df = pd.read_csv('Hospital.csv')
    lsoa_boundaries_df = pd.read_csv('Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BSC_V4_3901388190129020682.csv')
    nihr_projects_df = pd.read_csv('nihr-infrastructure-supported-projects.csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure all CSV files are in the same directory.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")
    exit()

# Step 2: Prepare IMD and LSOA Geographic Data
# Justification: Merging IMD data with LSOA geographical coordinates provides a complete picture for each LSOA.
# Explanation: Selects relevant columns from IMD and LSOA boundaries, renames columns for consistency, and merges them on the LSOA code.
print("Step 2: Preparing IMD and LSOA geographic data...")
# Select and rename columns from IMD data, using their exact names including the suffix
imd_data = imd_df[['lsoa11cd,C,80', 'lsoa11nm,C,80', 'IMD_Decile,N,10,0', 'TotPop,N,10,0']].copy()
imd_data.rename(columns={
    'lsoa11cd,C,80': 'LSOA_Code',
    'lsoa11nm,C,80': 'LSOA_Name',
    'IMD_Decile,N,10,0': 'IMD_Decile',
    'TotPop,N,10,0': 'TotPop'
}, inplace=True)

# Select and rename columns from LSOA boundaries data
lsoa_geo_data = lsoa_boundaries_df[['LSOA21CD', 'LAT', 'LONG']].copy()
lsoa_geo_data.rename(columns={'LSOA21CD': 'LSOA_Code'}, inplace=True)

# Merge IMD data with LSOA geographic data
# Note: Assuming 'lsoa11cd' and 'LSOA21CD' are compatible for merging.
lsoa_combined_df = pd.merge(imd_data, lsoa_geo_data, on='LSOA_Code', how='inner')

# Ensure population column is numeric and handle potential missing values
lsoa_combined_df['TotPop'] = pd.to_numeric(lsoa_combined_df['TotPop'], errors='coerce')
# Updated line to avoid FutureWarning
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].fillna(lsoa_combined_df['TotPop'].median())
lsoa_combined_df['TotPop'] = lsoa_combined_df['TotPop'].astype(int) # Convert to integer

# Drop rows where Latitude or Longitude might be missing after the merge
lsoa_combined_df.dropna(subset=['LAT', 'LONG'], inplace=True)

print(f"Combined LSOA data shape: {lsoa_combined_df.shape}")

# Step 3: Prepare Hospital Data
# Justification: Extracts necessary hospital information, especially location, for distance calculations.
# Explanation: Selects hospital name, postcode, latitude, and longitude. Ensures coordinates are numeric.
print("Step 3: Preparing hospital data...")
hospitals_data = hospitals_df[['OrganisationName', 'Postcode', 'Latitude', 'Longitude']].copy()
hospitals_data['Latitude'] = pd.to_numeric(hospitals_data['Latitude'], errors='coerce')
hospitals_data['Longitude'] = pd.to_numeric(hospitals_data['Longitude'], errors='coerce')
hospitals_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove hospitals without valid coordinates
print(f"Prepared hospital data shape: {hospitals_data.shape}")

# Step 4: Prepare NIHR Projects Data
# Justification: Extracts necessary NIHR project information for identifying existing research infrastructure.
# Explanation: Selects centre name, postcode, latitude, longitude, and research theme. Ensures coordinates are numeric.
print("Step 4: Preparing NIHR projects data...")
nihr_projects_data = nihr_projects_df[['Centre', 'Centre Postcode', 'Latitude', 'Longitude', 'Research Theme']].copy()
nihr_projects_data.rename(columns={'Centre Postcode': 'Postcode'}, inplace=True)
nihr_projects_data['Latitude'] = pd.to_numeric(nihr_projects_data['Latitude'], errors='coerce')
nihr_projects_data['Longitude'] = pd.to_numeric(nihr_projects_data['Longitude'], errors='coerce')
nihr_projects_data.dropna(subset=['Latitude', 'Longitude'], inplace=True) # Remove projects without valid coordinates
print(f"Prepared NIHR projects data shape: {nihr_projects_data.shape}")

# Step 5: Define Haversine Distance Function
# Justification: This function accurately calculates the shortest distance between two points on the surface of a sphere (Earth).
# Explanation: Converts latitudes and longitudes to radians and applies the Haversine formula to compute distance in kilometers.
print("Step 5: Defining Haversine distance function...")
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of Earth in kilometers

    # Convert degrees to radians
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = map(np.radians, [lat1, lon1, lat2, lon2])

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))

    distance = R * c
    return distance

print("Haversine distance function defined.")

# Step 6: Calculate Proximity to Hospitals and NIHR Projects for each LSOA (OPTIMIZED)
# Justification: Vectorizing distance calculations with NumPy is significantly faster than Python loops for large datasets.
# Explanation: Converts coordinates to NumPy arrays and uses broadcasting to calculate all pairwise distances,
# then efficiently finds the minimum distance for each LSOA to hospitals and NIHR projects.
print("Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...")

# Convert LSOA coordinates to NumPy arrays
lsoa_lats = lsoa_combined_df['LAT'].to_numpy()
lsoa_lons = lsoa_combined_df['LONG'].to_numpy()

# Convert Hospital coordinates to NumPy arrays
hospital_lats = hospitals_data['Latitude'].to_numpy()
hospital_lons = hospitals_data['Longitude'].to_numpy()

# Convert NIHR Project coordinates to NumPy arrays
nihr_lats = nihr_projects_data['Latitude'].to_numpy()
nihr_lons = nihr_projects_data['Longitude'].to_numpy()

# Calculate distances to Hospitals
if hospital_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    lsoa_lats_reshaped = lsoa_lats[:, np.newaxis]
    lsoa_lons_reshaped = lsoa_lons[:, np.newaxis]

    # Calculate all pairwise distances between LSOAs and hospitals
    all_distances_to_hospitals = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        hospital_lats, hospital_lons
    )
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.min(all_distances_to_hospitals, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_Hospital_km'] = np.nan # No hospitals or LSOAs to calculate distance to

# Calculate distances to NIHR Projects
if nihr_lats.size > 0 and lsoa_lats.size > 0:
    # Reshape LSOA arrays for broadcasting: (N_lsoa, 1)
    # Reusing lsoa_lats_reshaped and lsoa_lons_reshaped from above
    all_distances_to_nihr = haversine_distance(
        lsoa_lats_reshaped, lsoa_lons_reshaped,
        nihr_lats, nihr_lons
    )
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.min(all_distances_to_nihr, axis=1)
else:
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] = np.nan # No NIHR projects or LSOAs to calculate distance to

print("Proximity calculations complete.")

# Step 7: Calculate a Composite Score for Research Site Allocation
# Justification: A composite score provides a single metric to rank LSOAs based on multiple relevant factors.
# Explanation: Combines IMD decile (lower is more deprived), proximity to hospitals, and proximity to NIHR projects.
# A higher score indicates a more suitable research site.
print("Step 7: Calculating composite score...")

# Assign points based on IMD Decile (1 to 10, 1 being most deprived, so higher points for lower deciles)
# We multiply by TotPop to prioritize areas with more people in deprivation.
lsoa_combined_df['IMD_Score_Weighted'] = (11 - lsoa_combined_df['IMD_Decile']) * lsoa_combined_df['TotPop']

# Assign points based on proximity to hospitals (higher points for closer proximity)
# Capped at NEARBY_RADIUS_KM to ensure only truly "nearby" locations contribute significantly.
lsoa_combined_df['Hospital_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_Hospital_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_Hospital_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['Hospital_Proximity_Score'] = lsoa_combined_df['Hospital_Proximity_Score'].fillna(0) # Handle cases where no hospital was found

# Assign points based on proximity to NIHR projects (higher points for closer proximity)
lsoa_combined_df['NIHR_Proximity_Score'] = np.where(
    lsoa_combined_df['Min_Dist_To_NIHR_Project_km'] <= NEARBY_RADIUS_KM,
    (NEARBY_RADIUS_KM - lsoa_combined_df['Min_Dist_To_NIHR_Project_km']) / NEARBY_RADIUS_KM * 10, # Max 10 points
    0 # 0 points if outside radius
)
lsoa_combined_df['NIHR_Proximity_Score'] = lsoa_combined_df['NIHR_Proximity_Score'].fillna(0) # Handle cases where no NIHR project was found

# Combine scores (you can adjust weights if certain factors are more important)
# Example weights: IMD (50%), Hospitals (25%), NIHR (25%)
lsoa_combined_df['Total_Research_Site_Score'] = (
    lsoa_combined_df['IMD_Score_Weighted'] * 0.5 +
    lsoa_combined_df['Hospital_Proximity_Score'] * 0.25 +
    lsoa_combined_df['NIHR_Proximity_Score'] * 0.25
)
print("Composite score calculation complete.")

# Step 8: Rank and Display Top Research Sites
# Justification: Presenting the top-ranked LSOAs clearly identifies the most suitable locations based on the defined criteria.
# Explanation: Sorts the LSOAs by their total score in descending order and displays the top 10.
print("Step 8: Ranking and displaying top research sites...")
top_research_sites = lsoa_combined_df.sort_values(by='Total_Research_Site_Score', ascending=False)
print("\nTop Potential Research Sites (Ranked by Score):")
print(top_research_sites[['LSOA_Name', 'IMD_Decile', 'TotPop', 'Min_Dist_To_Hospital_km', 'Min_Dist_To_NIHR_Project_km', 'Total_Research_Site_Score']].head(10).to_markdown(index=False))

# Step 9: Visualize Results on an Interactive Map
# Justification: A map provides an intuitive visual representation of the LSOAs, hospitals, and NIHR projects,
# allowing for easy interpretation of the spatial distribution of potential research sites.
# Explanation: Uses Folium to create an HTML map with colored LSOA markers (based on score), and markers for hospitals and NIHR projects.
print("\nStep 9: Generating interactive map visualization...")

# Determine map center (average of LSOA coordinates)
map_center_lat = lsoa_combined_df['LAT'].mean()
map_center_lon = lsoa_combined_df['LONG'].mean()

m = folium.Map(location=[map_center_lat, map_center_lon], zoom_start=7) # Adjusted zoom for better overview of UK

# Create a colormap for LSOA scores
max_score = lsoa_combined_df['Total_Research_Site_Score'].max()
min_score = lsoa_combined_df['Total_Research_Site_Score'].min()
colormap = linear.YlOrRd_09.scale(vmin=min_score, vmax=max_score)

# Add LSOA markers, colored by Total_Research_Site_Score
lsoa_layer = folium.FeatureGroup(name='LSOAs by Research Site Suitability Score')
# Limit to a reasonable number of LSOAs for performance on map, e.g., top 5000 by score or random sample
# For the purpose of demonstrating the map, let's take a sample or limit the count to prevent very large HTML files
# For accurate representation, ideally all would be plotted, but this can make map files very large.
# Let's plot all LSOAs for now, as the core issue was distance calculation, not plotting speed.
for idx, row in lsoa_combined_df.iterrows():
    if pd.notna(row['LAT']) and pd.notna(row['LONG']):
        folium.CircleMarker(
            location=[row['LAT'], row['LONG']],
            radius=2, # Smaller radius for dense LSOA distribution
            color=colormap(row['Total_Research_Site_Score']),
            fill=True,
            fill_color=colormap(row['Total_Research_Site_Score']),
            fill_opacity=0.7,
            tooltip=f"LSOA: {row['LSOA_Name']}<br>IMD Decile: {row['IMD_Decile']}<br>Population: {row['TotPop']}<br>Score: {row['Total_Research_Site_Score']:.2f}<br>Dist to Hosp: {row['Min_Dist_To_Hospital_km']:.1f} km<br>Dist to NIHR: {row['Min_Dist_To_NIHR_Project_km']:.1f} km"
        ).add_to(lsoa_layer)
lsoa_layer.add_to(m)

# Add Hospital markers
hospital_layer = folium.FeatureGroup(name='Hospitals')
for idx, row in hospitals_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"Hospital: {row['OrganisationName']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='blue', icon='hospital', prefix='fa')
        ).add_to(hospital_layer)
hospital_layer.add_to(m)

# Add NIHR Project markers
nihr_layer = folium.FeatureGroup(name='Existing NIHR Projects')
for idx, row in nihr_projects_data.iterrows():
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"NIHR Project: {row['Centre']}<br>Theme: {row['Research Theme']}<br>Postcode: {row['Postcode']}",
            icon=folium.Icon(color='green', icon='flask', prefix='fa')
        ).add_to(nihr_layer)
nihr_layer.add_to(m)

# Add Layer Control to toggle layers
folium.LayerControl().add_to(m)

# Add colormap to the map for reference
colormap.add_to(m)
colormap.caption = 'LSOA Research Site Suitability Score (Higher is Better)'

# Save the map to an HTML file
map_output_path = 'optimized_research_site_allocation_map.html'
m.save(map_output_path)
print(f"\nInteractive map saved to '{map_output_path}'")

# Display the map inline in the Jupyter notebook
print("\nDisplaying interactive map in notebook...")
display(m)

print("\nOptimization tool execution complete.")

Step 1: Loading data...
Data loaded successfully.
Step 2: Preparing IMD and LSOA geographic data...
Combined LSOA data shape: (31810, 6)
Step 3: Preparing hospital data...
Prepared hospital data shape: (1209, 4)
Step 4: Preparing NIHR projects data...
Prepared NIHR projects data shape: (27345, 5)
Step 5: Defining Haversine distance function...
Haversine distance function defined.
Step 6: Calculating proximity to hospitals and NIHR projects for each LSOA (Optimized)...
Proximity calculations complete.
Step 7: Calculating composite score...
Composite score calculation complete.
Step 8: Ranking and displaying top research sites...

Top Potential Research Sites (Ranked by Score):
| LSOA_Name       |   IMD_Decile |   TotPop |   Min_Dist_To_Hospital_km |   Min_Dist_To_NIHR_Project_km |   Total_Research_Site_Score |
|:----------------|-------------:|---------:|--------------------------:|------------------------------:|----------------------------:|
| Oxford 008A     |            7 |     8888