# Creating Clusters for analysis
###### *NOTE*: All column names and file paths need to be changed to reflect your dataset
## First, loading libraries and data

In [None]:
#Loading necessary libraries
library(readxl)
library(writexl)
library(dplyr)
library(httr)
library(jsonlite)
library(sf)
library(igraph)
library(tidyr)
library(ggplot2)
library(viridis)

In [None]:
# Loading dataset to combine the formed groups - saved file of combined school
# data from DoE csv's.
# The error handler is passed to tryCatch() as its own `error =` argument
# (outside the braces) so that a failed read is actually caught and reported.
tryCatch({
  # This is the path to the dataset of the school and HP Index scores on your
  # local device
  df <- read_excel("C:\\")
  message("loaded")
}, error = function(e) {
  stop("Unable to load: ", conditionMessage(e), call. = FALSE)
})

### Part 1: Grouping based on matching Pobal Deprivation Index Scores

In [None]:
# Grouping the Pobal Index column into groups within 0.01, 0.05, 0.1, 0.2, 0.5, and 1.0 degrees of separation
# Change 'pobalGrouping' to reflect cluster names you wish to save
# Bin `indexDif` at several interval widths and plot the size of each bin.
#
# For every width in `grouping_values` the range from floor(min) to
# ceiling(max) of `df$indexDif` is cut into left-closed intervals of that
# width. The integer bin number is stored in a new column named
# "PobalGrouping<width>" and a bar chart of observations per bin is printed.
#
# Args:
#   df:              data frame with a numeric column `indexDif`.
#   grouping_values: numeric vector of interval widths to try.
#
# Returns:
#   `df` with one extra integer column per width that was not skipped.
#   Assign the result (e.g. `df <- pobalGrouping(df)`) to keep the columns.
pobalGrouping <- function(df, grouping_values = c(0.01, 0.05, 0.1, 0.2, 0.5, 1.0)) {
  index_values <- df[["indexDif"]]
  if (!is.numeric(index_values)) {
    stop("`df` must contain a numeric column named 'indexDif'", call. = FALSE)
  }
  if (length(index_values) == 0 || all(is.na(index_values))) {
    stop("'indexDif' has no non-missing values to group", call. = FALSE)
  }

  min_val <- floor(min(index_values, na.rm = TRUE))
  max_val <- ceiling(max(index_values, na.rm = TRUE))

  for (grouping in grouping_values) {
    breaks <- seq(min_val, max_val, by = grouping)

    # cut() needs at least two break points to form an interval
    if (length(breaks) < 2) {
      warning(paste("Skipping grouping", grouping, "- not enough breaks"))
      next
    }

    column <- paste0("PobalGrouping", grouping)

    df[[column]] <- cut(index_values,
                        breaks = breaks,
                        labels = FALSE,
                        include.lowest = TRUE,
                        right = FALSE)

    # Plots observations of number of schools per cluster in each grouping
    # method. `.data[[column]]` replaces the deprecated aes_string().
    p <- ggplot(df, aes(x = .data[[column]])) +
      geom_bar(fill = "#440154FF") +
      labs(
        title = paste("Grouping of indexDif using step =", grouping),
        x = paste("Group (Interval Width =", grouping, ")"),
        y = "Number of Observations"
      ) +
      theme_minimal()

    print(p)
  }

  df
}
# Keep the returned data frame: pobalGrouping() does not modify `df` in place,
# so without the assignment the new PobalGrouping* columns would be lost.
df <- pobalGrouping(df)

### Part 2: Grouping based on touching electoral districts using K-Nearest Neighbours:

In [None]:
# Pull the Eircode column out of the loaded school data.
# Change `Eircode` to the matching column name from your dataset.
eircode <- df$Eircode

# Look up the latitude and longitude of a single Eircode with the Nominatim
# (OpenStreetMap) search API.
#
# Args:
#   postcodes:  one Eircode (length-1 character value).
#   pause_secs: seconds to wait before the request. Nominatim's usage policy
#               allows at most one request per second, so keep this >= 1 when
#               calling the function in a loop.
#
# Returns:
#   A one-row data frame with columns `eircode`, `lat` and `long`. `lat` and
#   `long` are NA when the Eircode is missing/blank, the request fails, or no
#   match is found.
geocode <- function(postcodes, pause_secs = 1) {
  if (length(postcodes) != 1L) {
    stop("geocode() expects a single Eircode; use lapply() for several",
         call. = FALSE)
  }

  no_match <- data.frame(eircode = postcodes, lat = NA_real_, long = NA_real_) # Change name as required

  # Nothing to search for: skip the network call entirely
  if (is.na(postcodes) || !nzchar(trimws(as.character(postcodes)))) {
    return(no_match)
  }

  query <- paste0(postcodes, ", Ireland")
  viewbox <- "-10.5,55.4,-5.3,51.3" # Restricting location search to only check matching eircodes in Ireland
  url <- paste0("https://nominatim.openstreetmap.org/search?q=",
                utils::URLencode(query, reserved = TRUE),
                "&format=json&addressdetails=1&limit=1",
                "&viewbox=", viewbox,
                "&bounded=1") # Linking online database

  # Requirement by Nominatim for access: identify the application and give a
  # contact address. **INPUT YOUR EMAIL HERE**
  agent <- user_agent("edMatch (INPUT YOUR EMAIL HERE)")

  Sys.sleep(pause_secs)
  res <- tryCatch(GET(url, agent), error = function(e) e)

  # A transport failure or HTTP error for one Eircode should not abort the
  # whole batch; report it and return the NA row instead.
  if (inherits(res, "error")) {
    warning("Request failed for ", postcodes, ": ", conditionMessage(res),
            call. = FALSE)
    return(no_match)
  }
  if (http_error(res)) {
    warning("Nominatim returned HTTP ", status_code(res), " for ", postcodes,
            call. = FALSE)
    return(no_match)
  }

  json_text <- content(res, "text", encoding = "UTF-8")
  json <- fromJSON(json_text, simplifyVector = FALSE)

  # Ensuring there is a list and that results exist
  if (is.list(json) && length(json) > 0 && is.list(json[[1]])) {
    top_hit <- json[[1]]
    if (!is.null(top_hit$lat) && !is.null(top_hit$lon)) {
      return(data.frame(eircode = postcodes,
                        lat = as.numeric(top_hit$lat),
                        long = as.numeric(top_hit$lon)))
    }
  }

  no_match
}

In [None]:
# Combining the school eircodes from the dataframe with their matching
# latitude and longitude. Each distinct Eircode is geocoded once.
eircode <- unique(eircode)
locationData <- do.call(rbind, lapply(eircode, geocode))

# Ensuring eircode names match, use column name from your dataset.
# The column name must be a quoted string, both here and in `by =`.
names(locationData)[names(locationData) == "eircode"] <- "Eircode"
locationDataFrame <- left_join(df, locationData, by = "Eircode")
colnames(locationDataFrame) # Checking it worked

In [None]:
# Read the Electoral District boundary file (shapefile / geojson).
# Add the path to the Electoral District data saved on your local device.
ed_shp <- st_read(dsn = "C://")
head(ed_shp) # Show the first rows to confirm the read worked

In [None]:
# Build one geometry per Electoral District (ED) and an undirected graph whose
# edges link EDs that touch. EDs that touch nothing (e.g. islands) are linked
# to the connected ED with the nearest centroid.

# Dissolve multipart polygons so each unique ED code has a single geometry
ed_shp_singlepart <- ed_shp %>%
  group_by(ED_ID_STR) %>%
  summarize(geometry = st_union(geometry),
            .groups = "drop")

# Determines which electoral districts touch
adjacency_matrix <- st_touches(ed_shp_singlepart)

# Takes any electoral districts which are isolated (e.g. islands)
isolated_indices <- which(lengths(adjacency_matrix) == 0)
isolated_data <- ed_shp_singlepart[isolated_indices, c("ED_ID_STR", "geometry")]
print(isolated_data) # Check if/how many isolated Electoral Districts

# Calculates centroids of each electoral district
ed_centroids <- st_centroid(ed_shp_singlepart$geometry)

# Maps any isolated districts to the nearest connected electoral district
if (length(isolated_indices) > 0) {
  connected_indices <- setdiff(seq_len(nrow(ed_shp_singlepart)), isolated_indices)
  isolated_centroids <- ed_centroids[isolated_indices]
  connected_centroids <- ed_centroids[connected_indices]
  nearest_neighbors <- st_nearest_feature(isolated_centroids, connected_centroids)

  isolate_to_nearest <- data.frame(
    from = isolated_indices,
    to = connected_indices[nearest_neighbors]
  )
} else {
  isolate_to_nearest <- data.frame(from = integer(0), to = integer(0))
}

# Vector of each unique code for electoral districts
ed_ids <- ed_shp_singlepart$ED_ID_STR

# One row per (ED, touching ED) pair, expressed with ED codes
edges <- do.call(rbind, lapply(seq_along(adjacency_matrix), function(i) {
  if (length(adjacency_matrix[[i]]) == 0) return(NULL)
  data.frame(
    from = ed_ids[i],
    to = ed_ids[adjacency_matrix[[i]]]
  )
}))

# do.call(rbind, ...) yields NULL when no ED touches another
if (is.null(edges)) {
  edges <- data.frame(from = ed_ids[0], to = ed_ids[0])
}

# Each touching pair appears twice (A-B and B-A); keep one orientation
edges <- edges[as.character(edges$from) < as.character(edges$to), ]

if (nrow(isolate_to_nearest) > 0) {
  isolate_edges_named <- data.frame(
    from = ed_ids[isolate_to_nearest$from],
    to = ed_ids[isolate_to_nearest$to]
  )
  edges <- rbind(edges, isolate_edges_named)
}

edges <- unique(edges)
vertices_df <- data.frame(name = ed_ids)
g <- graph_from_data_frame(edges, vertices = vertices_df, directed = FALSE)
print(g) # Checking it worked

In [None]:
# Clustering of Electoral Districts on their centroid coordinates.
# NOTE(review): the clustering call is kmeans() (k-means), although the
# notebook headings and plot labels describe it as K-Nearest Neighbours.
ed_centroids <- st_centroid(ed_shp_singlepart$geometry) # Gets centroid of each electoral district
coords <- st_coordinates(ed_centroids) # Takes these co-ordinates
total_EDs <- nrow(ed_shp_singlepart)

# 5 used as dominant analysis method: the number of clusters is chosen so that
# clusters hold about 5 EDs on average.
group5 <- 5
numCluster5 <- ceiling(total_EDs / group5)
set.seed(42) # for reproducibility
km5 <- kmeans(coords, centers = numCluster5)
ed_shp_singlepart$clusterED5 <- km5$cluster # Adding to data

# Robustness checks with about 10 and 20 EDs per cluster. These create the
# clusterED10 / clusterED20 columns that the later join cells refer to. They
# run after km5, so the clusterED5 result for seed 42 is unchanged.
for (group_size in c(10, 20)) {
  km_robust <- kmeans(coords, centers = ceiling(total_EDs / group_size))
  ed_shp_singlepart[[paste0("clusterED", group_size)]] <- km_robust$cluster
}

In [None]:
# Convert your schools data frame to an sf points object using lat/long.

# left_join() only adds the ".x" suffix when `df` already had lat/long
# columns; otherwise the geocoded columns are simply "long" and "lat".
coord_cols <- if (all(c("long.x", "lat.x") %in% names(locationDataFrame))) {
  c("long.x", "lat.x")
} else {
  c("long", "lat")
}

# Latitude/longitude are degrees (WGS84, EPSG:4326). Declare them as such and
# then reproject to the ED layer's CRS, rather than labelling the degrees with
# the ED layer's CRS. If the ED layer has no CRS, leave the points without one
# so that both layers still agree.
ed_crs <- st_crs(ed_shp_singlepart)
point_crs <- if (is.na(ed_crs)) ed_crs else 4326

# na.fail = FALSE keeps schools whose Eircode could not be geocoded; they get
# an empty geometry instead of aborting the conversion.
schools_sf <- st_as_sf(locationDataFrame,
                       coords = coord_cols,
                       crs = point_crs,
                       na.fail = FALSE)

if (!is.na(ed_crs)) {
  schools_sf <- st_transform(schools_sf, ed_crs)
}

In [None]:
# Spatial join of the school points with the ED polygons (st_join's default
# predicate), carrying across the ED code and the cluster label.
schoolsED5 <- schools_sf %>%
  st_join(ed_shp_singlepart[, c("ED_ID_STR", "clusterED5")], left = TRUE)

# The 10- and 20-ED clusterings are only used for robustness checks
schoolsED10 <- schools_sf %>%
  st_join(ed_shp_singlepart[, c("ED_ID_STR", "clusterED10")], left = TRUE)
schoolsED20 <- schools_sf %>%
  st_join(ed_shp_singlepart[, c("ED_ID_STR", "clusterED20")], left = TRUE)

In [None]:
# Remove geometry so we can join by attribute columns.
# Base toupper()/trimws() are used to normalise the codes because stringr is
# not among the loaded libraries.
ed_shp_df <- ed_shp_singlepart %>%
  st_set_geometry(NULL) %>% # drop spatial information
  select(ED_ID_STR, clusterED5) %>%
  mutate(ED_ID_STR = toupper(trimws(ED_ID_STR)))

schoolsED5 <- schoolsED5 %>%
  mutate(`ED coded` = toupper(trimws(`ED coded`)))

# Now join without geometry conflict.
# schoolsED5 already carries `clusterED5` from the spatial join, so the
# code-based lookup is brought in under its own name; otherwise the join would
# split the column into clusterED5.x / clusterED5.y and `schoolsED5$clusterED5`
# would no longer exist. The spatial result is kept and gaps are filled from
# the ED-code match.
schoolsED5 <- schoolsED5 %>%
  left_join(rename(ed_shp_df, clusterED5_byCode = clusterED5),
            by = c("ED coded" = "ED_ID_STR")) %>%
  mutate(clusterED5 = coalesce(clusterED5, clusterED5_byCode))

# Check for unmatched EDs (NAs)
num_NA <- sum(is.na(schoolsED5$clusterED5))
print(paste("Number of schools without cluster assignment:", num_NA))

In [None]:
# Summarize cluster counts: number of EDs in each clusterED5 group
cluster_summary <- ed_shp_singlepart %>%
  st_set_geometry(NULL) %>% # drop geometry
  group_by(clusterED5) %>%
  summarise(
    num_EDs = n()
  ) %>%
  arrange(clusterED5)
print(cluster_summary) # Check

# Number of clusters, and seeing if any ED was left without a cluster.
# na.rm = TRUE stops a missing label being counted as a cluster of its own;
# the trailing "\n" keeps the two messages on separate lines.
num_clusters <- n_distinct(ed_shp_singlepart$clusterED5, na.rm = TRUE)
cat("Number of clusters: ", num_clusters, "\n", sep = "")
num_unassigned <- sum(is.na(ed_shp_singlepart$clusterED5))
cat("Number of EDs not assigned to any cluster: ", num_unassigned, "\n", sep = "")

#### Visualising clusters of touching Electoral Districts

In [None]:
# Map of the Electoral Districts, each filled by its clusterED5 label.
# Axis decorations and the legend are removed; the map is written to
# "clusterEDMap.png".
clusterEDMap <- ggplot(data = ed_shp_singlepart) +
  geom_sf(mapping = aes(fill = factor(clusterED5)), color = NA) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    fill = "KNN Clusters",
    title = "Electoral Districts grouped by KNN Clustering"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none"
  )

ggsave(filename = "clusterEDMap.png", plot = clusterEDMap,
       width = 12, height = 10, dpi = 300)

### Part 3: Saving all the information into dataframe for analysis

In [None]:
# Attach the ED cluster labels to every school and merge the new columns into
# the main dataset.

# ED polygons with their cluster columns. Defined before the spatial join that
# uses it.
ed_sf <- ed_shp_singlepart

# Spatial join: attach clusters from ED polygons to schools based on location
schools_with_clusters <- st_join(schools_sf, ed_sf, join = st_within, left = TRUE)

# Drop geometry and the clusters only used for robustness checks, for a tidier
# dataset. any_of() skips these columns if they were never created.
clusterdf <- schools_with_clusters %>%
  st_set_geometry(NULL) %>%
  select(-any_of(c("clusterED10", "clusterED20")))

# Same table under the name used by the first half of the original cell
final_df <- clusterdf

# Combine this dataset with the main one:
# Load main dataset
tryCatch({
  df <- read_excel("C:\\") # Attach main dataset
  message("Loaded main data")
}, error = function(e) {
  stop("Unable to load: ", conditionMessage(e), call. = FALSE)
})

# Adding new columns from this clusters dataset to main df.
# Join both based on 'Year' and 'Eircode' alignment, so each row of data
# aligns correctly. Use column names from your dataset.
newCol <- setdiff(names(clusterdf), names(df))
combined_df <- df %>%
  left_join(
    clusterdf %>% select(Year, Eircode, all_of(newCol)),
    by = c("Year", "Eircode")
  )

# Optional: check for NAs in new cluster columns
summary(combined_df[newCol])

In [None]:
# Write the combined dataset to an Excel file.
# Set the path to the folder and file name you wish to save the full dataset as.
write_xlsx(x = combined_df, path = "C:\\")