### **01 - Incidence of MLTC**
#### **01D02 - Manuscript outputs - age standardisation outputs**

**Chart**: Progression rates by age and gender

**Imports**

In [1]:
%%pyspark
# required imports

# requires blank line after last import


In [2]:
%%sparkr

# Install svglite on demand: requireNamespace() reports whether the package
# can be loaded, without attaching it, so the install only runs when needed.
if (isFALSE(requireNamespace("svglite", quietly = TRUE))) {
  install.packages("svglite")
}

In [3]:
# Load necessary libraries
library(SparkR)
library(ggplot2)
library(patchwork)
library(svglite)

**Parameter cell**

In [4]:
%%pyspark
# parameter cell
# Schema that holds the incidence output tables. The value is copied into the
# Spark configuration (key 'param.incidence_schema') and substituted into the
# SQL that loads the results table, so leaving it blank produces a table
# reference with no schema name.
# NOTE(review): presumably overridden at run time by the caller - confirm.
incidence_schema = ""  # "mltc_incidence_outputs_v40_20230331"

# optional, can be blank


In [5]:
%%pyspark
# Expose the notebook parameter through the Spark configuration under the
# 'param.' prefix so that SQL text can reference it as ${param.incidence_schema}
spark.conf.set(key="param.incidence_schema", value=incidence_schema)


---

#### **Creating the plot**

**a - Load data**

In [6]:
# Load the age-standardisation results as a SparkDataFrame. The
# ${param.incidence_schema} placeholder refers to the value stored in the
# Spark configuration by the parameter cells.
data <- SparkR::sql(
  "SELECT * FROM ${param.incidence_schema}.output_01d02_incidence_results_age_standardisation"
)

**b - Create plot**

Data cleaning steps

In [7]:
# Replace the literal string "NA" with "All" in the descriptive columns.
# The anchored pattern "^NA$" only rewrites values that are exactly "NA".
# NOTE(review): regexp_replace leaves SQL NULLs untouched, so this presumably
# relies on the source table storing the text "NA" - confirm.
label_columns <- c(
  "gender_description",
  "breakdown_type",
  "socio_demographic_breakdown"
)
for (label_column in label_columns) {
  data <- withColumn(
    data,
    label_column,
    regexp_replace(data[[label_column]], "^NA$", "All")
  )
}

# Convert progression rates and their confidence limits to numeric (double)
numeric_columns <- c(
  "progression_rate_0_1_plus",
  "progression_rate_1_2_plus",
  "progression_rate_2_3_plus",
  "lower_cl_0_1",
  "upper_cl_0_1",
  "lower_cl_1_2",
  "upper_cl_1_2",
  "lower_cl_2_3",
  "upper_cl_2_3"
)
for (numeric_column in numeric_columns) {
  data <- withColumn(
    data,
    numeric_column,
    cast(data[[numeric_column]], "double")
  )
}

# Split by gender, keeping rows whose breakdown_type is "Age" or "All"
# (the "All" rows are the ones relabelled from "NA" above)
is_age_or_all <- data$breakdown_type == "Age" | data$breakdown_type == "All"

female_data <- SparkR::filter(
  data,
  data$gender_description == "FEMALE" & is_age_or_all
)

male_data <- SparkR::filter(
  data,
  data$gender_description == "MALE" & is_age_or_all
)

Unpivot data for easier charting

In [8]:
# Select the long-format rows for a single progression step.
#   df:   SparkDataFrame holding the wide progression-rate columns
#   step: step suffix shared by the column names, e.g. "0_1" selects
#         progression_rate_0_1_plus, lower_cl_0_1 and upper_cl_0_1
# Returns a SparkDataFrame with columns socio_demographic_breakdown, Value,
# Progression_Rate (the name of the source rate column), Lower_CL and Upper_CL.
select_progression_step <- function(df, step) {
  rate_column <- paste0("progression_rate_", step, "_plus")
  SparkR::select(
    df,
    df$socio_demographic_breakdown,
    alias(df[[rate_column]], "Value"),
    alias(lit(rate_column), "Progression_Rate"),
    alias(df[[paste0("lower_cl_", step)]], "Lower_CL"),
    alias(df[[paste0("upper_cl_", step)]], "Upper_CL")
  )
}

# Unpivot the three progression steps of one SparkDataFrame into long format
# by stacking the per-step selections in the order 0_1, 1_2, 2_3.
unpivot_progression_rates <- function(df) {
  step_frames <- lapply(
    c("0_1", "1_2", "2_3"),
    function(step) select_progression_step(df, step)
  )
  Reduce(function(stacked, next_frame) unionAll(stacked, next_frame), step_frames)
}

# Reshape female data
female_long <- unpivot_progression_rates(female_data)

# Reshape male data
male_long <- unpivot_progression_rates(male_data)

Create R data frames

In [9]:
# Bring the reshaped Spark data down to the driver as ordinary R data frames,
# which is what ggplot2 needs for plotting
female_long_local <- SparkR::as.data.frame(female_long)
male_long_local <- SparkR::as.data.frame(male_long)

Create charts

In [10]:
# Calculate the maximum value for the y-axis across both datasets so the two
# panels share one scale. Multiply by 1.1 for a slight buffer above the maximum.
max_value <- max(
  c(female_long_local$Upper_CL, male_long_local$Upper_CL),
  na.rm = TRUE
) * 1.1

# Fill colour and legend label for each progression-rate series. The colour
# names are also passed as the scale breaks, so each label is paired with its
# series explicitly rather than by the order in which values appear in the data.
progression_colours <- c(
  "progression_rate_0_1_plus" = "#6CB0BF",
  "progression_rate_1_2_plus" = "#DEA62A",
  "progression_rate_2_3_plus" = "#DC3C5D"
)
progression_labels <- c(
  "Progression Rate 0 to 1+",
  "Progression Rate 1 to 2+",
  "Progression Rate 2 to 3+"
)

# Build the dodged bar chart with confidence-interval error bars for one gender.
#   plot_data:  local data frame with columns socio_demographic_breakdown,
#               Value, Progression_Rate, Lower_CL and Upper_CL
#   plot_title: title shown above the panel
#   y_max:      upper limit of the y-axis
# Returns a ggplot object.
# NOTE(review): socio_demographic_breakdown is plotted as-is; a character
# column is ordered alphabetically on the x-axis - confirm the age-group
# labels sort in the intended order.
build_progression_plot <- function(plot_data, plot_title, y_max) {
  dodge_width <- 0.95

  ggplot(
    plot_data,
    aes(x = socio_demographic_breakdown, y = Value, fill = Progression_Rate)
  ) +
    geom_col(position = position_dodge(width = dodge_width)) +
    geom_errorbar(
      aes(ymin = Lower_CL, ymax = Upper_CL),
      width = 0.2,
      position = position_dodge(width = dodge_width)
    ) +
    labs(
      title = plot_title,
      y = "Progression rate per 100 person years",
      x = "Age group"
    ) +
    scale_fill_manual(
      values = progression_colours,
      breaks = names(progression_colours),
      labels = progression_labels
    ) +
    theme_minimal() +
    theme(
      text = element_text(family = "Helvetica"),
      plot.title = element_text(size = 25),
      axis.title = element_text(size = 20),
      axis.text = element_text(size = 15),
      legend.title = element_blank(),
      legend.text = element_text(size = 25)
    ) +
    ylim(0, y_max)
}

# Create the plots for Female and Male
plot_female <- build_progression_plot(female_long_local, "Female", max_value)
plot_male <- build_progression_plot(male_long_local, "Male", max_value)

# Combine plots using patchwork and add a shared legend. `+` binds tighter
# than `&`, so the layout is added first and the theme then applies to every panel.
final_plot <- (plot_female | plot_male) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")

# Display the combined plot
print(final_plot)