### **01 - Incidence of MLTC**
#### **01F - Manuscript outputs - progression rate by initial condition count**

**Chart**: Progression rate by initial condition count

**Imports**

In [1]:
%%pyspark
# required imports

# requires blank line after last import


In [2]:
%%sparkr

# Install svglite only when it cannot already be loaded, so that re-running
# this cell does not trigger a fresh download each time.
if (isFALSE(requireNamespace("svglite", quietly = TRUE))) {
  install.packages(pkgs = "svglite")
}

In [3]:
# Load necessary libraries
library(SparkR)
library(ggplot2)
library(patchwork)
library(svglite)

**Parameter cell**

In [4]:
%%pyspark
# parameter cell
# Schema that holds the incidence output tables. The value is copied into the
# Spark configuration further down and substituted into the SQL queries as
# ${param.incidence_schema}. The default is an empty string; the trailing
# comment shows an example value.
# NOTE(review): presumably overridden at run time - confirm, because an empty
# value leaves the table references below without a schema name.
incidence_schema = ""  # "mltc_incidence_outputs_v40_20230331"

# optional, can be blank


In [5]:
%%pyspark
# Set parameters in Spark configuration with 'param.' prefix (for use in SQL cells)
# The queries later in this notebook reference this key as
# ${param.incidence_schema}, so the key name here must match that spelling.
spark.conf.set("param.incidence_schema", incidence_schema)


---

#### **Creating the plot**

**a - Load data**

In [6]:
# Load the 01F results table as a Spark DataFrame. The schema name is filled
# in from the Spark configuration key param.incidence_schema.
data <- SparkR::sql(
  "SELECT * FROM ${param.incidence_schema}.output_01F_incidence_results_by_initial_condition_count"
)

In [7]:
%%sql

-- Show the 01F results table in the notebook output; this is the same query
-- that is used to load the data for plotting.
SELECT * FROM ${param.incidence_schema}.output_01F_incidence_results_by_initial_condition_count

**b - Create plot**

Create charts

In [8]:
# Convert Spark DataFrame to R DataFrame for ggplot2
data_local <- collect(data)

# Convert previous_condition_count to a factor so it is plotted as a discrete
# axis rather than a continuous one
data_local$previous_condition_count <- as.factor(data_local$previous_condition_count)

# Remove rows that contain an NA in any column, if they exist
data_local <- na.omit(data_local)

# Exclude rows where previous_condition_count is 9 (as in reality this is a
# combined 9+ cohort)
data_local <- subset(data_local, previous_condition_count != 9)

# Filtering rows does not remove their factor levels. Drop the unused levels
# so that the integer codes returned by as.numeric() match the positions that
# ggplot2 assigns on a discrete axis (which omits unused levels by default).
# The box plot relies on those codes to place its whisker caps.
data_local$previous_condition_count <- droplevels(
  data_local$previous_condition_count
)

In [9]:


# Theme shared by both panels, defined once so the two charts cannot drift
# apart when font sizes or margins are adjusted.
axis_label_margin <- margin(t = 20, r = 20, b = 20, l = 20)
panel_theme <- theme_minimal() +
  theme(
    text = element_text(family = "Helvetica"),
    axis.title = element_text(size = 25, margin = axis_label_margin),
    axis.text = element_text(size = 25, margin = axis_label_margin),
    panel.grid.minor = element_blank()
  )

# Create the bar chart for progression_rate, with confidence limits as error
# bars and the rounded rate printed inside the top of each bar
bar_chart <- ggplot(
  data_local,
  aes(x = previous_condition_count, y = progression_rate)
) +
  geom_col(fill = "#6cb1beff", width = 0.7) +
  geom_errorbar(aes(ymin = lower_cl, ymax = upper_cl), width = 0.2) +
  geom_text(
    aes(label = round(progression_rate, 1)),
    vjust = 1.5,
    color = "white",
    fontface = "bold",
    size = 8
  ) +
  labs(title = NULL, y = "Progression rate per 100 person years", x = NULL) +
  panel_theme

# Create the box plot from pre-computed percentiles (stat = "identity"):
# whiskers span the 5th to 95th percentile, the box spans the 25th to 75th
# and the middle line is the median.
box_plot <- ggplot(data_local, aes(x = previous_condition_count)) +
  geom_boxplot(
    aes(
      ymin = perc_05,
      lower = perc_25,
      middle = perc_50,
      upper = perc_75,
      ymax = perc_95
    ),
    stat = "identity",
    fill = "#a7d0d8ff",
    color = "#6cb1beff",
    # NOTE(review): the whisker.* styling arguments are only recognised by
    # recent ggplot2 releases; older versions ignore them with a warning.
    # Confirm the installed version. The linewidth argument is
    # whisker.linewidth (whisker.size is not a geom_boxplot parameter).
    whisker.linetype = "solid",
    whisker.linewidth = 0.5
  ) +
  # Horizontal caps at the whisker ends. The x positions are numeric, so they
  # are derived from the factor codes; droplevels() keeps those codes aligned
  # with the discrete axis, which omits unused levels.
  geom_segment(
    aes(
      x = as.numeric(droplevels(previous_condition_count)) - 0.3,
      xend = as.numeric(droplevels(previous_condition_count)) + 0.3,
      y = perc_05,
      yend = perc_05
    ),
    color = "black"
  ) +
  geom_segment(
    aes(
      x = as.numeric(droplevels(previous_condition_count)) - 0.3,
      xend = as.numeric(droplevels(previous_condition_count)) + 0.3,
      y = perc_95,
      yend = perc_95
    ),
    color = "black"
  ) +
  # Median value printed just below the median line
  geom_text(
    aes(y = perc_50, label = round(perc_50, 2)),
    vjust = 1.5,
    color = "#5fa3b1",
    fontface = "bold",
    size = 8
  ) +
  labs(
    title = NULL,
    y = "Age (years) at progression",
    x = "Initial condition count"
  ) +
  panel_theme

# Stack the two panels vertically with equal heights using patchwork
final_plot <- (bar_chart / box_plot) +
  plot_layout(heights = c(1, 1), ncol = 1)

# Display the combined plot
print(final_plot)