### Running instructions

1. Read the `README.md` file
2. Run `pipeline_part_1`
3. Select an R kernel for this Notebook
4. Run all cells
5. Wait a couple of seconds for full execution
6. Interpret the result of each analysis by looking at the figures in the `media` directory

In [None]:
# Load the questionnaire export and prepare it for the per-topic analyses:
# normalise the column names, drop the "other_diagnosis" rows, bin numeric
# answers into labelled categories, shorten the textual answer alternatives
# and finally give every column a descriptive name.
# The pipe, case_match(), rename_with() and all_of() are assumed to be attached
# by r_utilities.R - TODO confirm.
source("r_utilities.R")
df <- read.csv("questionnaire.csv")

# Swap a leading "X" for "Q". The pattern is anchored so that only the first
# character of a name can be replaced, never an "X" further inside it.
# NOTE(review): presumably the "X" is the prefix read.csv() puts in front of
# headers that start with a digit - verify against the CSV header.
colnames(df) <- sub("^X", "Q", colnames(df))
# Remove the first answer-type suffix (single/multi/value/none) from each name.
# NOTE(review): the "." in front of each suffix is an unescaped regex wildcard,
# so any character is accepted there - check the header before tightening it.
colnames(df) <- sub(".single|.multi|.value|.none", "", colnames(df))
df <- df[, order(names(df))] # Reorder columns alphabetically by name
df <- subset(df, diagnosis != "other_diagnosis") ## Remove "other_diagnosis" subset

## Recode select values to ease analysis
# car::recode() only rewrites the values named in the specification (there is
# no `else` clause), case_match() is called without `.default`, so answers
# that match none of the listed alternatives become NA.
df$Q1 <- car::recode(df$Q1, "10:19 = 'Adolescence'; 20:29 = 'Early_adult'; 30:39 = 'Adult'; 40:59 = 'Middle_age'; 60:99 = 'Elder'")

# Q101 and Q6 share the same three answer alternatives.
for (yes_no_col in c("Q101", "Q6")) {
  df[[yes_no_col]] <- case_match(
    df[[yes_no_col]],
    "Yes" ~ "Yes",
    "No" ~ "No",
    "I can't tell" ~ "Cant_tell"
  )
}

# Q7-Q11 are 0-10 scores that are binned with one shared specification.
severity_bins <- "0:3 = 'Low'; 4:7 = 'Medium'; 8:10 = 'Severe'"
for (score_col in c("Q7", "Q8", "Q9", "Q10", "Q11")) {
  df[[score_col]] <- car::recode(df[[score_col]], severity_bins)
}

# NOTE(review): "I am unable to walking about" is matched literally; if the
# data spells this answer differently those rows end up as NA - confirm.
df$Q12 <- case_match(
  df$Q12,
  "I have no problems in walking about" ~ "No_problems",
  "I have slight problems in walking about" ~ "Slight_problems",
  "I have moderate problems in walking about" ~ "Moderate_problems",
  "I have severe problems in walking about" ~ "Severe_problems",
  "I am unable to walking about" ~ "Unable"
)

df$Q13 <- case_match(
  df$Q13,
  "I have no problems washing or dressing myself" ~ "No_problems",
  "I have slight problems washing or dressing myself" ~ "Slight_problems",
  "I have moderate problems washing or dressing myself" ~ "Moderate_problems",
  "I have severe problems washing or dressing myself" ~ "Severe_problems",
  "I am unable to wash or dress myself" ~ "Unable"
)

df$Q14 <- case_match(
  df$Q14,
  "I have no problems doing my usual activities" ~ "No_problems",
  "I have slight problems doing my usual activities" ~ "Slight_problems",
  "I have moderate problems doing my usual activities" ~ "Moderate_problems",
  "I have severe problems doing my usual activities" ~ "Severe_problems",
  "I am unable to do my usual activities" ~ "Unable"
)

df$Q15 <- case_match(
  df$Q15,
  "I have no pain or discomfort" ~ "No_pain",
  "I have slight pain or discomfort" ~ "Slight_pain",
  "I have moderate pain or discomfort" ~ "Moderate_pain",
  "I have severe pain or discomfort" ~ "Severe_pain",
  "I have extreme pain or discomfort" ~ "Extreme_pain"
)

# NOTE(review): "moderatly" is matched literally, see the note on Q12.
df$Q16 <- case_match(
  df$Q16,
  "I am not anxious or depressed" ~ "Not",
  "I am slightly anxious or depressed" ~ "Slightly",
  "I am moderatly anxious or depressed" ~ "Moderatly",
  "I am severely anxious or depressed" ~ "Severely",
  "I am extremely anxious or depressed" ~ "Extremely"
)

df$Q17 <- car::recode(df$Q17, "0:33 = 'Bad'; 34:67 = 'Ok'; 68:100 = 'Good'")

df$Q3 <- case_match(
  df$Q3,
  "Less than eight elementary school classes" ~ "Pre_elementary",
  "Eight elementary school classes" ~ "elementary",
  "Graduation" ~ "Graduation",
  "Vocational training" ~ "Vocational",
  "College / university degree" ~ "College",
  "Doctoral degree, PhD" ~ "Doctoral"
)
df$Q4 <- case_match(
  df$Q4,
  "Single" ~ "Single",
  "Living in a relationship" ~ "Relationship",
  "Married, living in a cohabiting relationship" ~ "Married",
  "Divorced" ~ "Divorced",
  "Widowed" ~ "Widowed"
)

# Rename columns to include some question information
# Old and new names are kept side by side so a pair cannot drift apart.
column_labels <- c(
  diagnosis = "Diagnosis",
  Q1 = "Q1_age",
  Q10 = "Q10_dyschezia",
  Q101 = "Q101_sun_sensitivity",
  Q102 = "Q102_birthmarks",
  Q11 = "Q11_dysuria",
  Q12 = "Q12_mobility",
  Q13 = "Q13_selfcare",
  Q14 = "Q14_activities",
  Q15 = "Q15_discomfort",
  Q16 = "Q16_anxiety",
  Q17 = "Q17_health",
  Q2 = "Q2_residence",
  Q23 = "Q23_tired",
  Q24 = "Q24_stiffness",
  Q25 = "Q25_bodypain",
  Q26 = "Q26_headaches",
  Q27 = "Q27_unwell_sleep",
  Q28 = "Q28_difficulty_concentrating",
  Q29 = "Q29_stress_symptoms",
  Q3 = "Q3_education",
  Q30 = "Q30_neck_tension",
  Q31 = "Q31_forgetful",
  Q4 = "Q4_marital",
  Q6 = "Q6_infertility",
  Q7 = "Q7_dysmenorrhea",
  Q8 = "Q8_pelvic",
  Q9 = "Q9_dysparunia"
)
old_col_names <- names(column_labels)
new_col_names <- unname(column_labels)
# all_of() marks old_col_names as an external character vector (the bare form
# is deprecated in tidyselect); match() looks every selected name up by value,
# so the result no longer depends on the order the names arrive in.
df <- df %>%
  rename_with(
    ~ new_col_names[match(.x, old_col_names)],
    .cols = all_of(old_col_names)
  )

In [None]:
## Food subset, question 55 and 56
# Loads the food answers, applies the same column clean-up as the main
# questionnaire, labels the numeric food codes and writes three figures
# (prevalence, MCA scree plot, MCA factor map) as PNG files into ../media/.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
food_subset <- read.csv("food.csv")
# Only a leading "X" is replaced (anchored pattern).
colnames(food_subset) <- sub("^X", "Q", colnames(food_subset))
# NOTE(review): the "." in front of each suffix is an unescaped regex wildcard.
colnames(food_subset) <- sub(".single|.multi|.value|.none", "", colnames(food_subset))
food_subset <- food_subset[, order(names(food_subset))] # Reorder columns
food_subset <- subset(food_subset, diagnosis != "other_diagnosis") ## Remove "other_diagnosis" subset

# Q55 and Q56 use the same numeric codes; codes outside 82-87 become NA
# because case_match() is called without `.default`.
for (food_col in c("Q55", "Q56")) {
  food_subset[[food_col]] <- case_match(
    food_subset[[food_col]],
    82 ~ "sugar",
    83 ~ "gluten",
    84 ~ "coffee",
    85 ~ "soy",
    86 ~ "dairy",
    87 ~ "red_meat"
  )
}

old_col_names <- c("diagnosis", "Q55", "Q56")
new_col_names <- c("Diagnosis", "Q55_removed", "Q56_limited")
# all_of() marks old_col_names as an external character vector (the bare form
# is deprecated in tidyselect); match() looks each selected name up by value.
food_subset <- food_subset %>%
  rename_with(
    ~ new_col_names[match(.x, old_col_names)],
    .cols = all_of(old_col_names)
  )

media_dir <- file.path("..", "media")

# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(food_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(14, 14) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases

png(file.path(media_dir, "prevalence_food.png"), width = 960, height = 960)
describe_categories(food_subset, rep_times, endo_prop)
dev.off()

food_subset <- food_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(food_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_food.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_food.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## General health subset, questions 15, 17, 25, 26 and 29
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q15_discomfort", "Q17_health", "Q25_bodypain", "Q26_headaches", "Q29_stress_symptoms")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(10, 6, 10, 10, 10) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_general.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_general.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_general.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## Activity subset, questions 12, 13, 14, 24 and 30
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q12_mobility", "Q13_selfcare", "Q14_activities", "Q24_stiffness", "Q30_neck_tension")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(10, 10, 10, 10, 10) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_activity.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_activity.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_activity.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## Pelvic related subset, questions 6, 7, 8, 9, 10 and 11
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q6_infertility", "Q7_dysmenorrhea", "Q8_pelvic", "Q9_dysparunia", "Q10_dyschezia", "Q11_dysuria")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(8, 6, 6, 6, 6, 6) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_pelvic.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_pelvic.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_pelvic.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## Socio-economic subset, questions 1-4
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q1_age", "Q2_residence", "Q3_education", "Q4_marital")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(12, 9, 14, 10) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_socioeco.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_socioeco.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_socioeco.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## Mental subset, questions 16, 23, 27, 28 and 31
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q16_anxiety", "Q23_tired", "Q27_unwell_sleep", "Q28_difficulty_concentrating", "Q31_forgetful")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(10, 10, 10, 10, 10) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_mental.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for the first 10 dimensions
png(file.path(media_dir, "scree_mental.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_mental.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()

In [None]:
## Other indicators subset, questions 101 and 102
# Writes three figures for this question group as PNG files into ../media/:
# prevalence by category, MCA scree plot and MCA factor map.
# Output paths are spelled out instead of switching the working directory, so
# a failure half-way through the cell cannot leave the session in ../media/.
# NOTE(review): assumes describe_categories() and visualize_mca() only draw on
# the active graphics device and do not depend on the working directory -
# confirm in r_utilities.R.
subset_cols <- c("Diagnosis", "Q101_sun_sensitivity", "Q102_birthmarks")
df_subset <- df[, subset_cols, drop = FALSE]
media_dir <- file.path("..", "media")

## Relative endometriosis prevalence by category
# Percentage (one decimal) of the first Diagnosis level in table() order.
# NOTE(review): presumably the endometriosis group sorts first - verify.
diagnosis_pct <- round(prop.table(table(df_subset$Diagnosis)) * 100, 1)
endo_prop <- diagnosis_pct[1][[1]] ## Proportion of subset that have endometriosis
rep_times <- c(8, 12) ## Twice the answer alternatives, subtract unused alternatives or add for NaN cases
png(file.path(media_dir, "prevalence_other.png"), width = 960, height = 960)
describe_categories(df_subset, rep_times, endo_prop)
dev.off()

df_subset <- df_subset %>% drop_na() ## Drop incomplete rows before calculating MCA to avoid NA labels
res_mca <- MCA(df_subset, quali.sup = "Diagnosis", graph = FALSE)

## Scree plot for all 6 dimensions
png(file.path(media_dir, "scree_other.png"), width = 960, height = 480)
options(repr.plot.width = 14, repr.plot.height = 7)
# Left as a top-level expression so the plot is auto-printed onto the device.
fviz_screeplot(res_mca, addlabels = TRUE, ylim = c(0, 30)) +
  theme(
    text = element_text(size = 20),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 15)
  )
dev.off()

## Visualize MCA results on a factor map
png(file.path(media_dir, "mca_other.png"), width = 960, height = 960)
visualize_mca(res_mca)
dev.off()