In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
from matplotlib import pyplot as plt
from prop_confidence_intervals import wald

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style with the "paper" context to all later plots.
sns.set(style="whitegrid", context="paper")

Read in cleaned data

In [None]:
# Load the cleaned dataset and keep only rows flagged keep_sample == 1,
# i.e. drop the unpaired/erroneous samples.
data = pd.read_csv("PIP_cleaned_data.csv").loc[
    lambda frame: frame["keep_sample"] == 1
]

Including 'All' column (i.e., detecting ***any*** microorganism)

In [None]:
# "All" is a binary indicator: 1 when at least one of the listed
# microorganisms was detected in the sample, otherwise 0.
# A plain row sum would yield 2+ for samples with several organisms; those
# values would not match the 1/0 categories used by the contingency tables
# and would inflate the per-container positivity means.
# Assumes each organism column is a numeric/boolean detection flag — TODO confirm.
data["All"] = (
    data[
        [
            "Coliform (non-E. coli)",
            "Enterococcus sp.",
            "Escherichia coli",
            "Klebsiella pneumoniae",
            "Proteus sp.",
            "Pseudomonas aeruginosa",
            "Staphylococcus saprophyticus",
            "Streptococcus Group B",
        ]
    ]
    .gt(0)  # missing values compare as False, matching sum()'s NaN-skipping
    .any(axis=1)
    .astype(int)
)

Define microorganism list

In [None]:
# Individually reported microorganisms, in reporting order.
individual_organisms = [
    "Coliform (non-E. coli)",
    "Enterococcus sp.",
    "Escherichia coli",
    "Klebsiella pneumoniae",
    "Proteus sp.",
    "Pseudomonas aeruginosa",
    "Staphylococcus saprophyticus",
    "Streptococcus Group B",
]

# Full list of analysed columns: every organism followed by the pooled "All".
target_vars = individual_organisms + ["All"]

Check for missing values

In [None]:
# Number of missing entries in each column (displayed as the cell output).
data.isna().sum(axis=0)

Keep only samples that were cultured in ***both*** the PiP and the control container

In [None]:
# Boolean Series indexed by specimen number: True when the `cultured` flags of
# that specimen's rows add up to 2 (i.e. both containers were cultured).
# `cultured` is selected *before* aggregating so the sum never touches the
# other columns (text columns would otherwise be concatenated or raise).
both_cultured_df = data.groupby("Specimen Number")["cultured"].sum() == 2

In [None]:
# Specimen numbers whose flag is True in the boolean mask.
is_both_cultured = both_cultured_df.to_numpy()
both_cultured_specimen_list = both_cultured_df.index[is_both_cultured].to_list()

In [None]:
# How many specimens were cultured in both containers.
n_both_cultured = len(both_cultured_specimen_list)
n_both_cultured

In [None]:
# Restrict the data to specimens that were cultured in both containers.
in_both_cultured = data["Specimen Number"].isin(both_cultured_specimen_list)
data = data.loc[in_both_cultured]

Positivity comparison between control and PiP

In [None]:
# Mean of every target column per container type (positivity proportion for
# 0/1 indicator columns).
data.groupby("container")[target_vars].mean()

Store contingency tables & Kappa values in dictionaries

In [None]:
# For every microorganism build
#   * a 2x2 contingency table (rows: plastic result, columns: PIP result,
#     both ordered 1 then 0), and
#   * Cohen's kappa between the two containers.
contingency_tbls_dict = {}
kappa_scores_dict = {}

# Split the data once instead of re-filtering on every iteration.
# Samples are paired by row position after resetting the index, so this
# assumes both subsets list the specimens in the same order — TODO confirm.
plastic_rows = data[data.container == "PLASTIC"].reset_index(drop=True)
pip_rows = data[data.container == "PIP"].reset_index(drop=True)
if len(plastic_rows) != len(pip_rows):
    raise ValueError(
        "PLASTIC and PIP subsets have different lengths "
        f"({len(plastic_rows)} vs {len(pip_rows)}); samples cannot be paired"
    )

for x in target_vars:

    # Drop a pair when either side is missing, so the contingency table and
    # kappa are computed on exactly the same, still-aligned pairs.
    paired = pd.DataFrame({"plastic": plastic_rows[x], "pip": pip_rows[x]}).dropna(
        how="any"
    )

    table = pd.crosstab(
        index=paired["plastic"],
        columns=paired["pip"],
        rownames=["Present in plastic container"],
        colnames=["Present in PIP container"],
    ).reindex(columns=[1, 0], index=[1, 0], fill_value=0)
    contingency_tbls_dict[x] = table

    kappa = cohen_kappa_score(paired["plastic"], paired["pip"])
    kappa_scores_dict[x] = kappa

Store stacked contingency table

In [None]:
# Stack the per-organism 2x2 tables under an outer "Microorganism" index level
# (the inner level is left unnamed), save the result and display it.
combined = pd.concat(contingency_tbls_dict).rename_axis(index=["Microorganism", ""])
combined.to_csv("tables/microorganism_comparison_contengency.csv")
combined

Store Kappa values

In [None]:
# One-column table of kappa values, indexed by microorganism; saved and shown.
kappa_data = pd.Series(kappa_scores_dict, name="Kappa score").to_frame()
kappa_data.to_csv("tables/microorganism_kappa_values.csv")
kappa_data

Summary table comparing the PiP and control testing kits

In [None]:
# Build one single-row summary frame per microorganism and collect them in
# `final_table`; the plain lists hold the same per-organism quantities.
organsism_list, lower_ci_list, upper_ci_list = [], [], []
diff_list, pip_proportion_list, pla_proportion_list = [], [], []

final_table = []

for o in target_vars:

    # Cells of the 2x2 table, addressed as (row label, column label).
    # The tables are built with the plastic result on the rows and the PIP
    # result on the columns.
    table_o = contingency_tbls_dict[o]
    A = table_o.loc[1, 1]  # positive in both
    B = table_o.loc[0, 1]  # positive in PiP only
    C = table_o.loc[1, 0]  # positive in plastic only
    D = table_o.loc[0, 0]  # negative in both
    N = len(data) / 2  # two rows per paired sample

    diff, lower_ci, upper_ci = wald(A, B, C, D, N)

    pip_positive = A + B
    control_positive = A + C

    organsism_list.append(o)
    lower_ci_list.append(lower_ci)
    upper_ci_list.append(upper_ci)
    diff_list.append(diff)
    pip_proportion_list.append(pip_positive / N)
    pla_proportion_list.append(control_positive / N)

    # Rounded interval bounds and truncated per-10,000 prevalences.
    lower_rounded = round(lower_ci, 4)
    upper_rounded = round(upper_ci, 4)
    control_per_10k = int(10_000 * control_positive / N)
    pip_per_10k = int(10_000 * pip_positive / N)

    primary_outcomes_df = pd.DataFrame(
        {
            "Positive (PiP)": pip_positive,
            "Positive (control)": control_positive,
            "Difference in proportion": pip_positive / N - control_positive / N,
            "lower 95% CI": lower_rounded,
            "Upper 95% CI": upper_rounded,
            "Estimated control prevalence (per 10,000)": control_per_10k,
            "Estimated PiP prevalence (per 10,000)": pip_per_10k,
            # Interval bounds re-expressed as prevalence per 10,000, anchored
            # on the control prevalence and truncated to whole numbers.
            "Lower change (per 10,000)": int(lower_rounded * 10_000 + control_per_10k),
            "Upper change (per 10,000)": int(upper_rounded * 10_000 + control_per_10k),
        },
        index=[o],
    )

    final_table.append(primary_outcomes_df)

In [None]:
# Stack the per-organism rows and leave out the final one (the last entry of
# target_vars, i.e. the pooled "All" row) before saving and displaying.
summary_data = pd.concat(final_table).head(-1)
summary_data.to_csv("tables/microorganism_detection_summary.csv")
summary_data

Confidence interval plots

In [None]:
# Plot data: point estimate plus a symmetric half-width taken from the upper
# bound; the organism names end up in the "index" column.
df = (
    summary_data.assign(Mean=lambda frame: frame["Difference in proportion"])
    .assign(yerr=lambda frame: frame["Upper 95% CI"] - frame["Mean"])
    .reset_index(drop=False)
)

# NOTE(review): only df.iloc[:-1] is drawn. summary_data has already had its
# last row removed when it was built, so this leaves one more organism out of
# the figure — confirm that this is intended.
plotted = df.iloc[:-1]

# Plot style
sns.set(style="whitegrid", context="paper")

# Confidence interval plot: one marker per organism with horizontal error bars
fig, ax = plt.subplots(figsize=(8, 4))
sns.pointplot(
    x="Mean",
    y="index",
    data=plotted,
    linestyle="none",
    marker=".",
    markersize=5,
    ax=ax,
)
ax.errorbar(
    plotted["Mean"],
    plotted["index"],
    xerr=plotted["yerr"],
    capsize=4,
    linestyle="None",
    color="b",
)

# Write the numeric bounds next to each error bar: lower bound to the left,
# upper bound to the right, each nudged slightly away from the cap.
for i, (centre, half_width) in enumerate(zip(plotted["Mean"], plotted["yerr"])):
    lower = centre - half_width
    upper = centre + half_width

    for bound, nudge, anchor in ((lower, -0.0005, "right"), (upper, 0.0005, "left")):
        ax.text(
            bound + nudge,
            i,
            f"{bound:.4f}" if bound != 0 else "0",
            ha=anchor,
            va="center",
            fontsize=10,
            color="black",
            bbox=dict(facecolor="white", edgecolor="none", pad=0.5),
        )

# Plot labels/settings
ax.set_xlabel("Estimated proportion difference (PiP-control)", fontsize=12)
ax.set_ylabel("", fontsize=12)
ax.set_title(
    "Difference in Proportion of Microorganism Detection (Wald 95% CIs)", fontsize=12.5
)
ax.grid(True, linestyle="--", linewidth=0.5)

ax.set_xlim(-0.02, 0.02)
ax.tick_params(axis="both", which="major", labelsize=12)

fig.tight_layout()

fig.savefig(
    "figures/microorganism_equivalency_comparison.png", dpi=600, bbox_inches="tight"
)

Comparison of culture agreement between testing kits

In [None]:
# Reload the cleaned data from disk (the earlier `data` had been narrowed to
# specimens cultured in both containers) and again keep only the rows
# flagged keep_sample == 1, i.e. drop unpaired/erroneous samples.
data = pd.read_csv("PIP_cleaned_data.csv")
keep_mask = data["keep_sample"].eq(1)
data = data[keep_mask]

Contingency table for all samples

In [None]:
# Culture outcome per container for all samples.
compare_culture = data[["container", "cultured"]]

# Samples are paired by row position after the index reset.
plastic_cultured = compare_culture.loc[
    compare_culture["container"] == "PLASTIC", "cultured"
].reset_index(drop=True)
pip_cultured = compare_culture.loc[
    compare_culture["container"] == "PIP", "cultured"
].reset_index(drop=True)

# 2x2 table, rows = plastic, columns = PIP, both ordered 1 then 0.
table_1 = pd.crosstab(
    index=plastic_cultured,
    columns=pip_cultured,
    rownames=["Cultured in plastic"],
    colnames=["Cultured in PIP"],
).reindex(index=[1, 0], columns=[1, 0], fill_value=0)
table_1

In [None]:
# Unpack the 2x2 table row by row. With rows and columns both ordered
# [1, 0], the flattened cells are (1,1), (1,0), (0,1), (0,0) -> A, C, B, D.
A, C, B, D = table_1.loc[[1, 0], [1, 0]].to_numpy().ravel()
N = compare_culture.shape[0] / 2  # two rows per paired sample

# Difference in proportions and its Wald interval; only the difference is shown.
diff, lower_ci, upper_ci = wald(A, B, C, D, N)
print(diff)

In [None]:
# Overall agreement perc: share of pairs where both containers gave the same result.
concordant_pairs = A + D
all_pairs = A + B + C + D
concordant_pairs / all_pairs

Culture agreement split by ward: antenatal samples only, then with antenatal samples removed

In [None]:
# Culture outcome per container, antenatal ward only.
antenatal_mask = data["ward"] == "ANTENATAL"
compare_culture = data.loc[antenatal_mask, ["container", "cultured"]]

# Pair samples by row position within each container subset.
cultured_by_container = {
    kind: compare_culture[compare_culture.container == kind]["cultured"].reset_index(
        drop=True
    )
    for kind in ("PLASTIC", "PIP")
}

# 2x2 table, rows = plastic, columns = PIP, both ordered 1 then 0.
table_2 = pd.crosstab(
    index=cultured_by_container["PLASTIC"],
    columns=cultured_by_container["PIP"],
    rownames=["Cultured in plastic"],
    colnames=["Cultured in PIP"],
).reindex(index=[1, 0], columns=[1, 0], fill_value=0)
table_2

In [None]:
# Culture outcome per container, every ward except antenatal.
non_antenatal_mask = data["ward"] != "ANTENATAL"
compare_culture = data.loc[non_antenatal_mask, ["container", "cultured"]]

# Pair samples by row position within each container subset.
is_plastic = compare_culture["container"] == "PLASTIC"
is_pip = compare_culture["container"] == "PIP"
plastic_side = compare_culture.loc[is_plastic, "cultured"].reset_index(drop=True)
pip_side = compare_culture.loc[is_pip, "cultured"].reset_index(drop=True)

# 2x2 table, rows = plastic, columns = PIP, both ordered 1 then 0.
table_3 = pd.crosstab(
    index=plastic_side,
    columns=pip_side,
    rownames=["Cultured in plastic"],
    colnames=["Cultured in PIP"],
).reindex(index=[1, 0], columns=[1, 0], fill_value=0)
table_3

In [None]:
# 2x2 counts for the non-antenatal comparison (rows = plastic, columns = PIP).
# Read them from table_3, the non-antenatal culture table: the bare name
# `table` is the leftover variable of the per-organism contingency loop and
# holds the last microorganism's table, not a culture-agreement table.
A = table_3.loc[1, 1]  # cultured in both
B = table_3.loc[0, 1]  # cultured in PIP only
C = table_3.loc[1, 0]  # cultured in plastic only
D = table_3.loc[0, 0]  # cultured in neither
N = len(compare_culture) / 2  # two rows per paired non-antenatal sample

# Difference in proportions with its Wald interval for the non-antenatal subset.
diff, lower_ci, upper_ci = wald(A, B, C, D, N)

In [None]:
# Overall agreement perc (non-antenatal): concordant pairs over all pairs.
agreeing_pairs = A + D
total_pairs = A + B + C + D
agreeing_pairs / total_pairs

In [None]:
# Stack the three culture tables (all samples, antenatal, non-antenatal) in
# that order and write them to a single CSV.
# NOTE(review): the stacked rows carry no label saying which table they came
# from; consider passing `keys=` to pd.concat if consumers need to tell them apart.
stacked_culture_tables = pd.concat([table_1, table_2, table_3])
stacked_culture_tables.to_csv("tables/cultures_split_by_antenatal.csv")