**Data Description**
- Dataset: Mayo Clinic Primary Biliary Cirrhosis (PBC) dataset (pbc2)

- Observations: 1945 rows, 312 patients

- Key Variables:

    - Survival info: years, status, status2

    - Demographics: age, sex

    - Treatment: drug

    - Clinical / biomarkers: serBilir, serChol, albumin, alkaline, SGOT, platelets, prothrombin

    - Other features: ascites, hepatomegaly, spiders, edema, histologic

- Table: Summary statistics of numeric variables (mean ± SD, median, range).

- Figures

    - Histogram / density plots for age, serBilir, albumin

    - Bar plots for categorical variables (sex, drug, edema)

    - Correlation heatmap for numeric covariates

**Exploratory Analysis**

- Kaplan–Meier survival curves: stratified by drug, sex, edema.

- Log-rank tests: check for significant differences between groups.

- Observations / Patterns: comment on trends in survival across covariates.

- Figures:

    - KM curves with confidence bands

    - Boxplots / violin plots of biomarkers by survival status


In [None]:
# ============================================================
#   PBC Dataset - Data Description & Preprocessing
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# ------------------------------------------------------------
# 3.1 DATASET OVERVIEW
# ------------------------------------------------------------

# Raw long-format PBC file (edit the path to match your checkout).
# NOTE(review): this path starts with "Project/" while the files written
# later in the notebook use paths relative to the project root -- confirm
# which working directory the notebook is meant to run from.
raw_csv_path = "Project/data/raw/pbc2.csv"
df = pd.read_csv(raw_csv_path)

# df.shape is (n_rows, n_columns). The data description expects 1945 rows
# covering 312 patients; the second number printed here is the column
# count, not the patient count.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)

# Quick visual check of the first few rows.
display(df.head())


In [None]:
# ------------------------------------------------------------
# 3.2 VARIABLES
# ------------------------------------------------------------

# Column groups reused by the preprocessing, summary and plotting steps.
# "year" (in id_vars) orders each patient's rows when the baseline table is
# built; "years" (in survival_vars) is the follow-up time used in the
# survival analysis.
id_vars = ["id", "year"]
survival_vars = ["years", "status2"]
demographic_vars = ["age", "sex"]
treatment_vars = ["drug"]
biomarker_vars = [
    "serBilir",
    "albumin",
    "alkaline",
    "SGOT",
    "serChol",
    "prothrombin",
    "platelets",
]
clinical_vars = ["ascites", "hepatomegaly", "spiders", "edema"]

# Flat list of every column the analysis needs, identifiers first.
all_vars = [
    *id_vars,
    *survival_vars,
    *demographic_vars,
    *treatment_vars,
    *biomarker_vars,
    *clinical_vars,
]

# Keep only relevant columns (a KeyError here means the raw file lacks one).
df = df.loc[:, all_vars]

# ------------------------------------------------------------
# 3.3 DATA PREPROCESSING
# ------------------------------------------------------------

# Work on a copy so the column-filtered `df` is left untouched.
df_clean = df.copy()

# --- Numeric missing values (biomarkers) ---
# Each gap is filled with that column's median, computed over every row of
# the long-format table (i.e. before the baseline rows are selected).
for col in biomarker_vars:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# --- Categorical missing values ---
# Clinical signs are filled with the most frequent label of the column.
for col in clinical_vars:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# --- Standardize text ---
# sex and drug are not imputed above, so a missing value in either becomes
# the literal string "nan" here and is reported by the validation below.
text_cols = ["ascites", "hepatomegaly", "spiders", "edema", "sex", "drug"]
for col in text_cols:
    df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()

# Clean edema variants: the column is already a stripped, lower-cased string
# at this point, so only the "without" -> "no" wording needs harmonising.
df_clean["edema"] = (
    df_clean["edema"].str.replace("without", "no", regex=False).str.strip()
)

# --- Encoding categorical variables ---
binary_map = {"yes": 1, "no": 0}
sex_map = {"female": 0, "male": 1}
drug_map = {"d-penicil": 1, "placebo": 0}
edema_map = {"no edema": 0, "edema no diuretics": 1, "edema despite diuretics": 2}

# Which lookup table applies to which column.
encodings = {
    "ascites": binary_map,
    "hepatomegaly": binary_map,
    "spiders": binary_map,
    "edema": edema_map,
    "sex": sex_map,
    "drug": drug_map,
}

# Series.map turns any label missing from its lookup table into NaN, so the
# offending labels are collected per column to make a failure actionable.
unmapped = {}
for col, mapping in encodings.items():
    labels = df_clean[col]
    encoded = labels.map(mapping)
    not_encoded = encoded.isnull()
    if not_encoded.any():
        unmapped[col] = sorted(labels[not_encoded].unique())
    df_clean[col] = encoded

# Explicit check instead of a bare assert: it is not stripped under
# `python -O` and it names the labels that need a mapping entry.
if unmapped:
    raise ValueError(
        f"Categorical labels with no numeric encoding (column -> labels): {unmapped}"
    )

# Save cleaned dataset
df_clean.to_csv("data/processed/pbc_clean.csv", index=False)

# --- Create baseline dataset for patient-level analysis ---
# One row per patient: rows are ordered by "year" within each id and the
# first entry of every column is kept. GroupBy.first() skips missing values
# column by column, so this is the patient's first row only where that row
# has no NaN left in it.
df_baseline = df_clean.sort_values(["id", "year"]).groupby("id", as_index=False).first()

# Impute serChol missing at baseline and add missing indicator (disabled).
# NOTE: serChol is part of biomarker_vars and is already median-filled above,
# so as written this indicator would never fire; drop serChol from the
# biomarker imputation loop before enabling it.
# df_baseline["serChol_missing"] = df_baseline["serChol"].isnull().astype(int)
# df_baseline["serChol"] = df_baseline["serChol"].fillna(df_baseline["serChol"].median())

# --- Scaling numeric variables --- (disabled)
# scale_numeric = False

# if scale_numeric:
#     scaler = StandardScaler()
#     df_baseline[biomarker_vars] = scaler.fit_transform(df_baseline[biomarker_vars])


# ------------------------------------------------------------
# 3.4 SUMMARY STATISTICS (TABLE 1)
# ------------------------------------------------------------

# Numeric summary: follow-up time plus every biomarker, one row per variable
# with mean, SD, median, min and max rounded to two decimals.
numeric_vars = ["years", *biomarker_vars]
wanted_stats = ["mean", "std", "50%", "min", "max"]
described = df_baseline[numeric_vars].describe()
numeric_summary = (
    described.loc[wanted_stats]
    .rename(index={"50%": "median", "std": "SD"})
    .transpose()
    .round(2)
)

# The same table is written under two file names.
for table_path in (
    "results/tables/table1_numeric.csv",
    "results/tables/summary_statistics_numeric.csv",
):
    numeric_summary.to_csv(table_path)

print("\n=== Numeric Summary ===")
print(numeric_summary)

# Categorical summary (baseline): count and percentage of patients per level.
# Levels appear as the numeric codes assigned during encoding.
categorical_vars = ["sex", "drug", "ascites", "hepatomegaly", "spiders", "edema"]
cat_tables = {}
for cat_col in categorical_vars:
    level_counts = df_baseline[cat_col].value_counts()
    level_share = level_counts / level_counts.sum() * 100
    level_table = pd.DataFrame(
        {"Count": level_counts, "Percent": level_share.round(1)}
    )
    cat_tables[cat_col] = level_table
    level_table.to_csv(f"results/tables/table1_{cat_col}.csv")
    print(f"\n=== {cat_col} ===")
    print(level_table)
    
# ------------------------------------------------------------
# 3.5 FIGURES
# ------------------------------------------------------------

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# Histograms / density plots: one PNG per variable, histogram with a KDE
# overlay, drawn on an explicit axes handle.
for dist_var in ("serBilir", "albumin", "age"):
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.histplot(df_baseline[dist_var], bins=30, kde=True, ax=ax)
    ax.set_xlabel(dist_var)
    ax.set_ylabel("Frequency")
    ax.set_title(f"Distribution of {dist_var}")
    fig.tight_layout()
    fig.savefig(f"results/figures/dist_{dist_var}.png", dpi=300)
    plt.close(fig)

# Bar plots for categorical variables: patient counts per encoded level,
# levels ordered by their numeric code.
for cat_var in ("sex", "drug", "edema"):
    fig, ax = plt.subplots(figsize=(4, 4))
    level_counts = df_baseline[cat_var].value_counts().sort_index()
    level_counts.plot(kind="bar", ax=ax)
    # Keep the tick labels horizontal.
    plt.setp(ax.get_xticklabels(), rotation=0)
    ax.set_ylabel("Count")
    ax.set_title(f"{cat_var.capitalize()} Distribution")
    fig.tight_layout()
    fig.savefig(f"results/figures/bar_{cat_var}.png", dpi=300)
    plt.close(fig)

# Correlation heatmap (numeric): pairwise correlations of the columns in
# numeric_vars, annotated with the coefficient in each cell.
fig, ax = plt.subplots(figsize=(8, 6))
correlations = df_baseline[numeric_vars].corr()
sns.heatmap(
    correlations,
    cmap="coolwarm",
    square=True,
    annot=True,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title("Correlation of Numeric Variables")
fig.tight_layout()
fig.savefig("results/figures/correlation_heatmap.png", dpi=300)
plt.close(fig)

print("\nAll Section 3 processing, statistics, and figures generated successfully!")


In [None]:
# ============================================================
#   Exploratory Data Analysis (EDA)
#   Kaplan–Meier Curves, Log-Rank Tests, Biomarker Distributions
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

sns.set_style("whitegrid")

# ------------------------------------------------------------
# Load the cleaned long-format data (saved in Section 3) and rebuild
# the baseline table from it
# ------------------------------------------------------------

df = pd.read_csv("data/processed/pbc_clean.csv")

# Create baseline dataset (one row per patient): order each patient's rows by
# "year" and keep the first entry of every column. GroupBy.first() skips
# missing values column by column.
rows_in_visit_order = df.sort_values(["id", "year"])
df_baseline = rows_in_visit_order.groupby("id", as_index=False).first()

# Survival variables
T = df_baseline["years"]    # follow-up time
E = df_baseline["status2"]  # event indicator: 1 = event, 0 = censored
# NOTE(review): in the JM `pbc2` data, status2 == 1 appears to mark death only
# (transplanted patients are coded 0) -- confirm against the raw `status`
# column before describing the event as "death or transplant".

# ------------------------------------------------------------
# 4.1 KAPLAN–MEIER SURVIVAL ESTIMATES
# ------------------------------------------------------------

# A single fitter is refitted for every curve; each curve is drawn right
# after its fit, so reusing the object is safe.
kmf = KaplanMeierFitter()

# ========== Figure 4: Overall KM Curve ==========
fig, ax = plt.subplots(figsize=(6, 5))
kmf.fit(T, event_observed=E, label="Overall Survival")
kmf.plot(ci_show=True, ax=ax)
ax.set_xlabel("Time (years)")
ax.set_ylabel("Survival Probability")
ax.set_title("Kaplan–Meier Survival Curve (All Patients)")
fig.tight_layout()
fig.savefig("results/figures/km_overall.png", dpi=300)
plt.close(fig)

# ========== Figure 5: Stratified KM Curves ==========
# Stratifying column -> {encoded level: legend label}.
strata_vars = {
    "drug": {0: "Placebo", 1: "D-Penicillamine"},
    "sex": {0: "Female", 1: "Male"},
    "edema": {
        0: "No Edema",
        1: "Edema (No Diuretics)",
        2: "Edema (Despite Diuretics)",
    },
}

# One figure per stratifying column, one curve (with confidence band) per level.
for strat_col, level_labels in strata_vars.items():
    fig, ax = plt.subplots(figsize=(6, 5))

    for level_code, legend_label in level_labels.items():
        in_stratum = df_baseline[strat_col] == level_code
        kmf.fit(T[in_stratum], event_observed=E[in_stratum], label=legend_label)
        kmf.plot(ci_show=True, ax=ax)

    ax.set_xlabel("Time (years)")
    ax.set_ylabel("Survival Probability")
    ax.set_title(f"Kaplan–Meier Curves Stratified by {strat_col.capitalize()}")
    fig.tight_layout()
    fig.savefig(f"results/figures/km_by_{strat_col}.png", dpi=300)
    plt.close(fig)

# ------------------------------------------------------------
# 4.2 LOG-RANK TESTS
# ------------------------------------------------------------

logrank_results = []

# Each comparison: (variable name, label A, label B, mask A, mask B).
# Edema is tested as "no edema" versus "any edema", i.e. the two edema
# levels are pooled into one group for this test.
comparisons = [
    (
        "Drug", "Placebo", "D-Penicillamine",
        df_baseline["drug"] == 0, df_baseline["drug"] == 1,
    ),
    (
        "Sex", "Female", "Male",
        df_baseline["sex"] == 0, df_baseline["sex"] == 1,
    ),
    (
        "Edema", "No Edema", "Any Edema",
        df_baseline["edema"] == 0, df_baseline["edema"] > 0,
    ),
]

# Run the two-group log-rank test for every comparison and collect one
# result row per variable.
for variable, label_a, label_b, group0, group1 in comparisons:
    result = logrank_test(
        T[group0],
        T[group1],
        event_observed_A=E[group0],
        event_observed_B=E[group1],
    )
    logrank_results.append(
        {
            "Variable": variable,
            "Group A": label_a,
            "Group B": label_b,
            "Test Statistic": result.test_statistic,
            "p-value": result.p_value,
        }
    )

# Save log-rank results. Values are rounded to 4 decimals, so any p-value
# below 0.00005 is written as 0.0.
logrank_df = pd.DataFrame(logrank_results).round(4)
logrank_df.to_csv("results/tables/logrank_tests.csv", index=False)

print("\n=== Log-Rank Test Results ===")
print(logrank_df)

# ------------------------------------------------------------
# 4.3 ADDITIONAL VISUALIZATIONS
# ------------------------------------------------------------

biomarkers = [
    "serBilir",
    "albumin",
    "alkaline",
    "SGOT",
    "serChol",
    "platelets",
    "prothrombin",
]

# Create survival status label: rows with status2 == 1 are "Event", every
# other row is "Censored". This adds a column to df_baseline.
had_event = df_baseline["status2"] == 1
df_baseline["Survival Status"] = np.where(had_event, "Event", "Censored")

# ========== Figure 6: Biomarker Distributions by Status ==========
# One violin plot per biomarker, split by the status label; cut=0 keeps the
# density within the observed data range.
for marker in biomarkers:
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.violinplot(
        data=df_baseline,
        x="Survival Status",
        y=marker,
        inner="box",
        cut=0,
        ax=ax,
    )
    ax.set_title(f"{marker} by Survival Status")
    fig.tight_layout()
    fig.savefig(f"results/figures/violin_{marker}_by_status.png", dpi=300)
    plt.close(fig)

print("\nSection 4 EDA completed successfully!")