In [13]:
phdproj (Python 3.11.x)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1535391732.py, line 1)

In [None]:
import pandas as pd
from pathlib import Path

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

usecols = [
    "ClinicalSignificance",
    "ReviewStatus",
    "NumberSubmitters",
    "LastEvaluated",
]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    usecols=lambda c: c in usecols,
    nrows=500_000,
    low_memory=False,
)

df.shape


(500000, 4)

In [None]:
import pandas as pd
from pathlib import Path

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

usecols = [
    "ClinicalSignificance",
    "ReviewStatus",
    "ConfidenceLevel",
    "NumberSubmitters",
    "LastEvaluated",
]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    usecols=lambda c: c in usecols,
    nrows=500_000,
    low_memory=False,
)

df.shape


(500000, 4)

In [None]:
df["LastEvaluated"] = pd.to_datetime(df["LastEvaluated"], errors="coerce")
df["years_since_review"] = (
    pd.Timestamp("today") - df["LastEvaluated"]
).dt.days / 365.25

df.groupby("ConfidenceLevel")["years_since_review"].describe().round(2)


KeyError: 'ConfidenceLevel'

In [None]:
import pandas as pd
from pathlib import Path

# -----------------------------
# Load data (Phase 2 is standalone)
# -----------------------------
raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

usecols = [
    "ClinicalSignificance",
    "ReviewStatus",
    "LastEvaluated",
]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    usecols=lambda c: c in usecols,
    nrows=500_000,
    low_memory=False
)

# -----------------------------
# Rebuild ConfidenceLevel (explicit)
# -----------------------------
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

# -----------------------------
# Time since last evaluation
# -----------------------------
df["LastEvaluated"] = pd.to_datetime(df["LastEvaluated"], errors="coerce")

df["years_since_review"] = (
    (pd.Timestamp.today() - df["LastEvaluated"])
    .dt.days / 365.25
)

# -----------------------------
# Stability summary
# -----------------------------
df.groupby("ConfidenceLevel")["years_since_review"].describe().round(2)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ConfidenceLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
High,211956.0,2.88,3.11,0.03,0.92,1.22,3.81,21.84
Low,47609.0,15.65,6.25,0.05,12.27,14.48,16.47,61.01
Medium,125580.0,8.3,3.18,0.04,7.98,7.98,9.77,20.01
Unknown,93366.0,1.63,1.94,0.04,0.81,1.03,1.64,18.49


In [None]:
# -----------------------------
# Staleness flags + simple risk tiers
# -----------------------------
stale_cutoff_years = 5  # you can change to 3 or 7 later

df["is_stale"] = df["years_since_review"].ge(stale_cutoff_years) | df["years_since_review"].isna()

# Simple tiering (can refine later)
def risk_tier(row):
    conf = row["ConfidenceLevel"]
    yrs = row["years_since_review"]

    if pd.isna(yrs):
        return "Tier 3 (Unknown date)"
    if conf == "Low" and yrs >= stale_cutoff_years:
        return "Tier 3 (Low + Stale)"
    if conf in ["Medium", "Unknown"] and yrs >= stale_cutoff_years:
        return "Tier 2 (Med/Unk + Stale)"
    if conf == "High" and yrs >= stale_cutoff_years:
        return "Tier 1 (High but Stale)"
    return "Tier 0 (Recent)"

df["RiskTier"] = df.apply(risk_tier, axis=1)

# Distribution (how much falls into each tier)
tier_counts = df["RiskTier"].value_counts(dropna=False)
tier_props = (tier_counts / len(df)).round(4)

pd.DataFrame({"count": tier_counts, "proportion": tier_props})


Unnamed: 0_level_0,count,proportion
RiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1
Tier 0 (Recent),277826,0.5557
Tier 2 (Med/Unk + Stale),111688,0.2234
Tier 3 (Low + Stale),46743,0.0935
Tier 1 (High but Stale),42254,0.0845
Tier 3 (Unknown date),21489,0.043


In [None]:
pd.crosstab(df["ConfidenceLevel"], df["RiskTier"], normalize="index").round(3)


RiskTier,Tier 0 (Recent),Tier 1 (High but Stale),Tier 2 (Med/Unk + Stale),Tier 3 (Low + Stale),Tier 3 (Unknown date)
ConfidenceLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
High,0.799,0.199,0.0,0.0,0.002
Low,0.016,0.0,0.0,0.86,0.124
Medium,0.149,0.0,0.831,0.0,0.02
Unknown,0.839,0.0,0.049,0.0,0.112


In [None]:
# ----------------------------------------
# Pathogenic-only stability risk analysis
# ----------------------------------------
path_df = df[df["is_pathogenic"]].copy()

tier_counts_p = path_df["RiskTier"].value_counts(dropna=False)
tier_props_p = (tier_counts_p / len(path_df)).round(4)

pd.DataFrame({
    "count": tier_counts_p,
    "proportion": tier_props_p
})


KeyError: 'is_pathogenic'

In [None]:
# ----------------------------------------
# Rebuild derived columns (Phase 2 safe)
# ----------------------------------------

# 1) ConfidenceLevel (needed for RiskTier logic)
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

# 2) Pathogenic flag (REQUIRED)
df["is_pathogenic"] = (
    df["ClinicalSignificance"]
    .astype(str)
    .str.contains("Pathogenic", case=False, na=False)
)

# sanity check
print(
    "Columns exist:",
    "ConfidenceLevel" in df.columns,
    "is_pathogenic" in df.columns,
)


Columns exist: True True


In [None]:
# ----------------------------------------
# Pathogenic-only stability risk analysis
# ----------------------------------------
path_df = df[df["is_pathogenic"]].copy()

tier_counts_p = path_df["RiskTier"].value_counts(dropna=False)
tier_props_p = (tier_counts_p / len(path_df)).round(4)

pd.DataFrame({
    "count": tier_counts_p,
    "proportion": tier_props_p
})


Unnamed: 0_level_0,count,proportion
RiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1
Tier 0 (Recent),139322,0.6796
Tier 3 (Low + Stale),25742,0.1256
Tier 2 (Med/Unk + Stale),21665,0.1057
Tier 1 (High but Stale),14935,0.0729
Tier 3 (Unknown date),3345,0.0163


In [None]:
pd.crosstab(
    path_df["ConfidenceLevel"],
    path_df["RiskTier"],
    normalize="index"
).round(3)


RiskTier,Tier 0 (Recent),Tier 1 (High but Stale),Tier 2 (Med/Unk + Stale),Tier 3 (Low + Stale),Tier 3 (Unknown date)
ConfidenceLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
High,0.735,0.265,0.0,0.0,0.0
Low,0.023,0.0,0.0,0.888,0.089
Medium,0.332,0.0,0.641,0.0,0.027
Unknown,0.956,0.0,0.044,0.0,0.0


## Phase 2: Clinical Stability Risk of ClinVar Assertions


### What is being measured?
This analysis evaluates the temporal stability of ClinVar variant assertions by combining:

- **ConfidenceLevel** (review rigor)
- **Time since last evaluation**
- **Clinical significance (Pathogenic vs others)**

Each variant is assigned a **RiskTier**, where higher tiers indicate
greater clinical risk due to stale review or low confidence.

---

### RiskTier Definitions
- **Tier 0 (Recent):** Reviewed recently (low risk)
- **Tier 1 (High but stale):** High confidence but outdated
- **Tier 2 (Medium/Unknown + stale):** Moderate interpretability risk
- **Tier 3 (Low confidence + stale):** Highest risk
- **Tier 3 (Unknown date):** Unverifiable review stability


## Key Findings: Stability Risk

### Overall observations
- A majority of variants fall into **Tier 0 (recently reviewed)**.
- However, a substantial fraction of variants occupy **stale tiers**,
  including high-confidence but outdated assertions.

### Confidence Ã— Stability interaction
- **High-confidence variants** are mostly recent, but a non-trivial
  portion are **stale**, indicating delayed re-evaluation.
- **Low-confidence variants** disproportionately fall into **Tier 3**,
  representing the **highest clinical interpretability risk**.

### Pathogenic-only risk
Among pathogenic variants:
- A significant share are **stale or low-confidence**
- These represent variants most likely to cause **clinical misinterpretation**
  if relied upon without revalidation.
