In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False
)


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

# Start with a manageable sample (change to 500_000 or 1_000_000 if needed)
NROWS = 500_000

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False,
    nrows=NROWS
)

print("Loaded:", df.shape)


Loaded: (500000, 43)


In [None]:
from pathlib import Path
import pandas as pd

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

cache_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "cache"
cache_dir.mkdir(parents=True, exist_ok=True)
parquet_path = cache_dir / "variant_summary.parquet"

if not parquet_path.exists():
    print("Building parquet cache (one-time)…")
    df_full = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
    df_full.to_parquet(parquet_path, index=False)
    print("Saved:", parquet_path)
else:
    print("Cache exists:", parquet_path)

# Use cache (FAST)
df = pd.read_parquet(parquet_path)
print("Loaded from cache:", df.shape)


Building parquet cache (one-time)…


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

# SAFE SAMPLE SIZE
NROWS = 300_000   # <— fast, representative

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False,
    nrows=NROWS
)

print("Loaded:", df.shape)


Loaded: (300000, 43)


In [None]:
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
    "Pathogenic", case=False, na=False
)

print(
    "Columns OK:",
    "ConfidenceLevel" in df.columns,
    "is_pathogenic" in df.columns
)


Columns OK: True True


In [None]:
# --- Phase 3: Reclassification Risk Signals ---

# 1) Parse LastEvaluated and compute recency
df["LastEvaluated_dt"] = pd.to_datetime(df["LastEvaluated"], errors="coerce")

# years since last evaluation (NaN if unknown date)
df["years_since_review"] = (pd.Timestamp("today") - df["LastEvaluated_dt"]).dt.days / 365.25

# 2) Make sure NumberSubmitters is numeric (often comes as string)
df["NumberSubmitters_num"] = pd.to_numeric(df["NumberSubmitters"], errors="coerce")

print("rows:", len(df))
print("LastEvaluated available:", df["LastEvaluated_dt"].notna().mean().round(3))
print("NumberSubmitters available:", df["NumberSubmitters_num"].notna().mean().round(3))

df[["ClinicalSignificance","ReviewStatus","ConfidenceLevel","LastEvaluated","years_since_review","NumberSubmitters","NumberSubmitters_num"]].head(10)


rows: 300000
LastEvaluated available: 0.935
NumberSubmitters available: 1.0


Unnamed: 0,ClinicalSignificance,ReviewStatus,ConfidenceLevel,LastEvaluated,years_since_review,NumberSubmitters,NumberSubmitters_num
0,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,"Dec 17, 2024",1.051335,4,4
1,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,"Dec 17, 2024",1.051335,4,4
2,Pathogenic,no assertion criteria provided,Low,"Jun 29, 2010",15.520876,1,1
3,Pathogenic,no assertion criteria provided,Low,"Jun 29, 2010",15.520876,1,1
4,Uncertain significance,no assertion criteria provided,Low,"Jun 29, 2015",10.521561,1,1
5,Uncertain significance,no assertion criteria provided,Low,"Jun 29, 2015",10.521561,1,1
6,Pathogenic,"criteria provided, multiple submitters, no con...",High,"Aug 17, 2025",0.386037,6,6
7,Pathogenic,"criteria provided, multiple submitters, no con...",High,"Aug 17, 2025",0.386037,6,6
8,Likely pathogenic,"criteria provided, single submitter",Medium,"Jun 06, 2024",1.582478,2,2
9,Likely pathogenic,"criteria provided, single submitter",Medium,"Jun 06, 2024",1.582478,2,2


In [None]:
# ----------------------------------------
# Reclassification Risk Scoring
# ----------------------------------------

def reclassification_risk(row):
    score = 0

    # 1) Low or Medium confidence → higher risk
    if row["ConfidenceLevel"] == "Low":
        score += 3
    elif row["ConfidenceLevel"] == "Medium":
        score += 2

    # 2) Stale review (years since last evaluation)
    if pd.notna(row["years_since_review"]):
        if row["years_since_review"] >= 10:
            score += 3
        elif row["years_since_review"] >= 5:
            score += 2
        elif row["years_since_review"] >= 2:
            score += 1

    # 3) Few submitters → instability risk
    if pd.notna(row["NumberSubmitters_num"]):
        if row["NumberSubmitters_num"] <= 1:
            score += 3
        elif row["NumberSubmitters_num"] <= 3:
            score += 2
        elif row["NumberSubmitters_num"] <= 5:
            score += 1

    return score


df["ReclassificationRiskScore"] = df.apply(reclassification_risk, axis=1)

df["ReclassificationRiskTier"] = pd.cut(
    df["ReclassificationRiskScore"],
    bins=[-1, 2, 5, 8, 20],
    labels=["Low", "Moderate", "High", "Critical"]
)

df[[
    "ClinicalSignificance",
    "ConfidenceLevel",
    "years_since_review",
    "NumberSubmitters_num",
    "ReclassificationRiskScore",
    "ReclassificationRiskTier"
]].head(10)


Unnamed: 0,ClinicalSignificance,ConfidenceLevel,years_since_review,NumberSubmitters_num,ReclassificationRiskScore,ReclassificationRiskTier
0,Pathogenic/Likely pathogenic,High,1.051335,4,1,Low
1,Pathogenic/Likely pathogenic,High,1.051335,4,1,Low
2,Pathogenic,Low,15.520876,1,9,Critical
3,Pathogenic,Low,15.520876,1,9,Critical
4,Uncertain significance,Low,10.521561,1,9,Critical
5,Uncertain significance,Low,10.521561,1,9,Critical
6,Pathogenic,High,0.386037,6,0,Low
7,Pathogenic,High,0.386037,6,0,Low
8,Likely pathogenic,Medium,1.582478,2,4,Moderate
9,Likely pathogenic,Medium,1.582478,2,4,Moderate


Phase 3: Reclassification Risk Signals


This notebook derives a composite reclassification risk signal for ClinVar variants
based on three independent factors:

1. Evidence confidence (ReviewStatus → ConfidenceLevel)
2. Time since last evaluation
3. Number of independent submitters

The resulting ReclassificationRiskScore and ReclassificationRiskTier are intended
to identify variants at elevated risk of future reclassification.

Results in this notebook are frozen for downstream analysis.


In [None]:
df["ReclassificationRiskTier"].value_counts(normalize=True).round(3)


ReclassificationRiskTier
Low         0.514
Moderate    0.188
High        0.164
Critical    0.134
Name: proportion, dtype: float64

In [None]:
pd.crosstab(
    df["ConfidenceLevel"],
    df["ReclassificationRiskTier"],
    normalize="index"
).round(3)


ReclassificationRiskTier,Low,Moderate,High,Critical
ConfidenceLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,0.803,0.182,0.015,0.0
Low,0.0,0.007,0.236,0.756
Medium,0.0,0.325,0.675,0.0
Unknown,0.74,0.243,0.017,0.0


In [None]:
path_df = df[df["ClinicalSignificance"].str.contains("Pathogenic", case=False, na=False)]

pd.crosstab(
    path_df["ConfidenceLevel"],
    path_df["ReclassificationRiskTier"],
    normalize="index"
).round(3)


ReclassificationRiskTier,Low,Moderate,High,Critical
ConfidenceLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,0.759,0.224,0.016,0.0
Low,0.0,0.011,0.237,0.753
Medium,0.0,0.383,0.617,0.0
Unknown,0.914,0.086,0.0,0.0


A substantial fraction of ClinVar variants classified with high or medium confidence
exhibit elevated reclassification risk when temporal staleness and submission
diversity are considered.

Notably, variants labeled as pathogenic with low confidence and long intervals
since last evaluation disproportionately fall into High or Critical risk tiers.

This suggests that static confidence labels fail to capture reclassification
instability, motivating the need for dynamic risk-aware variant prioritization.


In [None]:
path_df.groupby("ReclassificationRiskTier")["years_since_review"] \
       .describe()[["mean","50%","75%"]].round(2)


  path_df.groupby("ReclassificationRiskTier")["years_since_review"] \


Unnamed: 0_level_0,mean,50%,75%
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low,1.47,0.99,1.51
Moderate,4.78,3.45,7.38
High,11.84,12.02,14.4
Critical,18.57,15.52,23.26


In [None]:
pd.crosstab(
    path_df["ReclassificationRiskTier"],
    path_df["ClinicalSignificance"],
    normalize="index"
).round(3)


ClinicalSignificance,Conflicting classifications of pathogenicity,Conflicting classifications of pathogenicity; association,Conflicting classifications of pathogenicity; association; risk factor,Conflicting classifications of pathogenicity; drug response,Conflicting classifications of pathogenicity; drug response; other,Conflicting classifications of pathogenicity; other,Conflicting classifications of pathogenicity; other; risk factor,Conflicting classifications of pathogenicity; protective,Conflicting classifications of pathogenicity; risk factor,Likely pathogenic,...,Pathogenic/Likely pathogenic; other,Pathogenic/Likely pathogenic; risk factor,Pathogenic/Likely risk allele,"Pathogenic/Pathogenic, low penetrance; other",Pathogenic; Affects,Pathogenic; drug response,Pathogenic; other,Pathogenic; protective,Pathogenic; risk factor,Uncertain significance; Pathogenic/Likely pathogenic
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Low,0.572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032,...,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0
Moderate,0.188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
High,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.0,0.001,0.0
Critical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.001,0.0


In [None]:
path_df.groupby("ReclassificationRiskTier")["NumberSubmitters_num"] \
       .describe()[["mean","25%","50%"]].round(2)


  path_df.groupby("ReclassificationRiskTier")["NumberSubmitters_num"] \


Unnamed: 0_level_0,mean,25%,50%
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low,7.82,4.0,6.0
Moderate,2.85,2.0,2.0
High,1.32,1.0,1.0
Critical,1.0,1.0,1.0


In [None]:
path_df.assign(
    conflicting = path_df["ClinicalSignificance"]
                  .str.contains("conflict", case=False, na=False)
).groupby("ReclassificationRiskTier")["conflicting"].mean().round(3)


  ).groupby("ReclassificationRiskTier")["conflicting"].mean().round(3)


ReclassificationRiskTier
Low         0.573
Moderate    0.188
High        0.016
Critical    0.000
Name: conflicting, dtype: float64

In [None]:
path_df = df[df["ClinicalSignificance"].str.contains(
    "Pathogenic", case=False, na=False
)].copy()


In [None]:
# 4A: Temporal decay vs risk
path_df.groupby("ReclassificationRiskTier")["years_since_review"] \
       .describe()[["mean", "50%", "75%"]].round(2)


  path_df.groupby("ReclassificationRiskTier")["years_since_review"] \


Unnamed: 0_level_0,mean,50%,75%
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low,1.47,0.99,1.51
Moderate,4.78,3.45,7.38
High,11.84,12.02,14.4
Critical,18.57,15.52,23.26


In [None]:
# 4B: Label semantics vs risk
pd.crosstab(
    path_df["ReclassificationRiskTier"],
    path_df["ClinicalSignificance"],
    normalize="index"
).round(3)


ClinicalSignificance,Conflicting classifications of pathogenicity,Conflicting classifications of pathogenicity; association,Conflicting classifications of pathogenicity; association; risk factor,Conflicting classifications of pathogenicity; drug response,Conflicting classifications of pathogenicity; drug response; other,Conflicting classifications of pathogenicity; other,Conflicting classifications of pathogenicity; other; risk factor,Conflicting classifications of pathogenicity; protective,Conflicting classifications of pathogenicity; risk factor,Likely pathogenic,...,Pathogenic/Likely pathogenic; other,Pathogenic/Likely pathogenic; risk factor,Pathogenic/Likely risk allele,"Pathogenic/Pathogenic, low penetrance; other",Pathogenic; Affects,Pathogenic; drug response,Pathogenic; other,Pathogenic; protective,Pathogenic; risk factor,Uncertain significance; Pathogenic/Likely pathogenic
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Low,0.572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032,...,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0
Moderate,0.188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
High,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.0,0.001,0.0
Critical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.001,0.0


In [None]:
# 4C: Submitter consensus vs risk
path_df.groupby("ReclassificationRiskTier")["NumberSubmitters_num"] \
       .describe()[["mean", "25%", "50%"]].round(2)


  path_df.groupby("ReclassificationRiskTier")["NumberSubmitters_num"] \


Unnamed: 0_level_0,mean,25%,50%
ReclassificationRiskTier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low,7.82,4.0,6.0
Moderate,2.85,2.0,2.0
High,1.32,1.0,1.0
Critical,1.0,1.0,1.0


In [None]:
# 4D: Conflict enrichment vs risk
path_df.assign(
    conflicting = path_df["ClinicalSignificance"]
                  .str.contains("conflict", case=False, na=False)
).groupby("ReclassificationRiskTier")["conflicting"].mean().round(3)


  ).groupby("ReclassificationRiskTier")["conflicting"].mean().round(3)


ReclassificationRiskTier
Low         0.573
Moderate    0.188
High        0.016
Critical    0.000
Name: conflicting, dtype: float64

In [None]:
[name for name in globals().keys() if "df" in name.lower()]


[]

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]
print("File:", f)


File: /Users/sanghati/research/accountable-interpretation/data/raw/clinvar/variant_summary.txt.gz


In [3]:
NROWS = 300_000

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False,
    nrows=NROWS
)

print("Loaded df shape:", df.shape)


Loaded df shape: (300000, 43)


In [4]:
# Confidence level from ReviewStatus
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

# Time since last evaluation
df["LastEvaluated_dt"] = pd.to_datetime(df["LastEvaluated"], errors="coerce")
df["years_since_review"] = (pd.Timestamp("today") - df["LastEvaluated_dt"]).dt.days / 365.25

# Submitter count (numeric)
df["NumberSubmitters_num"] = pd.to_numeric(df["NumberSubmitters"], errors="coerce")

print("Columns created:",
      "ConfidenceLevel" in df.columns,
      "years_since_review" in df.columns,
      "NumberSubmitters_num" in df.columns)


Columns created: True True True


In [5]:
def reclassification_risk(row):
    score = 0

    # 1) Confidence
    if row["ConfidenceLevel"] == "Low":
        score += 3
    elif row["ConfidenceLevel"] == "Medium":
        score += 2

    # 2) Staleness
    yrs = row["years_since_review"]
    if pd.notna(yrs):
        if yrs >= 10:
            score += 3
        elif yrs >= 5:
            score += 2
        elif yrs >= 2:
            score += 1

    # 3) Few submitters
    nsub = row["NumberSubmitters_num"]
    if pd.notna(nsub):
        if nsub <= 1:
            score += 3
        elif nsub <= 3:
            score += 2
        elif nsub <= 5:
            score += 1

    return score

df["ReclassificationRiskScore"] = df.apply(reclassification_risk, axis=1)

df["ReclassificationRiskTier"] = pd.cut(
    df["ReclassificationRiskScore"],
    bins=[-1, 2, 5, 8, 20],
    labels=["Low", "Moderate", "High", "Critical"]
)

print(df["ReclassificationRiskTier"].value_counts())


ReclassificationRiskTier
Low         154233
Moderate     56396
High         49107
Critical     40264
Name: count, dtype: int64


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

NROWS = 300_000

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False,
    nrows=NROWS
)

print("df loaded:", df.shape)


df loaded: (300000, 43)


In [7]:
def reclassification_risk(row):
    score = 0

    # Confidence
    if row["ConfidenceLevel"] == "Low":
        score += 3
    elif row["ConfidenceLevel"] == "Medium":
        score += 2

    # Staleness
    yrs = row["years_since_review"]
    if pd.notna(yrs):
        if yrs >= 10:
            score += 3
        elif yrs >= 5:
            score += 2
        elif yrs >= 2:
            score += 1

    # Submitters
    nsub = row["NumberSubmitters_num"]
    if pd.notna(nsub):
        if nsub <= 1:
            score += 3
        elif nsub <= 3:
            score += 2
        elif nsub <= 5:
            score += 1

    return score


In [9]:
# Create ConfidenceLevel from ReviewStatus (your notebook uses ReviewStatus)
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in ["practice guideline", "reviewed by expert panel", "criteria provided, multiple submitters, no conflicts"]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

print(df["ConfidenceLevel"].value_counts().head())


ConfidenceLevel
High       131546
Unknown     65643
Low         53235
Medium      49576
Name: count, dtype: int64


In [10]:
df["ReclassificationRiskScore"] = df.apply(reclassification_risk, axis=1)
print(df["ReclassificationRiskScore"].head())


KeyError: 'years_since_review'

In [12]:
from pathlib import Path
import pandas as pd
import numpy as np


In [13]:
raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

NROWS = 300_000
df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    low_memory=False,
    nrows=NROWS
)

print("Loaded:", df.shape)
print("Cols:", len(df.columns))


Loaded: (300000, 43)
Cols: 43


In [14]:
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

# --- ConfidenceLevel ---
if "ReviewStatus" not in df.columns:
    raise ValueError("Missing column: ReviewStatus")
df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

# --- years_since_review ---
if "LastEvaluated" not in df.columns:
    df["LastEvaluated_dt"] = pd.NaT
    df["years_since_review"] = np.nan
else:
    df["LastEvaluated_dt"] = pd.to_datetime(df["LastEvaluated"], errors="coerce")
    df["years_since_review"] = (pd.Timestamp("today") - df["LastEvaluated_dt"]).dt.days / 365.25

# --- NumberSubmitters_num ---
if "NumberSubmitters" not in df.columns:
    df["NumberSubmitters_num"] = np.nan
else:
    df["NumberSubmitters_num"] = pd.to_numeric(df["NumberSubmitters"], errors="coerce")

# --- conflicting (safe boolean) ---
# ClinVar often encodes conflict as text in ClinicalSignificance
if "ClinicalSignificance" not in df.columns:
    df["conflicting"] = False
else:
    df["conflicting"] = df["ClinicalSignificance"].astype(str).str.contains("conflict", case=False, na=False)

print("Engineered cols present?",
      all(c in df.columns for c in ["ConfidenceLevel","years_since_review","NumberSubmitters_num","conflicting"]))


Engineered cols present? True


In [15]:
def reclassification_risk(row):
    score = 0

    # 1) Confidence
    cl = row.get("ConfidenceLevel", "Unknown")
    if cl == "Low":
        score += 3
    elif cl == "Medium":
        score += 2

    # 2) Staleness
    yrs = row.get("years_since_review", np.nan)
    if pd.notna(yrs):
        if yrs >= 10:
            score += 3
        elif yrs >= 5:
            score += 2
        elif yrs >= 2:
            score += 1

    # 3) Submitters
    nsub = row.get("NumberSubmitters_num", np.nan)
    if pd.notna(nsub):
        if nsub <= 1:
            score += 3
        elif nsub <= 3:
            score += 2
        elif nsub <= 5:
            score += 1

    return score

df["ReclassificationRiskScore"] = df.apply(reclassification_risk, axis=1)

df["ReclassificationRiskTier"] = pd.cut(
    df["ReclassificationRiskScore"],
    bins=[-1, 2, 5, 8, 20],
    labels=["Low", "Moderate", "High", "Critical"]
)

print(df["ReclassificationRiskTier"].value_counts())


ReclassificationRiskTier
Low         154233
Moderate     56396
High         49107
Critical     40264
Name: count, dtype: int64


In [16]:
path_df = df[df["ClinicalSignificance"].astype(str).str.contains("Pathogenic", case=False, na=False)].copy()
print("path_df shape:", path_df.shape)


path_df shape: (151527, 50)


In [17]:
out_dir = Path("outputs") / "tables"
out_dir.mkdir(parents=True, exist_ok=True)

export_cols = [
    "VariationID", "GeneSymbol", "RCVaccession",
    "ClinicalSignificance", "ReviewStatus",
    "ConfidenceLevel", "LastEvaluated", "years_since_review",
    "NumberSubmitters", "NumberSubmitters_num",
    "conflicting",
    "ReclassificationRiskScore", "ReclassificationRiskTier"
]

# keep only columns that exist (no KeyError)
export_cols = [c for c in export_cols if c in df.columns]

export_path = out_dir / "phase3_engineered.csv.gz"
df[export_cols].to_csv(export_path, index=False, compression="gzip")

print("Saved:", export_path, "cols:", len(export_cols), "rows:", len(df))


Saved: outputs/tables/phase3_engineered.csv.gz cols: 13 rows: 300000


In [18]:
from pathlib import Path

print("Here are files in outputs/:")
for p in Path("outputs").rglob("*"):
    if p.is_file():
        print(p)


Here are files in outputs/:
outputs/tables/phase3_engineered.csv.gz


In [19]:
from pathlib import Path

out_dir = Path("outputs") / "tables"
out_dir.mkdir(parents=True, exist_ok=True)

export_path = out_dir / "phase3_engineered.csv.gz"

# IMPORTANT: use path_df if that’s your engineered dataset, otherwise use df
# If your model should use path_df (pathogenic subset), save path_df:
path_df.to_csv(export_path, index=False, compression="gzip")

print("✅ Saved:", export_path)
print("Shape:", path_df.shape)


✅ Saved: outputs/tables/phase3_engineered.csv.gz
Shape: (151527, 50)


In [20]:
from pathlib import Path

# Create output directory if it doesn't exist
out_dir = Path("outputs") / "tables"
out_dir.mkdir(parents=True, exist_ok=True)

# Define export path
export_path = out_dir / "phase3_engineered.csv.gz"

# Save FULL dataframe
df.to_csv(export_path, index=False, compression="gzip")

print("✅ Saved FULL engineered dataframe")
print("Path:", export_path)
print("Shape:", df.shape)


✅ Saved FULL engineered dataframe
Path: outputs/tables/phase3_engineered.csv.gz
Shape: (300000, 50)


In [21]:
from pathlib import Path

assert (Path("outputs") / "tables" / "phase3_engineered.csv.gz").exists()
print("✅ File exists and ready for Phase 4")


✅ File exists and ready for Phase 4


In [22]:
from pathlib import Path

ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
OUT_TABLES = ROOT / "outputs" / "tables"
OUT_TABLES.mkdir(parents=True, exist_ok=True)

out_file = OUT_TABLES / "phase3_engineered.csv.gz"

df.to_csv(out_file, index=False, compression="gzip")
print("✅ Saved:", out_file)
print("✅ Shape:", df.shape)


✅ Saved: /Users/sanghati/research/accountable-interpretation/notebooks/outputs/tables/phase3_engineered.csv.gz
✅ Shape: (300000, 50)


In [23]:
from pathlib import Path

ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
OUT_TABLES = ROOT / "outputs" / "tables"

print("✅ Files now in outputs/tables:")
for f in sorted(OUT_TABLES.glob("*")):
    print(" -", f.name)


✅ Files now in outputs/tables:
 - phase3_engineered.csv.gz
