In [132]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]
f


PosixPath('/Users/sanghati/research/accountable-interpretation/data/raw/clinvar/variant_summary.txt.gz')

In [133]:
df_sample = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    nrows=100_000,
    low_memory=False
)

df_sample.shape
df_sample.columns.tolist()


['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'PositionVCF',
 'ReferenceAlleleVCF',
 'AlternateAlleleVCF',
 'SomaticClinicalImpact',
 'SomaticClinicalImpactLastEvaluated',
 'ReviewStatusClinicalImpact',
 'Oncogenicity',
 'OncogenicityLastEvaluated',
 'ReviewStatusOncogenicity',
 'SCVsForAggregateGermlineClassification',
 'SCVsForAggregateSomaticClinicalImpact',
 'SCVsForAggregateOncogenicityClassification']

In [134]:
# Phase 2 — Define Confidence Levels
confidence_map = {
    "reviewed by expert panel": "High",
    "practice guideline": "High",
    "criteria provided, multiple submitters, no conflicts": "High",
    "criteria provided, single submitter": "Medium",
    "criteria provided, conflicting classifications": "Low",
    "no assertion criteria provided": "Low",
    "no classification provided": "Low"
}

df_sample["ConfidenceLevel"] = (
    df_sample["ReviewStatus"]
    .map(confidence_map)
    .fillna("Unknown")
)

df_sample["ConfidenceLevel"].value_counts()


ConfidenceLevel
High       41815
Low        38895
Medium     18909
Unknown      381
Name: count, dtype: int64

In [135]:
# Focus on pathogenic calls
pathogenic_mask = df_sample["ClinicalSignificance"].str.contains(
    "Pathogenic", na=False
)

df_pathogenic = df_sample[pathogenic_mask]

df_pathogenic["ConfidenceLevel"].value_counts(normalize=True).round(3)


ConfidenceLevel
High      0.508
Low       0.320
Medium    0.172
Name: proportion, dtype: float64

In [136]:
# Pathogenic but NOT high confidence
mismatch_rate = (
    df_pathogenic["ConfidenceLevel"]
    .isin(["Low", "Medium"])
    .mean()
)

mismatch_rate


np.float64(0.4918593798610391)

In [137]:
pd.crosstab(
    df_pathogenic["ClinicalSignificance"],
    df_pathogenic["ConfidenceLevel"],
    normalize="index"
).round(3)


ConfidenceLevel,High,Low,Medium
ClinicalSignificance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pathogenic,0.419,0.374,0.207
"Pathogenic, low penetrance",0.0,0.0,1.0
Pathogenic/Likely pathogenic,0.97,0.029,0.0
Pathogenic/Likely pathogenic/Likely risk allele,1.0,0.0,0.0
"Pathogenic/Likely pathogenic/Pathogenic, low penetrance",1.0,0.0,0.0
"Pathogenic/Likely pathogenic/Pathogenic, low penetrance; risk factor",1.0,0.0,0.0
Pathogenic/Likely pathogenic; other,0.0,1.0,0.0
Pathogenic/Likely pathogenic; risk factor,1.0,0.0,0.0
Pathogenic/Likely risk allele,0.667,0.333,0.0
"Pathogenic/Pathogenic, low penetrance; other",1.0,0.0,0.0


## Key Finding: Confidence–Label Mismatch


In [138]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]
f


PosixPath('/Users/sanghati/research/accountable-interpretation/data/raw/clinvar/variant_summary.txt.gz')

## Key Finding: Confidence–Label Mismatch


In [139]:
df_sample.shape


(100000, 44)

In [140]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]
f


PosixPath('/Users/sanghati/research/accountable-interpretation/data/raw/clinvar/variant_summary.txt.gz')

In [141]:
usecols = [
    "ClinicalSignificance",
    "ReviewStatus",
    "NumberSubmitters",
    "Guidelines",
    "LastEvaluated",
    "GeneSymbol",
    "Type",
]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    usecols=lambda c: c in usecols,
    low_memory=False,
    nrows=500_000,  # start with 500k (fast). Later we can go full.
)

df.shape


(500000, 7)

In [142]:
summary = pd.Series({
    "rows_loaded": len(df),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows": int(df["is_mismatch"].sum()),
    "mismatch_rate_among_pathogenic": float(df.loc[df["is_pathogenic"], "is_mismatch"].mean()),
})
summary


KeyError: 'is_pathogenic'

In [None]:
# --- rebuild derived columns safely (run after df is created) ---

def confidence_from_reviewstatus(rs: str) -> str:
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()

    high = [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]
    medium = ["criteria provided, single submitter"]
    low = ["no assertion criteria provided"]

    if rs in [s.lower() for s in high]:
        return "High"
    if rs in [s.lower() for s in medium]:
        return "Medium"
    if rs in [s.lower() for s in low]:
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains("Pathogenic", case=False, na=False)
df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

df[["ClinicalSignificance","ReviewStatus","ConfidenceLevel","is_pathogenic","is_mismatch"]].head()


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    nrows=100_000,
    low_memory=False
)

df.shape


(100000, 43)

In [None]:
def confidence_from_reviewstatus(rs: str) -> str:
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()

    if rs == "criteria provided, multiple submitters, no conflicts":
        return "High"
    if rs == "reviewed by expert panel":
        return "High"
    if rs == "practice guideline":
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
    "Pathogenic", case=False, na=False
)

df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

df[["ClinicalSignificance","ReviewStatus","ConfidenceLevel","is_pathogenic","is_mismatch"]].head()


Unnamed: 0,ClinicalSignificance,ReviewStatus,ConfidenceLevel,is_pathogenic,is_mismatch
0,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
1,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
2,Pathogenic,no assertion criteria provided,Low,True,True
3,Pathogenic,no assertion criteria provided,Low,True,True
4,Uncertain significance,no assertion criteria provided,Low,False,False


In [None]:
("is_pathogenic" in df.columns,
 "is_mismatch" in df.columns,
 "ConfidenceLevel" in df.columns)


(True, True, True)

In [None]:
summary = pd.DataFrame([{
    "rows_loaded": len(df),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows (pathogenic & not high confidence)": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        df["is_mismatch"].sum() / max(df["is_pathogenic"].sum(), 1), 4
    ),
    "mismatch_rate_overall": round(
        df["is_mismatch"].sum() / max(len(df), 1), 4
    ),
}])

summary


KeyError: 'is_pathogenic'

In [None]:
df = df_sample.copy()


In [None]:
df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
    "Pathogenic", case=False, na=False
)

df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")


In [None]:
df[["ClinicalSignificance","ConfidenceLevel","is_pathogenic","is_mismatch"]].head()


In [None]:
summary = pd.DataFrame([{
    "rows_loaded": len(df),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows (pathogenic & not high confidence)": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        df["is_mismatch"].sum() / max(df["is_pathogenic"].sum(), 1), 4
    ),
    "mismatch_rate_overall": round(
        df["is_mismatch"].sum() / max(len(df), 1), 4
    ),
}])

summary


KeyError: 'is_pathogenic'

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
f = sorted(raw_dir.glob("*variant_summary*"))[0]

df = pd.read_csv(
    f,
    sep="\t",
    compression="gzip",
    nrows=100_000,
    low_memory=False
)

df.shape


(100000, 43)

In [None]:
def confidence_from_reviewstatus(rs: str) -> str:
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()

    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)


In [None]:
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
    "Pathogenic", case=False, na=False
)

df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")


KeyError: 'ConfidenceLevel'

In [None]:
# --- Cell 3 (safe): create flags even if notebook state/order changed ---

# 1) Make sure df exists
if "df" not in globals():
    raise NameError("df is not defined. Run Cell 1 first (the pd.read_csv(...) cell).")

# 2) Make sure required columns exist
required = ["ClinicalSignificance", "ReviewStatus"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise KeyError(f"Missing columns in df: {missing}\nAvailable columns: {list(df.columns)[:20]} ...")

# 3) Ensure ConfidenceLevel exists (create it if Cell 2 wasn’t run)
if "ConfidenceLevel" not in df.columns:
    def confidence_from_reviewstatus(rs: str) -> str:
        if pd.isna(rs):
            return "Unknown"
        rs = str(rs).strip().lower()
        if rs in [
            "practice guideline",
            "reviewed by expert panel",
            "criteria provided, multiple submitters, no conflicts",
        ]:
            return "High"
        if rs == "criteria provided, single submitter":
            return "Medium"
        if rs == "no assertion criteria provided":
            return "Low"
        return "Unknown"

    df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

# 4) Now build the flags
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains("Pathogenic", case=False, na=False)
df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

df[["ClinicalSignificance","ReviewStatus","ConfidenceLevel","is_pathogenic","is_mismatch"]].head()


Unnamed: 0,ClinicalSignificance,ReviewStatus,ConfidenceLevel,is_pathogenic,is_mismatch
0,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
1,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
2,Pathogenic,no assertion criteria provided,Low,True,True
3,Pathogenic,no assertion criteria provided,Low,True,True
4,Uncertain significance,no assertion criteria provided,Low,False,False


In [None]:
# --- Summary metrics ---
summary = pd.DataFrame([{
    "rows_loaded": len(df),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows (pathogenic & not high confidence)": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        df["is_mismatch"].sum() / max(df["is_pathogenic"].sum(), 1), 4
    ),
    "mismatch_rate_overall": round(
        df["is_mismatch"].sum() / max(len(df), 1), 4
    ),
}])

summary


KeyError: 'is_pathogenic'

In [None]:
# --- Make sure we're using the same dataframe everywhere ---
# If you loaded as df_sample earlier, this ensures df points to it
try:
    df
except NameError:
    df = df_sample

# --- Rebuild derived columns (always safe to run) ---
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains("Pathogenic", case=False, na=False)
df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

# quick sanity check (should print True True True)
("is_pathogenic" in df.columns, "is_mismatch" in df.columns, "ConfidenceLevel" in df.columns)


In [None]:
# --- Make sure we're using the same dataframe everywhere ---
# If you loaded as df_sample earlier, this ensures df points to it
try:
    df
except NameError:
    df = df_sample

import pandas as pd

# --- Rebuild derived columns (always safe to run) ---
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()

    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

# build columns
df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
    "Pathogenic", case=False, na=False
)
df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

# --- FORCE VISIBLE OUTPUT ---
print("SANITY CHECK:")
print("Columns present:",
      ("is_pathogenic" in df.columns,
       "is_mismatch" in df.columns,
       "ConfidenceLevel" in df.columns))

print("\nData shape:", df.shape)

print("\nPreview:")
print(df[
    ["ClinicalSignificance", "ReviewStatus", "ConfidenceLevel",
     "is_pathogenic", "is_mismatch"]
].head(5))


SANITY CHECK:
Columns present: (True, True, True)

Data shape: (500000, 10)

Preview:
           ClinicalSignificance  \
0  Pathogenic/Likely pathogenic   
1  Pathogenic/Likely pathogenic   
2                    Pathogenic   
3                    Pathogenic   
4        Uncertain significance   

                                        ReviewStatus ConfidenceLevel  \
0  criteria provided, multiple submitters, no con...            High   
1  criteria provided, multiple submitters, no con...            High   
2                     no assertion criteria provided             Low   
3                     no assertion criteria provided             Low   
4                     no assertion criteria provided             Low   

   is_pathogenic  is_mismatch  
0           True        False  
1           True        False  
2           True         True  
3           True         True  
4          False        False  


In [None]:
# --- Summary metrics ---
summary = pd.DataFrame([{
    "rows_loaded": int(len(df)),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows_pathogenic_not_high": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        float(df["is_mismatch"].sum()) / max(float(df["is_pathogenic"].sum()), 1.0), 4
    ),
    "mismatch_rate_overall": round(
        float(df["is_mismatch"].sum()) / max(float(len(df)), 1.0), 4
    ),
}])

summary


KeyError: 'is_pathogenic'

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# 1) Ensure df exists (use df_sample if that's what you used)
if "df" not in globals():
    if "df_sample" in globals():
        df = df_sample
    else:
        # If nothing loaded yet, load a 100k sample again
        raw_dir = Path.home() / "research" / "accountable-interpretation" / "data" / "raw" / "clinvar"
        f = sorted(raw_dir.glob("*variant_summary*"))[0]
        df = pd.read_csv(f, sep="\t", compression="gzip", nrows=100_000, low_memory=False)

# 2) Confidence mapping
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

# 3) Rebuild derived columns (always safe)
df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)
df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains("Pathogenic", case=False, na=False)
df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

# 4) Confirm (should print True True True)
print("cols exist?:", 
      "is_pathogenic" in df.columns, 
      "is_mismatch" in df.columns, 
      "ConfidenceLevel" in df.columns)

df[["ClinicalSignificance","ReviewStatus","ConfidenceLevel","is_pathogenic","is_mismatch"]].head()


cols exist?: True True True


Unnamed: 0,ClinicalSignificance,ReviewStatus,ConfidenceLevel,is_pathogenic,is_mismatch
0,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
1,Pathogenic/Likely pathogenic,"criteria provided, multiple submitters, no con...",High,True,False
2,Pathogenic,no assertion criteria provided,Low,True,True
3,Pathogenic,no assertion criteria provided,Low,True,True
4,Uncertain significance,no assertion criteria provided,Low,False,False


In [None]:
summary = pd.DataFrame([{
    "rows_loaded": int(len(df)),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows_pathogenic_not_high": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        float(df["is_mismatch"].sum()) / max(float(df["is_pathogenic"].sum()), 1.0), 4
    ),
    "mismatch_rate_overall": round(
        float(df["is_mismatch"].sum()) / max(float(len(df)), 1.0), 4
    ),
}])

summary


KeyError: 'is_pathogenic'

In [143]:
# ---- SAFE SUMMARY (works even if df got overwritten) ----
import pandas as pd

# 1) Make sure df points to the real dataframe you want
# If you used df_sample earlier, this ties df to it.
if "df" not in globals() and "df_sample" in globals():
    df = df_sample

print("Using df id:", id(df))
print("Has is_pathogenic before rebuild?", "is_pathogenic" in df.columns)

# 2) Rebuild derived columns IF missing
def confidence_from_reviewstatus(rs):
    if pd.isna(rs):
        return "Unknown"
    rs = str(rs).strip().lower()
    if rs in [
        "practice guideline",
        "reviewed by expert panel",
        "criteria provided, multiple submitters, no conflicts",
    ]:
        return "High"
    if rs == "criteria provided, single submitter":
        return "Medium"
    if rs == "no assertion criteria provided":
        return "Low"
    return "Unknown"

if "ConfidenceLevel" not in df.columns:
    df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

if "is_pathogenic" not in df.columns:
    df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
        "Pathogenic", case=False, na=False
    )

if "is_mismatch" not in df.columns:
    df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")

print("Now has is_pathogenic?", "is_pathogenic" in df.columns)

# 3) Summary
summary = pd.DataFrame([{
    "rows_loaded": int(len(df)),
    "pathogenic_rows": int(df["is_pathogenic"].sum()),
    "mismatch_rows_pathogenic_not_high": int(df["is_mismatch"].sum()),
    "mismatch_rate_of_pathogenic": round(
        float(df["is_mismatch"].sum()) / max(float(df["is_pathogenic"].sum()), 1.0), 4
    ),
    "mismatch_rate_overall": round(
        float(df["is_mismatch"].sum()) / max(float(len(df)), 1.0), 4
    ),
}])

summary


Using df id: 5355500944
Has is_pathogenic before rebuild? False
Now has is_pathogenic? True


Unnamed: 0,rows_loaded,pathogenic_rows,mismatch_rows_pathogenic_not_high,mismatch_rate_of_pathogenic,mismatch_rate_overall
0,500000,205009,148648,0.7251,0.2973


In [144]:
# --- Phase 2A: Where mismatch comes from (ReviewStatus) ---

patho = df[df["is_pathogenic"]].copy()

review_summary = (
    patho.groupby(["ReviewStatus", "ConfidenceLevel"], dropna=False)
    .size()
    .reset_index(name="n")
    .sort_values("n", ascending=False)
)

# Mismatch drivers: pathogenic rows where confidence != High
mismatch_by_review = (
    patho[patho["is_mismatch"]]
    .groupby("ReviewStatus", dropna=False)
    .size()
    .sort_values(ascending=False)
    .head(15)
)

review_summary.head(20), mismatch_by_review


(                                        ReviewStatus ConfidenceLevel      n
 0     criteria provided, conflicting classifications         Unknown  92151
 1  criteria provided, multiple submitters, no con...            High  40257
 3                     no assertion criteria provided             Low  29002
 2                criteria provided, single submitter          Medium  27495
 5                           reviewed by expert panel            High  16056
 4                                 practice guideline            High     48,
 ReviewStatus
 criteria provided, conflicting classifications    92151
 no assertion criteria provided                    29002
 criteria provided, single submitter               27495
 dtype: int64)

In [145]:
# --- Phase 2B: Pathogenic confidence distribution ---
patho_conf = patho["ConfidenceLevel"].value_counts(dropna=False)
patho_conf_pct = (patho_conf / len(patho)).round(4)

patho_conf, patho_conf_pct


(ConfidenceLevel
 Unknown    92151
 High       56361
 Low        29002
 Medium     27495
 Name: count, dtype: int64,
 ConfidenceLevel
 Unknown    0.4495
 High       0.2749
 Low        0.1415
 Medium     0.1341
 Name: count, dtype: float64)

In [146]:
# --- Phase 2C: Which pathogenic-style labels are driving mismatch? ---

patho_label_mix = (
    patho.groupby(["ConfidenceLevel", "ClinicalSignificance"], dropna=False)
    .size()
    .sort_values(ascending=False)
)

patho_label_mix.head(25)


ConfidenceLevel  ClinicalSignificance                                     
Unknown          Conflicting classifications of pathogenicity                 92059
High             Pathogenic                                                   33620
Low              Pathogenic                                                   25826
Medium           Pathogenic                                                   19963
High             Pathogenic/Likely pathogenic                                 18107
Medium           Likely pathogenic                                             7510
High             Likely pathogenic                                             4458
Low              Likely pathogenic                                             2201
                 Conflicting classifications of pathogenicity                   386
                 Pathogenic/Likely pathogenic                                   359
                 Pathogenic; other                                              146
H

In [147]:
# --- Phase 2D: Mismatch rate vs NumberSubmitters ---

# Make numeric safely
df["NumberSubmitters_num"] = pd.to_numeric(df["NumberSubmitters"], errors="coerce")

# Only pathogenic rows
patho2 = df[df["is_pathogenic"] & df["NumberSubmitters_num"].notna()].copy()

submitter_bins = pd.cut(
    patho2["NumberSubmitters_num"],
    bins=[0,1,2,3,5,10,1000],
    labels=["1","2","3","4-5","6-10","11+"],
    right=True,
    include_lowest=True
)

submitter_mismatch = (
    patho2.assign(submitter_bin=submitter_bins)
    .groupby("submitter_bin", dropna=False)
    .agg(
        pathogenic_rows=("is_pathogenic", "size"),
        mismatch_rows=("is_mismatch", "sum"),
        mismatch_rate=("is_mismatch", "mean"),
    )
)

submitter_mismatch


  .groupby("submitter_bin", dropna=False)


Unnamed: 0_level_0,pathogenic_rows,mismatch_rows,mismatch_rate
submitter_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,43709,42507,0.9725
2,37061,29054,0.783951
3,26425,17209,0.651239
4-5,36674,23517,0.641244
6-10,41886,26389,0.63002
11+,19254,9972,0.517918


In [148]:
# SAFE SUMMARY (works even if df got overwritten)
print("Using df id:", id(df))
print("Has is_pathogenic before rebuild?", "is_pathogenic" in df.columns)

if "ConfidenceLevel" not in df.columns:
    df["ConfidenceLevel"] = df["ReviewStatus"].map(confidence_from_reviewstatus)

if "is_pathogenic" not in df.columns:
    df["is_pathogenic"] = df["ClinicalSignificance"].astype(str).str.contains(
        "Pathogenic", case=False, na=False
    )

if "is_mismatch" not in df.columns:
    df["is_mismatch"] = df["is_pathogenic"] & (df["ConfidenceLevel"] != "High")


Using df id: 5355500944
Has is_pathogenic before rebuild? True
