# Analysis Contract

- One frozen dataset: `analysis_df`
- One outcome definition (`y`)
- No feature engineering after freezing `analysis_df`
- All analyses are associative / descriptive (no causal claims)

This notebook builds and freezes the single dataset used by Options 1, 2, and 4.


In [1]:
import pandas as pd

scored_path = "outputs/phase4/tables/phase4_scored.csv.gz"
df = pd.read_csv(scored_path)

df.shape


FileNotFoundError: [Errno 2] No such file or directory: 'outputs/phase4/tables/phase4_scored.csv.gz'

In [2]:
import pandas as pd

scored_path = "../outputs/phase4/tables/phase4_scored.csv.gz"
df = pd.read_csv(scored_path)

df.shape


(300000, 52)

In [3]:
df.columns.tolist()


['#AlleleID',
 'Type',
 'Name',
 'GeneID',
 'GeneSymbol',
 'HGNC_ID',
 'ClinicalSignificance',
 'ClinSigSimple',
 'LastEvaluated',
 'RS# (dbSNP)',
 'nsv/esv (dbVar)',
 'RCVaccession',
 'PhenotypeIDS',
 'PhenotypeList',
 'Origin',
 'OriginSimple',
 'Assembly',
 'ChromosomeAccession',
 'Chromosome',
 'Start',
 'Stop',
 'ReferenceAllele',
 'AlternateAllele',
 'Cytogenetic',
 'ReviewStatus',
 'NumberSubmitters',
 'Guidelines',
 'TestedInGTR',
 'OtherIDs',
 'SubmitterCategories',
 'VariationID',
 'PositionVCF',
 'ReferenceAlleleVCF',
 'AlternateAlleleVCF',
 'SomaticClinicalImpact',
 'SomaticClinicalImpactLastEvaluated',
 'ReviewStatusClinicalImpact',
 'Oncogenicity',
 'OncogenicityLastEvaluated',
 'ReviewStatusOncogenicity',
 'SCVsForAggregateGermlineClassification',
 'SCVsForAggregateSomaticClinicalImpact',
 'SCVsForAggregateOncogenicityClassification',
 'ConfidenceLevel',
 'LastEvaluated_dt',
 'years_since_review',
 'NumberSubmitters_num',
 'conflicting',
 'ReclassificationRiskScore',
 'R

In [4]:
import numpy as np

# --- Minimal frozen columns used across Options 1, 2, 4 ---
keep_cols = [
    "VariationID", "#AlleleID", "GeneSymbol",
    "ClinicalSignificance", "ClinSigSimple", "ReviewStatus",
    "ConfidenceLevel", "years_since_review", "NumberSubmitters_num", "conflicting",
    "model_risk_proba", "model_risk_tier",
    "ReclassificationRiskScore", "ReclassificationRiskTier",
]

analysis_df = df.loc[:, keep_cols].copy()

# --- Outcome y (PROXY): 1 if High/Critical tier, else 0 ---
analysis_df["y"] = analysis_df["model_risk_tier"].isin(["High", "Critical"]).astype(int)

# Basic cleaning: ensure numeric types are numeric
analysis_df["years_since_review"] = pd.to_numeric(analysis_df["years_since_review"], errors="coerce")
analysis_df["NumberSubmitters_num"] = pd.to_numeric(analysis_df["NumberSubmitters_num"], errors="coerce")
analysis_df["model_risk_proba"] = pd.to_numeric(analysis_df["model_risk_proba"], errors="coerce")

analysis_df.shape, analysis_df["y"].value_counts(dropna=False).to_dict()



((300000, 15), {0: 216832, 1: 83168})

In [5]:
out_path = "../outputs/tables/analysis_df.csv.gz"
analysis_df.to_csv(out_path, index=False, compression="gzip")
out_path


'../outputs/tables/analysis_df.csv.gz'