# 🧬 Calmodulin Variant Effect Atlas — Exploratory Analysis

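This notebook loads the **CaM‑VEA** dataset and performs exploratory summaries of variant distributions
by EF‑hand motifs, regions, and mechanistic priors. It also reports ClinVar annotation coverage and writes
the summary statistics to `../outputs/summary_stats.json`.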

Data file: `../outputs/cam_saturation_variants_with_clinvar.csv`


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Locate the variant table; the path is resolved relative to the current working directory.
root = Path('..')
data_path = root.joinpath('outputs', 'cam_saturation_variants_with_clinvar.csv')

# Read the CSV into `df`, the frame the later cells in this notebook work from,
# and end the cell with a preview of its first rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()

## 📊 Variant Counts by EF‑hand Region

In [None]:
# Tally rows per `ef_loop` value; dropna=False keeps missing values as their own category.
ef_counts = df['ef_loop'].value_counts(dropna=False)

# Draw the tallies as a bar chart and label the count axis on the returned Axes.
ax = ef_counts.plot.bar(figsize=(8, 4), title='Variants per EF‑hand')
ax.set_ylabel('Variant count')
plt.show()

## 🧩 Mechanism Prior Distribution

In [None]:
# Count rows per `mechanism_prior` value (value_counts leaves out missing values by default).
mech_counts = df['mechanism_prior'].value_counts()

# Horizontal bar chart: one bar per mechanism class, bar length equal to its count.
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(mech_counts.index, mech_counts.values)
ax.set_xlabel('Count')
ax.set_ylabel('Mechanism class')
ax.set_title('Distribution of Mechanism Priors')
plt.show()

## 🧠 ClinVar Coverage

In [None]:
# Boolean mask marking the rows that carry a ClinVar variation ID.
has_clinvar = df['ClinVar_Variation_ID'].notna()
annotated = has_clinvar.sum()
total = len(df)

# Report coverage as a raw count and as a percentage of all rows.
print(f'ClinVar‑linked variants: {annotated} / {total} ({annotated/total:.2%})')

# Preview the first ClinVar-linked rows, restricted to the identifying columns.
preview_cols = ['position', 'wt_aa', 'alt_aa', 'Gene', 'Condition', 'ClinVar_Variation_ID']
df.loc[has_clinvar, preview_cols].head()

## 🧾 Save summary statistics

In [None]:
import json

# Gather the headline numbers computed in this notebook into one JSON-serializable dict.
summary = {
    'total_variants': len(df),
    # dropna=False keeps an entry for rows with no `ef_loop` value.
    'efhand_distribution': df['ef_loop'].value_counts(dropna=False).to_dict(),
    'mechanism_distribution': df['mechanism_prior'].value_counts().to_dict(),
    # `annotated` is computed in the ClinVar coverage cell; int() makes it a plain
    # Python int so the json module can serialize it.
    'clinvar_annotated': int(annotated)
}

out_path = Path('../outputs/summary_stats.json')
# Write through a context manager so the file is flushed and closed deterministically
# instead of leaving an open handle for the garbage collector to clean up.
with out_path.open('w') as fh:
    json.dump(summary, fh, indent=2)
print(f'Saved summary to {out_path}')