In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the parent directory importable so the `src` package resolves.
# '..' is resolved against the kernel's current working directory, so this
# assumes the notebook is launched from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# These imports must come after the sys.path tweak above.
from src.utils.seed import set_seed
from src.utils.config import Config
from src.aggregation import GeneAggregator

# Fix the random seed before any random draws below.
# NOTE(review): presumably this seeds NumPy's global RNG (used by np.random.*
# in later cells) — confirm in src.utils.seed.
set_seed(42)
print("Gene-level ranking and aggregation notebook")

In [None]:
# Load the project configuration. It is not referenced again in the cells
# visible here.
config = Config('../configs/config.yaml')

# Synthetic stand-in data: 1000 uniform-random variant scores, each assigned
# to one of 100 gene labels (GENE_0000 .. GENE_0099) drawn with replacement.
# The draw order (scores first, then gene labels) is kept fixed so that a
# seeded run reproduces the same values.
variant_scores = np.random.rand(1000)
gene_pool = [f"GENE_{i:04d}" for i in range(100)]
gene_ids = np.random.choice(gene_pool, 1000)

# One row per variant: the gene it belongs to and its score.
df = pd.DataFrame(dict(gene_id=gene_ids, variant_score=variant_scores))

print(f"Loaded {len(df)} variant scores across {df['gene_id'].nunique()} genes")
df.head()

In [None]:
# Collapse variant-level scores into a single score per gene using the
# aggregator's 'max' method.
aggregator = GeneAggregator(method='max')

per_variant_scores = df['variant_score'].values
per_variant_genes = df['gene_id'].values
gene_scores = aggregator.aggregate(scores=per_variant_scores, gene_ids=per_variant_genes)

print(f"Aggregated to {len(gene_scores)} gene scores")
print("\nTop 10 genes by score:")

# gene_scores is used as a mapping (it supports .items()); rank its
# (gene, score) pairs from highest to lowest score and show the first ten.
ranked_pairs = sorted(gene_scores.items(), key=lambda pair: pair[1], reverse=True)
for gene, score in ranked_pairs[:10]:
    print(f"  {gene}: {score:.4f}")

In [None]:
# Turn the gene -> score mapping into a table ranked from highest to lowest
# score, with a fresh 0..n-1 index so row position equals rank.
gene_score_df = pd.DataFrame(list(gene_scores.items()), columns=['gene_id', 'score'])
gene_score_df = gene_score_df.sort_values('score', ascending=False).reset_index(drop=True)

# Plot at most 20 genes. The count is clamped to the table length because
# .head(20) returns fewer rows when fewer than 20 genes exist, and a fixed
# range(20) would then mismatch the bar/tick arrays and raise.
top_n = min(20, len(gene_score_df))
top_genes = gene_score_df.head(top_n)
positions = list(range(top_n))

fig, ax = plt.subplots(figsize=(12, 5))
ax.barh(positions, top_genes['score'].values)
ax.set_yticks(positions)
ax.set_yticklabels(top_genes['gene_id'].values)
ax.set_xlabel('Gene Score')
ax.set_title(f'Top {top_n} Genes by Aggregated Score')
ax.invert_yaxis()  # put the highest-scoring gene at the top of the chart
fig.tight_layout()
plt.show()

In [None]:
# Persist the ranked gene table as CSV. The output directory is named once
# so the makedirs call and the write cannot drift apart.
results_dir = '../reports/results'
os.makedirs(results_dir, exist_ok=True)  # no error if it already exists
gene_score_df.to_csv(f'{results_dir}/ranked_genes.csv', index=False)

# The check mark was previously stored as mojibake (UTF-8 bytes decoded as
# cp1252); restored to the intended character.
print("✓ Gene-level ranking complete!")
print("Results saved to: reports/results/ranked_genes.csv")