In [12]:
import pandas as pd
# Load the labelled table from stringtie_tss_labeled.csv and show how many
# rows carry each label value.
data = pd.read_csv('../data_train/stringtie_tss_labeled.csv')
print(data['label'].value_counts())

# Show the single row with the largest delta_coverage and the single row with
# the smallest one (idxmax/idxmin each return one index label, not five).
preview_cols = ['chrom', 'position', 'delta_coverage']
print(data.loc[data['delta_coverage'].idxmax(), preview_cols])
print(data.loc[data['delta_coverage'].idxmin(), preview_cols])

# Print up to five randomly chosen rows with label 1. The shuffle is seeded so
# that re-running the cell on a fresh kernel shows the same rows.
SAMPLE_SEED = 42
positives = data.loc[data['label'] == 1, preview_cols]
print(positives.sample(frac=1, random_state=SAMPLE_SEED).head(5))
print(data.columns)

label
0    6851
1    5639
Name: count, dtype: int64
chrom                 chrX
position          12975107
delta_coverage      873620
Name: 12134, dtype: object
chrom                 chr7
position           5527669
delta_coverage    -2267990
Name: 10607, dtype: object
       chrom   position  delta_coverage
9921    chr5  180803884            -621
3505   chr14   50312317            1138
10712   chr7   38909191            -243
8605    chr3  120742537            -371
5231   chr17   46923124            1534
Index(['chrom', 'position', 'strand', 'total_reads', 'read_start_density',
       'read_end_density', 'soft_clip_mean', 'soft_clip_max', 'mean_mapq',
       'std_mapq', 'strand_ratio', 'coverage_before', 'coverage_after',
       'delta_coverage', 'nearest_splice_dist', 'softclip_bias',
       'start_entropy', 'end_entropy', 'label'],
      dtype='object')


In [15]:
import os
import pandas as pd
from glob import glob

# Paths
REPORTS_DIR = "../reports"
OUTPUT_MD = os.path.join(REPORTS_DIR, "metrics_summary.md")

# Model variants we expect in filenames
MODELS = ["xgboost", "randomforest"]
SITE_TYPES = ["tss", "tes"]
TOOLS = ["stringtie", "isoquant"]

# Column layout of the summary table: identifying columns first, then metrics.
ID_COLUMNS = ["Tool", "Site Type", "Model"]
METRIC_COLUMNS = ["accuracy", "precision", "recall", "f1", "aupr", "auc"]


def parse_metrics_file(path):
    """Read a ``key: value`` metrics summary file into a dict of floats.

    Blank lines, lines starting with ``confusion_matrix``, lines without a
    ``:`` separator, and lines whose value is not parseable as a float are
    ignored. Keys are stripped and lower-cased.

    Args:
        path: Path of the text file to read.

    Returns:
        dict mapping lower-cased metric name to its float value.

    Raises:
        OSError: If the file cannot be opened.
    """
    parsed = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("confusion_matrix"):
                continue
            if ":" not in line:
                continue
            # Split on the first colon only, so the value may contain colons.
            key, value = line.split(":", 1)
            try:
                parsed[key.strip().lower()] = float(value.strip())
            except ValueError:
                # Best-effort parsing: non-numeric entries are skipped.
                continue
    return parsed


def build_summary_table(records):
    """Build the summary DataFrame with a fixed, consistent column order.

    ``reindex`` is used instead of ``df[[...]]`` so that a metric missing from
    every record produces an all-NaN column rather than a ``KeyError``.
    Columns not listed in ID_COLUMNS/METRIC_COLUMNS are dropped.

    Args:
        records: List of dicts, one per metrics file.

    Returns:
        pandas.DataFrame with columns ID_COLUMNS + METRIC_COLUMNS.
    """
    return pd.DataFrame(records).reindex(columns=ID_COLUMNS + METRIC_COLUMNS)


results = []

# Look for one metrics summary file per (site type, tool, model) combination
for site_type in SITE_TYPES:
    for tool in TOOLS:
        for model in MODELS:
            path = os.path.join(REPORTS_DIR, site_type, f"{tool}_{model}_metrics_summary.txt")
            # isfile (not exists) so a directory with this name is skipped
            # instead of failing inside open().
            if not os.path.isfile(path):
                print(f"⚠️ No metrics file found for {tool} {model} on {site_type}. Skipping.")
                continue

            results.append({
                "Tool": tool,
                "Site Type": site_type,
                "Model": model,
                **parse_metrics_file(path),
            })

# Create markdown report
if results:
    df = build_summary_table(results)

    # Surface metrics that no file provided, since they are now silently NaN.
    empty_metrics = [col for col in METRIC_COLUMNS if df[col].isna().all()]
    if empty_metrics:
        print(f"⚠️ No values found for: {', '.join(empty_metrics)}. Columns left empty.")

    os.makedirs(REPORTS_DIR, exist_ok=True)
    with open(OUTPUT_MD, "w") as f:
        f.write("# Combined Model Evaluation Summary\n\n")
        f.write(df.to_markdown(index=False))
        f.write("\n")  # end the file with a newline
    print(f"✅ Markdown summary written to {OUTPUT_MD}")
else:
    print("⚠️ No metrics files found. Nothing to summarize.")


✅ Markdown summary written to ../reports/metrics_summary.md
