In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Draw a reproducible per-chromosome random subsample of a tab-separated
# occupancy table and write the subsample back out in the same format.

# Read data
input_file = "dMean Occupancy/InVitro_dMean.txt"
output_file = "dMean Occupancy/InVitro_dMean_sampled.txt"

# Sampling parameters, named so they can be tuned in one place.
SAMPLE_FRACTION = 0.01  # fraction of each chromosome's rows to keep (1%)
RANDOM_SEED = 42        # fixed seed so reruns draw the same rows

print(f"Reading data: {input_file}")
# The input is parsed as a headerless, tab-separated, three-column table.
df = pd.read_csv(input_file, sep='\t', header=None,
                 names=['Chromosome', 'Position', 'Occupancy'])

print(f"Original data: {len(df)} rows")

# Distinct chromosome labels in order of first appearance. Not needed by the
# loop below; kept in case later cells reference it.
chromosomes = df['Chromosome'].unique()

# Group by chromosome and perform random sampling.
# A single groupby pass replaces one full-table boolean mask per chromosome.
# sort=False yields the groups in order of first appearance, which is the same
# order unique() gives, and rows inside each group keep their original order
# and index, so the seeded draw picks the same rows as masking would.
# NOTE: groupby skips rows whose Chromosome value is missing (pandas default).
sampled_data = []
for chrom, chrom_data in df.groupby('Chromosome', sort=False):
    # Truncate towards zero, but always keep at least one row per chromosome.
    sample_size = max(1, int(len(chrom_data) * SAMPLE_FRACTION))

    # Random sampling without replacement; the same seed is used for every
    # chromosome.
    sampled_chrom = chrom_data.sample(n=sample_size, random_state=RANDOM_SEED)
    sampled_data.append(sampled_chrom)

    print(f"Chromosome {chrom}: Original {len(chrom_data)} rows -> Sampled {len(sampled_chrom)} rows")

# Combine sampling results. pd.concat raises on an empty list, so an input
# with no rows falls back to an empty frame that has the same columns.
if sampled_data:
    sampled_df = pd.concat(sampled_data, ignore_index=True)
else:
    sampled_df = df.iloc[:0].copy()

# Save results in the input's layout: tab-separated, no header, no index
# column, missing values written as the literal text 'nan'.
print(f"Saving sampled data to: {output_file}")
sampled_df.to_csv(output_file, sep='\t', na_rep='nan', header=False, index=False)

print(f"Processing complete! Total sampled data: {len(sampled_df)} rows")

Reading data: dMean Occupancy/InVitro_dMean.txt
Original data: 10774972 rows
Chromosome 1: Original 168234 rows -> Sampled 1682 rows
Chromosome 10: Original 659529 rows -> Sampled 6595 rows
Chromosome 11: Original 632626 rows -> Sampled 6326 rows
Chromosome 12: Original 931931 rows -> Sampled 9319 rows
Chromosome 13: Original 848262 rows -> Sampled 8482 rows
Chromosome 14: Original 706446 rows -> Sampled 7064 rows
Chromosome 15: Original 997401 rows -> Sampled 9974 rows
Chromosome 16: Original 856960 rows -> Sampled 8569 rows
Chromosome 2: Original 748431 rows -> Sampled 7484 rows
Chromosome 3: Original 244966 rows -> Sampled 2449 rows
Chromosome 4: Original 1378420 rows -> Sampled 13784 rows
Chromosome 5: Original 515003 rows -> Sampled 5150 rows
Chromosome 6: Original 236402 rows -> Sampled 2364 rows
Chromosome 7: Original 983305 rows -> Sampled 9833 rows
Chromosome 8: Original 484641 rows -> Sampled 4846 rows
Chromosome 9: Original 382415 rows -> Sampled 3824 rows
Saving sampled dat