`antman` kernel

## Convert coverage files to h5 files, split by chromosome 
This only needs to be done once. The h5 files allow for faster querying than the original coverage files.

In [None]:
import Bio.SeqIO as SeqIO
import numpy as np
import h5py
import os

In [None]:
# Load the genome assembly: parse the FASTA file and collect the records
# into a dict (SeqIO.to_dict keys each record by its id by default).
genome_fasta_path = '../genome/Obir.assembly.v5.4.fasta'
fasta_records = SeqIO.parse(genome_fasta_path, "fasta")
genome_seq = SeqIO.to_dict(fasta_records)

In [None]:
# Convert each per-base coverage text file in `input_directory` into one h5
# file in `output_directory`, holding one integer dataset per chromosome.
#
# Each non-blank input line must hold three tab-separated fields:
#   chromosome, base-pair index, coverage
# Coverage values are stored in the order they appear in the file; the
# base-pair index column is not used to place them, so the check below only
# verifies that each chromosome has exactly one value per base.
# NOTE(review): this presumably relies on the input being sorted by position
# within each chromosome -- confirm against whatever produced the files.
#
# Raises ValueError on a malformed line, a non-integer coverage value, or a
# chromosome whose number of values differs from its sequence length.

# Get chromsizes for genome (only sequences whose name contains "Chr")
chromsizes = {
    chrom: len(record.seq)
    for chrom, record in genome_seq.items()
    if "Chr" in chrom
}

input_directory = "../first_of_pair_strand_coverage"  # Directory containing ~20 input files
output_directory = "../first_of_pair_strand_coverage_chrom_h5"  # Directory to store split files

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Remove any existing files in the output directory. Subdirectories are left
# alone: os.remove() cannot delete them and would abort the whole cell.
for file_name in os.listdir(output_directory):
    stale_path = os.path.join(output_directory, file_name)
    if os.path.isfile(stale_path) or os.path.islink(stale_path):
        os.remove(stale_path)

# Iterate over files in the input directory
for file_name in os.listdir(input_directory):
    input_path = os.path.join(input_directory, file_name)

    # Skip hidden entries and anything that is not a regular file (for
    # example .DS_Store or .ipynb_checkpoints); they are not coverage files.
    if file_name.startswith('.') or not os.path.isfile(input_path):
        print(f"Skipping {input_path}")
        continue

    # Initialize data storage for each chromosome
    chromosome_data = {chrom: [] for chrom in chromsizes}

    # Process the file
    print(f"Processing {input_path}")
    with open(input_path, 'r') as infile:
        for line_number, line in enumerate(infile, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (commonly a trailing newline at EOF)
                continue

            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{input_path}: line {line_number} has {len(columns)} "
                    f"tab-separated fields, expected 3"
                )

            chromosome, _bp_index, coverage = columns
            # Lines for sequences outside chromsizes are ignored
            if chromosome in chromosome_data:
                chromosome_data[chromosome].append(int(coverage))

    # Validate size: every chromosome needs exactly one value per base
    for chrom, expected_size in chromsizes.items():
        coverage_list = chromosome_data[chrom]

        if len(coverage_list) != expected_size:
            raise ValueError(f"Chromosome {chrom} has incorrect size: "
                                f"expected {expected_size}, got {len(coverage_list)}")

    # Write to output file, named after the input file with an .h5 extension
    base_name = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_directory, f"{base_name}.h5")
    with h5py.File(output_file, 'w') as f:
        for chrom, coverage in chromosome_data.items():
            f.create_dataset(chrom, data=np.array(coverage, dtype=int))
    print(f"Wrote {output_file}")

In [None]:
# Convert each per-base coverage text file in `input_directory` into one h5
# file in `output_directory`, holding one integer dataset per chromosome.
#
# Each non-blank input line must hold three tab-separated fields:
#   chromosome, base-pair index, coverage
# Coverage values are stored in the order they appear in the file; the
# base-pair index column is not used to place them, so the check below only
# verifies that each chromosome has exactly one value per base.
# NOTE(review): this presumably relies on the input being sorted by position
# within each chromosome -- confirm against whatever produced the files.
#
# Raises ValueError on a malformed line, a non-integer coverage value, or a
# chromosome whose number of values differs from its sequence length.

# Get chromsizes for genome (only sequences whose name contains "Chr")
chromsizes = {
    chrom: len(record.seq)
    for chrom, record in genome_seq.items()
    if "Chr" in chrom
}

input_directory = "../JL20240124_FemaleRNAseq_first_of_pair_strand_coverage"  # Directory containing ~20 input files
output_directory = "../JL20240124_FemaleRNAseq_first_of_pair_strand_coverage_chrom_h5"  # Directory to store split files

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Remove any existing files in the output directory. Subdirectories are left
# alone: os.remove() cannot delete them and would abort the whole cell.
for file_name in os.listdir(output_directory):
    stale_path = os.path.join(output_directory, file_name)
    if os.path.isfile(stale_path) or os.path.islink(stale_path):
        os.remove(stale_path)

# Iterate over files in the input directory
for file_name in os.listdir(input_directory):
    input_path = os.path.join(input_directory, file_name)

    # Skip hidden entries and anything that is not a regular file (for
    # example .DS_Store or .ipynb_checkpoints); they are not coverage files.
    if file_name.startswith('.') or not os.path.isfile(input_path):
        print(f"Skipping {input_path}")
        continue

    # Initialize data storage for each chromosome
    chromosome_data = {chrom: [] for chrom in chromsizes}

    # Process the file
    print(f"Processing {input_path}")
    with open(input_path, 'r') as infile:
        for line_number, line in enumerate(infile, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (commonly a trailing newline at EOF)
                continue

            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{input_path}: line {line_number} has {len(columns)} "
                    f"tab-separated fields, expected 3"
                )

            chromosome, _bp_index, coverage = columns
            # Lines for sequences outside chromsizes are ignored
            if chromosome in chromosome_data:
                chromosome_data[chromosome].append(int(coverage))

    # Validate size: every chromosome needs exactly one value per base
    for chrom, expected_size in chromsizes.items():
        coverage_list = chromosome_data[chrom]

        if len(coverage_list) != expected_size:
            raise ValueError(f"Chromosome {chrom} has incorrect size: "
                                f"expected {expected_size}, got {len(coverage_list)}")

    # Write to output file, named after the input file with an .h5 extension
    base_name = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_directory, f"{base_name}.h5")
    with h5py.File(output_file, 'w') as f:
        for chrom, coverage in chromosome_data.items():
            f.create_dataset(chrom, data=np.array(coverage, dtype=int))
    print(f"Wrote {output_file}")