In [2]:
import os
import pandas as pd
import requests
import gzip

**Generate human chromosome size data**


In [None]:
def fetch_chrom_sizes(genome, output_file, timeout=30):
    """Download a UCSC ``chrom.sizes`` table and save it as a two-column TSV.

    Parameters
    ----------
    genome : str
        Assembly name inserted into the UCSC download URL (e.g. ``"hg38"``).
    output_file : str
        Destination path. It is joined onto the current working directory,
        so an absolute path is used unchanged.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Defaults to 30.

    Returns
    -------
    None
        The outcome is reported with ``print``. When the server does not
        answer with HTTP 200, no file is written.

    Raises
    ------
    ValueError
        If a non-blank line of the response does not split into exactly two
        whitespace-separated fields.

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    full_file_path = os.path.join(os.getcwd(), output_file)

    url = f"http://hgdownload.soe.ucsc.edu/goldenPath/{genome}/bigZips/{genome}.chrom.sizes"
    # Without an explicit timeout the request can block forever on a stalled server.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch chromosome sizes for {genome}")
        return

    # Parse everything before opening the output file so that a malformed
    # response cannot leave a half-written file behind.
    records = []
    for line in response.text.splitlines():
        if not line.strip():
            # Blank lines carry no data and would break the two-value unpack.
            continue
        chrom, size = line.split()
        records.append(f"{chrom}\t{size}\n")

    with open(full_file_path, "w") as f:
        f.writelines(records)
    print(f"Chromosome sizes saved to {full_file_path}")


# Fetch the hg38 chromosome sizes; the file name is resolved against the
# current working directory by fetch_chrom_sizes.
output_file = "chromosome_sizes.txt"
fetch_chrom_sizes(genome="hg38", output_file=output_file)

Chromosome sizes saved to /Users/siyuanzhao/Documents/GitHub/CS522_Project/Scripts/chromosome_sizes.txt


**Generate folding input files**


In [6]:
import os
import pandas as pd

def make_dir(d):
    """Create directory ``d``, including missing parents, if it does not exist.

    Calling this on an existing directory is a no-op.

    Raises
    ------
    FileExistsError
        If ``d`` already exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(d, exist_ok=True)

def get_spe_inter(hic_data, alpha=0.05):
    """Return the rows of ``hic_data`` whose ``fdr`` is strictly below ``alpha``."""
    is_significant = hic_data['fdr'].lt(alpha)
    return hic_data[is_significant]

def get_fold_inputs(spe_df):
    """Build the folding-input table from a frame of interactions.

    Parameters
    ----------
    spe_df : pandas.DataFrame
        Must contain the columns ``chr``, ``ibp``, ``jbp`` and ``fq``.
        Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``chr, ibp, jbp, fq, w`` in that order,
        where ``w`` is the constant 1. ``spe_df`` is left unmodified.
    """
    # .assign returns a fresh frame, so the constant column is never set on a
    # slice of the caller's data (which raised SettingWithCopyWarning).
    return spe_df[['chr', 'ibp', 'jbp', 'fq']].assign(w=1)

def process_hic_files(input_folder, seqs_folder, output_folder, alpha=0.05):
    """Write per-region folding input files from Hi-C tables and reference ranges.

    For every ``*.csv.gz`` file in ``input_folder`` the text before the first
    underscore of its name is taken as a keyword, and the reference file
    ``<keyword>_ranges.csv.gz`` is looked up in ``seqs_folder``. Hi-C rows are
    first filtered with ``get_spe_inter``; then, for each reference row, the
    rows whose ``chr`` equals ``chrID`` and whose ``ibp`` lies in
    ``[start_value, end_value]`` are converted with ``get_fold_inputs`` and
    written, tab-separated and without header or index, to
    ``<keyword>.<chrID>.<start_value>.<end_value>.txt`` in ``output_folder``.

    Parameters
    ----------
    input_folder : str
        Folder containing the Hi-C ``*.csv.gz`` files.
    seqs_folder : str
        Folder containing the ``<keyword>_ranges.csv.gz`` reference files,
        which must provide ``chrID``, ``start_value`` and ``end_value``.
    output_folder : str
        Folder receiving the output files; created if missing.
    alpha : float, optional
        Threshold forwarded to ``get_spe_inter``. Defaults to 0.05.

    Notes
    -----
    Hi-C files without a matching reference file are reported with ``print``
    and skipped. Regions with no matching rows produce no file.

    An output file left over from an earlier call is overwritten rather than
    appended to, so re-running the function does not duplicate rows. Within
    one call, later writes to the same path (for example from several Hi-C
    files sharing a keyword) are still appended.
    """
    make_dir(output_folder)

    # Output paths already written during this call: the first write to a
    # path truncates it, later writes to the same path append.
    written_paths = set()

    # Sorted so the row order of merged outputs does not depend on the
    # arbitrary order returned by os.listdir.
    for hic_file_name in sorted(os.listdir(input_folder)):
        if not hic_file_name.endswith(".csv.gz"):
            continue

        # The keyword is the part of the file name before the first underscore.
        key_word = hic_file_name.split("_")[0]

        # Find the matching reference file in the seqs folder.
        seq_file_name = f"{key_word}_ranges.csv.gz"
        seq_file_path = os.path.join(seqs_folder, seq_file_name)

        if not os.path.exists(seq_file_path):
            print(f"No matching reference file found for {hic_file_name}")
            continue

        # Load Hi-C data and the corresponding reference data.
        hic_file_path = os.path.join(input_folder, hic_file_name)
        all_hic = pd.read_csv(hic_file_path)
        spe_hic = get_spe_inter(all_hic, alpha)
        reference_df = pd.read_csv(seq_file_path, usecols=["chrID", "start_value", "end_value"])

        for _, row in reference_df.iterrows():
            chrID = row["chrID"]
            start_value = row["start_value"]
            end_value = row["end_value"]

            # Keep interactions on this chromosome whose ibp is inside the range.
            chr_hic_data = spe_hic[
                (spe_hic["chr"] == chrID) &
                (spe_hic["ibp"] >= start_value) &
                (spe_hic["ibp"] <= end_value)
            ]

            if chr_hic_data.empty:
                continue

            fold_hic = get_fold_inputs(chr_hic_data)

            output_file_name = f"{key_word}.{chrID}.{start_value}.{end_value}.txt"
            fold_hic_path = os.path.join(output_folder, output_file_name)

            write_mode = "a" if fold_hic_path in written_paths else "w"
            fold_hic.to_csv(fold_hic_path, header=None, index=None, sep="\t", mode=write_mode)
            written_paths.add(fold_hic_path)

# Relative locations of the Hi-C tables, the reference ranges, and the
# folder that receives the folding input files.
input_folder = '../Data/refined_processed_HiC'
seqs_folder = '../Data/seqs'
output_folder = '../Data/Folding_input'
process_hic_files(
    input_folder=input_folder,
    seqs_folder=seqs_folder,
    output_folder=output_folder,
)


**Epigenetic tracks**

In [4]:
# Read the gzip-compressed, tab-separated file (no header row) and label its
# ten columns.
with gzip.open("../Data/epigenetic_tracks/GM_H3K27me3.bed.gz", "rt") as f:
    df = pd.read_csv(f, sep="\t", header=None)
    df.columns = [
        "chrom", "start", "end", "name", "score",
        "strand", "signalValue", "pValue", "qValue", "peak",
    ]

# Keep chr8 rows with start >= 127150000 and end <= 128260000.
filtered_df = df.loc[
    df["chrom"].eq("chr8")
    & df["start"].ge(127150000)
    & df["end"].le(128260000)
]



Unnamed: 0,chrom,start,end,name,score,strand,signalValue,pValue,qValue,peak
