In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re
import glob
import random

In [6]:
file_path = "../Data/all_processed_HiC/K_all_hic_cont.csv.gz"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,chr,ibp,jbp,fq,fdr,rawc
0,chr1,69740000,69780000,0.780465,0.0,29.353172
1,chr1,69740000,69790000,0.370194,0.0,22.741314
2,chr1,69740000,69950000,0.068101,0.0,12.943521
3,chr1,69740000,71210000,0.035838,0.0,10.222382
4,chr1,69745000,69800000,0.531329,0.0,25.584558


In [3]:
df_refined_GM = pd.read_csv("../Data/all_processed_HiC/GM_all_hic_cont.csv.gz")

# get the average value of 'fq' and 'fdr' for each group
df_refined_GM = df_refined_GM.groupby(["chr", "ibp", "jbp"], as_index=False).agg(
    {"fq": "mean", "fdr": "mean", "rawc": "first"}
)

In [5]:
df_refined_GM.head()

Unnamed: 0,chr,ibp,jbp,fq,fdr,rawc
0,chr1,3320000,3320000,1.0,1.0,156.0216
1,chr1,3320000,3325000,1.0,1.0,59.187775
2,chr1,3320000,3330000,1.0,1.0,39.21937
3,chr1,3320000,3335000,0.998299,0.98167,24.210155
4,chr1,3320000,3340000,0.809792,0.855084,20.77917


**Gene Data**


In [2]:
file_path = "../Data/ncbi_dataset.tsv"
gene_df = pd.read_csv(file_path, sep="\t")
gene_df = gene_df[["Gene ID", "Name", "Symbol", "Chromosome", "Begin", "End"]]
gene_df

Unnamed: 0,Gene ID,Name,Symbol,Chromosome,Begin,End
0,100287102,DEAD/H-box helicase 11 like 1 (pseudogene),DDX11L1,1,11874,14409
1,653635,"WASP family homolog 7, pseudogene",WASH7P,1,14362,29370
2,102466751,microRNA 6859-1,MIR6859-1,1,17369,17436
3,107985730,MIR1302-2 host gene,MIR1302-2HG,1,29774,35418
4,100302278,microRNA 1302-2,MIR1302-2,1,30366,30503
...,...,...,...,...,...,...
68320,100289087,testis specific protein Y-linked 10,TSPY10,Y,752540,755334
68321,642631,"testis specific protein Y-linked 15, pseudogene",TSPY15P,Y,772716,775527
68322,64593,RNA binding motif protein Y-linked family 3 me...,RBMY3AP,Y,835375,839807
68323,100874288,"testis specific protein Y-linked 25, pseudogene",TSPY25P,Y,848837,851006


In [2]:
data_dir = "../Data/all_processed_HiC"
out_put_dir = "../Data/refined_processed_HiC"


files = [f for f in os.listdir(data_dir) if f.endswith(".csv.gz")]

for file in files:
    # cell line name
    cell_line = file.split("_")[0]

    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # get the average value of 'fq' and 'fdr' for each group
    df = df.groupby(["chr", "ibp", "jbp"], as_index=False).agg(
        {"fq": "mean", "fdr": "mean", "rawc": "first"}
    )

    df["cell_line"] = cell_line

    # save the processed HiC file
    processed_file_path = os.path.join(out_put_dir, f"{cell_line}_processed.csv.gz")
    df.to_csv(processed_file_path, index=False, compression="gzip")
    print(f"Processed file saved to {processed_file_path}")

Processed file saved to ../Data/refined_processed_HiC/K_processed.csv.gz
Processed file saved to ../Data/refined_processed_HiC/IMR_processed.csv.gz
Processed file saved to ../Data/refined_processed_HiC/GM_processed.csv.gz


In [5]:
file_path = "../Data/refined_processed_HiC/IMR_processed.csv.gz"
IMR_df = pd.read_csv(file_path)
print(IMR_df.tail())

            chr        ibp        jbp   fq  fdr        rawc cell_line
121150427  chrX  154055000  154060000  1.0  1.0   98.492010       IMR
121150428  chrX  154055000  154065000  1.0  1.0   59.830734       IMR
121150429  chrX  154060000  154060000  1.0  1.0  187.421430       IMR
121150430  chrX  154060000  154065000  1.0  1.0  102.419624       IMR
121150431  chrX  154065000  154065000  1.0  1.0  198.546620       IMR


**Get example data from the refined csv.gz, get 3 chrs from each file, these chrs exist in each file**

In [3]:
def process_and_sample_data(input_folder, output_folder, sample_count=2):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Collect the intersection of 'chr' values across all files
    common_chromosomes = None

    # First pass: Identify common 'chr' values
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv.gz"):
            input_path = os.path.join(input_folder, filename)

            # Load the compressed CSV file
            df = pd.read_csv(input_path, compression="gzip")

            # Check if 'chr' column exists
            if "chr" not in df.columns:
                print(f"'chr' column not found in {filename}. Skipping this file.")
                continue

            # Update the set of common chromosomes
            file_chromosomes = set(df["chr"].unique())
            if common_chromosomes is None:
                common_chromosomes = file_chromosomes
            else:
                common_chromosomes &= file_chromosomes

    # Check if we have enough common chromosomes
    if common_chromosomes is None or len(common_chromosomes) < sample_count:
        print(f"Not enough common chromosomes across all files. Process aborted.")
        return

    # Randomly select the required number of common chromosomes
    sampled_chromosomes = random.sample(common_chromosomes, sample_count)
    print(f"Sampled chromosomes: {sampled_chromosomes}")

    # Second pass: Process each file with the sampled chromosomes
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv.gz"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)

            # Load the compressed CSV file
            df = pd.read_csv(input_path, compression="gzip")

            # Filter the dataframe to include only the selected chromosomes
            sampled_data = df[df["chr"].isin(sampled_chromosomes)]

            # Save the sampled data to the output file in gzip format
            sampled_data.to_csv(output_path, index=False, compression="gzip")
            print(f"Processed and saved {filename} with {sample_count} common 'chr' groups to {output_folder}")


# Define your input and output folders
input_folder = "../Data/refined_processed_HiC"
output_folder = "../Example_Data/refined_processed_HiC"

# Run the function
process_and_sample_data(input_folder, output_folder)


since Python 3.9 and will be removed in a subsequent version.
  sampled_chromosomes = random.sample(common_chromosomes, sample_count)


Sampled chromosomes: ['chr17', 'chr12']
Processed and saved IMR_processed.csv.gz with 2 common 'chr' groups to ../Example_Data/refined_processed_HiC
Processed and saved GM_processed.csv.gz with 2 common 'chr' groups to ../Example_Data/refined_processed_HiC
Processed and saved K_processed.csv.gz with 2 common 'chr' groups to ../Example_Data/refined_processed_HiC


**Find each cell line chromosome's range**


In [3]:
import os
import pandas as pd
from pathos.multiprocessing import ProcessingPool as Pool

def process_file(args):
    input_folder, filename, output_folder = args
    if not filename.endswith(".csv.gz"):
        return

    cell_line = filename.split("_")[0]
    file_path = os.path.join(input_folder, filename)
    df = pd.read_csv(file_path, compression="gzip")

    df["ibp_end"] = df["ibp"] + 5000
    df["jbp_end"] = df["jbp"] + 5000

    # batch processing
    ibp_ranges = df[["chr", "ibp", "ibp_end"]].rename(columns={"ibp": "start_value", "ibp_end": "end_value"})
    jbp_ranges = df[["chr", "jbp", "jbp_end"]].rename(columns={"jbp": "start_value", "jbp_end": "end_value"})
    ranges = pd.concat([ibp_ranges, jbp_ranges])
    ranges["cell_line"] = cell_line

    ranges = ranges.drop_duplicates().sort_values(by=["chr", "start_value", "end_value"])

    # merge adjacent ranges
    merged_results = []
    for chr_id, group in ranges.groupby("chr"):
        group = group.sort_values(by="start_value").to_dict(orient="records")
        merged = [group[0]]

        for entry in group[1:]:
            last = merged[-1]
            if last["end_value"] >= entry["start_value"]:
                last["end_value"] = max(last["end_value"], entry["end_value"])
            else:
                merged.append(entry)

        for entry in merged:
            merged_results.append({
                "cell_line": cell_line,
                "chrID": chr_id,
                "start_value": entry["start_value"],
                "end_value": entry["end_value"],
            })

    result_df = pd.DataFrame(merged_results, columns=["cell_line", "chrID", "start_value", "end_value"])

    # save
    output_filename = f"{cell_line}_ranges.csv.gz"
    output_path = os.path.join(output_folder, output_filename)
    result_df.to_csv(output_path, index=False, compression="gzip")
    print(f"Processed and saved: {output_filename}")

def main(input_folder, output_folder, num_workers=4):
    files = [(input_folder, filename, output_folder) for filename in os.listdir(input_folder) if filename.endswith(".csv.gz")]

    # process files in parallel
    with Pool(num_workers) as pool:
        pool.map(process_file, files)

input_folder = "../Data/refined_processed_HiC"
output_folder = "../Data/seqs"


main(input_folder, output_folder)

Processed and saved: K_ranges.csv.gz
Processed and saved: IMR_ranges.csv.gz
Processed and saved: GM_ranges.csv.gz


In [6]:
file_path = "../Example_Data/seqs/K_ranges.csv.gz"
IMR_df = pd.read_csv(file_path)

print(f"Total rows: {len(IMR_df)}")
print(IMR_df)

equal_values_df = IMR_df[IMR_df["start_value"] == IMR_df["end_value"]]

print(f"Rows where start_value == end_value: {len(equal_values_df)}")

if len(equal_values_df) > 0:
    print("Rows where start_value == end_value:")
    print(equal_values_df)
else:
    print("No rows where start_value == end_value found.")


Total rows: 44
   cell_line  chrID  start_value  end_value
0          K  chr12       315000    8410000
1          K  chr12      8415000    8425000
2          K  chr12      8435000    9515000
3          K  chr12      9530000    9555000
4          K  chr12      9565000   11360000
5          K  chr12     11370000   11375000
6          K  chr12     11390000   34510000
7          K  chr12     39180000  132600000
8          K  chr17       605000   12450000
9          K  chr17     12455000   18460000
10         K  chr17     18465000   19155000
11         K  chr17     19160000   19165000
12         K  chr17     19180000   19190000
13         K  chr17     19205000   19215000
14         K  chr17     19220000   21800000
15         K  chr17     21810000   21905000
16         K  chr17     21920000   21925000
17         K  chr17     21930000   21955000
18         K  chr17     21965000   21995000
19         K  chr17     22040000   22520000
20         K  chr17     27085000   36380000
21         K  chr

In [5]:
# file_path = "../Backend/K_chr1_770000_2705000.csv.gz"
# IMR_df = pd.read_csv(file_path)
# print(IMR_df.tail())
file_path = "../Data/refined_processed_HiC/K_processed.csv.gz"
IMR_df = pd.read_csv(file_path)
IMR_df.head()

Unnamed: 0,chr,ibp,jbp,fq,fdr,rawc,cell_line
0,chr1,770000,770000,1.0,1.0,274.20648,K
1,chr1,770000,775000,1.0,1.0,184.122,K
2,chr1,770000,780000,1.0,0.0,42.198,K
3,chr1,770000,785000,1.0,0.0,55.489918,K
4,chr1,770000,790000,1.0,0.0,58.17174,K
