In [1]:
import pandas as pd
from natsort import natsort_keygen, natsorted
from tqdm import tqdm

In [2]:
def assign_bins(chromosome, start, end, gene, binsize):
    """Map a gene interval onto every fixed-width bin it overlaps.

    The interval is treated as half-open, ``[start, end)``, matching the
    ``range(start, end, ...)`` semantics of the previous implementation.
    Bins are identified by ``position // binsize``.

    Parameters
    ----------
    chromosome : str
        Chromosome name, used verbatim as the label prefix.
    start, end : int
        Interval bounds of the gene.
    gene : hashable
        Gene identifier stored next to each bin label.
    binsize : int
        Bin width, in the same units as ``start``/``end``. Must be positive.

    Returns
    -------
    list[tuple[str, object]]
        One ``("{chromosome}:{bin_start}-{bin_end}", gene)`` pair per
        overlapped bin, in ascending order. Empty when ``end <= start``.

    Raises
    ------
    ValueError
        If ``binsize`` is not positive.
    """
    if binsize <= 0:
        raise ValueError(f"binsize must be a positive integer, got {binsize}")
    if end <= start:
        # Empty or inverted interval: nothing to assign (as before).
        return []

    # Walk over bin indices rather than stepping from `start` in `binsize`
    # strides: stepping from `start` skips the last bin whenever the gene
    # crosses a bin boundary with less than a full stride remaining
    # (e.g. 900-1100 with binsize 1000 touches two bins, not one).
    first_bin = start // binsize
    last_bin = (end - 1) // binsize

    bins = []
    for bin_index in range(first_bin, last_bin + 1):
        bin_start = bin_index * binsize + 1
        # NOTE(review): the label end equals the next bin's label start
        # (1-1001, 1001-2001, ...). Kept unchanged so labels stay compatible
        # with files produced earlier.
        bin_end = bin_start + binsize
        bins.append((f"{chromosome}:{bin_start}-{bin_end}", gene))
    return bins

In [3]:
def prepare_annotations(gene_pos, binsize):
    """Expand a gene-location table into one (bin, gene_name) row per bin.

    Parameters
    ----------
    gene_pos : pandas.DataFrame
        Table whose rows provide "gene", "chrom", "start" and "end" values.
    binsize : int
        Bin width. It is multiplied by 1000 before being handed to
        ``assign_bins`` (so presumably expressed in kilobases - confirm).

    Returns
    -------
    pandas.DataFrame
        Columns "bin" and "gene_name": the concatenated output of
        ``assign_bins`` for every row of ``gene_pos``, in row order.
    """
    width = binsize * 1000  # same value for every row
    pairs = []
    for _, record in tqdm(gene_pos.iterrows()):
        gene, chromosome = record["gene"], record["chrom"]
        start, end = record["start"], record["end"]
        pairs += assign_bins(chromosome, start, end, gene, binsize=width)

    return pd.DataFrame(pairs, columns=["bin", "gene_name"])

In [4]:
def gene2bin(scrna_mtx, gene_pos):
    """Average per-gene expression into per-bin expression.

    Parameters
    ----------
    scrna_mtx : pandas.DataFrame
        Expression table whose columns are gene names.
    gene_pos : pandas.DataFrame
        Table with "gene_name" and "bin" columns (one row per gene/bin pair).

    Returns
    -------
    pandas.DataFrame
        Same rows as ``scrna_mtx``; one column per bin holding the mean of
        the gene columns assigned to that bin, columns in natural sort order.
    """
    # Keep only annotated genes that are actually measured, ordered by bin.
    measured = gene_pos.gene_name.isin(scrna_mtx.columns)
    annot = gene_pos[measured].sort_values(
        by="bin", key=natsort_keygen(), ascending=True
    )

    # One row per (gene, bin) pair; a gene that falls in several bins is
    # repeated, so the row labels are not necessarily unique.
    per_gene = scrna_mtx[annot.gene_name].T

    # NOTE(review): both sides carry the same gene labels in the same order,
    # which is what lets this assignment work with repeated labels.
    per_gene["bin"] = annot.set_index("gene_name")["bin"]

    per_bin = per_gene.groupby("bin").mean().T
    ordered_bins = natsorted(per_bin.columns)

    return per_bin[ordered_bins]

In [5]:
# Load the tab-separated expression table, using its first column as the
# row index. Its column labels are matched against gene names in gene2bin.
scrna_original = pd.read_csv(
    "../data/numbat_smoothed_expressions.tsv.gz", sep="\t", index_col=0
)

In [6]:
# Load the gene-location table (first column becomes the index).
# prepare_annotations reads its "gene", "chrom", "start" and "end" columns.
gene_pos = pd.read_csv("../data/hg38_gene_locations.csv.gz", index_col=0)

In [7]:
# Build one binned expression table per bin size and write each to its own
# gzip-compressed CSV. `bins` is the bin size passed to prepare_annotations,
# which multiplies it by 1000 before binning.
for bins in [1000, 500, 100, 50, 10]:
    print(f"Working on bin {bins}")
    # Gene -> bin assignments for this bin size.
    gene_pos_bins = prepare_annotations(gene_pos, binsize=bins)
    # Mean expression of the genes falling in each bin.
    g2b = gene2bin(scrna_original, gene_pos_bins)
    g2b.to_csv(f"../data/numbat_rna_windows_{bins}.csv.gz")

Working on bin 1000


58521it [00:01, 55375.51it/s]


Working on bin 500


58521it [00:01, 56235.59it/s]


Working on bin 100


58521it [00:01, 56834.99it/s]


Working on bin 50


58521it [00:01, 53467.48it/s]


Working on bin 10


58521it [00:01, 52649.90it/s]
