In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path

# Input locations, all resolved against the data directory.
DATA_DIR = "data/"
ATAC_SEQ_PATH = os.path.join(DATA_DIR, "ImmGenATAC18_AllOCRsInfo.csv")
REFFLAT_PATH = os.path.join(DATA_DIR, "refFlat.txt.gz")

# ATAC-seq peak table: the first three CSV columns become a three-level row
# index, which is then given explicit level names.
atac = pd.read_csv(ATAC_SEQ_PATH, index_col=[0, 1, 2]).rename_axis(
    index=["id", "chrom", "summit"]
)
# Same table with the index levels turned back into ordinary columns.
peaks = atac.reset_index()

# Gene annotation: tab-separated, gzip-compressed and without a header row,
# so the column names are attached after loading.
refFlat = pd.read_csv(
    REFFLAT_PATH,
    sep="\t",
    header=None,
    compression="gzip",
)
refflat_column_names = [
    "geneName", "transcriptName", "chrom", "strand",
    "tx5p", "tx3p", "cdsStart", "cdsEnd",
    "exonCount", "exonStarts", "exonEnds",
]
refFlat.columns = refflat_column_names

# Show only the peaks that carry a TSS annotation.
atac.loc[atac["TSS"].notna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mm10.60way.phastCons_scores,_-log10_bestPvalue,Included.in.systematic.analysis,TSS,genes.within.100Kb,LTHSC.34-.BM,LTHSC.34+.BM,STHSC.150-.BM,MPP4.135+.BM,proB.CLP.BM,...,DC.4+.Sp,DC.8+.Sp,DC.pDC.Sp,DC.103+11b+.SI,DC.103+11b-.SI,FRC.SLN,IAP.SLN,BEC.SLN,LEC.SLN,Ep.MEChi.Th
id,chrom,summit,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ImmGenATAC1219.peak_21,chr1,3671437,0.72,2.02,1.0,Xkr4,Xkr4,0.41,3.68,3.52,4.84,3.60,...,2.53,0.10,0.11,1.87,0.92,4.97,2.62,2.64,2.83,17.14
ImmGenATAC1219.peak_140,chr1,4785675,0.41,15.87,1.0,Mrpl15,"Mrpl15,Lypla1,Tcea1",9.93,20.06,9.19,15.49,20.19,...,21.99,24.19,21.76,22.06,20.80,33.25,23.63,20.90,13.30,22.38
ImmGenATAC1219.peak_146,chr1,4807877,0.47,25.12,1.0,Lypla1,"Mrpl15,Lypla1,Tcea1",29.54,42.40,17.12,25.07,31.38,...,26.85,17.69,41.52,42.24,42.23,32.82,31.65,38.34,26.68,33.27
ImmGenATAC1219.peak_158,chr1,4857752,0.12,45.65,1.0,Tcea1,"Mrpl15,Lypla1,Tcea1",115.42,141.25,85.08,82.00,94.13,...,80.17,108.52,85.43,79.55,92.80,133.71,85.20,101.38,137.79,129.55
ImmGenATAC1219.peak_160,chr1,4858351,0.02,7.78,1.0,Tcea1,"Mrpl15,Lypla1,Tcea1",4.37,6.07,4.33,7.89,9.04,...,10.55,11.20,10.07,10.37,9.80,8.34,8.76,17.31,8.39,11.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ImmGenATAC1219.peak_512468,chrX_GL456233_random,39222,0.32,73.75,1.0,Vamp7,"Vamp7,Spry3",57.63,42.40,37.43,59.79,52.20,...,37.95,58.81,61.00,38.21,37.03,63.17,48.63,61.16,82.32,48.00
ImmGenATAC1219.peak_512478,chrX_GL456233_random,159627,0.12,16.28,1.0,Tmlhe,"Spry3,Tmlhe",22.66,21.83,13.69,13.66,18.14,...,3.12,2.21,9.63,0.47,1.75,16.68,8.32,13.71,29.25,5.34
ImmGenATAC1219.peak_512507,chrY,1010556,0.18,13.45,1.0,Eif2s3y,"Eif2s3y,Tspy-ps",12.99,11.79,9.19,10.15,14.38,...,21.53,20.30,17.67,20.86,16.23,9.28,24.00,12.94,11.58,18.65
ImmGenATAC1219.peak_512523,chrY,1245808,0.18,18.59,1.0,Uty,"Uty,Ddx3y",19.40,25.33,17.12,20.43,23.24,...,34.26,34.61,21.57,23.95,24.12,30.97,36.01,30.23,38.30,35.58


In [20]:
# TSS of every transcript: the tx5p coordinate for "+" strand transcripts,
# the tx3p coordinate for all others.
on_plus_strand = refFlat["strand"] == "+"
refFlat["tss"] = np.where(on_plus_strand, refFlat["tx5p"], refFlat["tx3p"])

# Map each chromosome to a sorted array of its distinct TSS positions, so that
# nearest-TSS lookups can use binary search.
tss_by_chrom = {
    chrom: np.sort(grp["tss"].unique())
    for chrom, grp in refFlat.groupby("chrom")
}

def dist_to_nearest_tss(chrom, pos):
    """Return the signed offset from the nearest TSS on ``chrom`` to ``pos``.

    The lookup uses the module-level ``tss_by_chrom`` mapping (chromosome ->
    sorted array of TSS positions).

    The result is ``pos - nearest_tss``: negative when ``pos`` lies at a lower
    coordinate than the nearest TSS, positive when it lies at a higher one and
    0 when it coincides with a TSS. Callers that only need the magnitude
    should take ``abs()`` of the result.

    NOTE(review): the sign follows genomic coordinates only. It is not
    strand-aware, so it does not by itself mean "upstream"/"downstream" of the
    gene - confirm whether a strand-aware sign is wanted.

    Parameters
    ----------
    chrom : chromosome name, used as a key into ``tss_by_chrom``.
    pos : position on that chromosome.

    Returns
    -------
    The signed offset, or ``np.nan`` when the chromosome is missing from
    ``tss_by_chrom`` (a message is printed in that case) or has no TSS
    positions. When the left and right neighbours are equally close, the
    left neighbour wins and the result is positive.
    """
    if chrom not in tss_by_chrom:
        print(f"Chromosome {chrom} not found in TSS data.")
        return np.nan
    arr = tss_by_chrom[chrom]
    if len(arr) == 0:
        # Nothing to compare against; avoids min() failing on an empty list.
        return np.nan
    # Insertion point: arr[idx-1] < pos <= arr[idx]
    idx = np.searchsorted(arr, pos)
    # Signed offsets to the (at most two) neighbouring TSS positions.
    offsets = []
    if idx > 0:
        offsets.append(pos - arr[idx - 1])  # neighbour on the left: offset > 0
    if idx < len(arr):
        offsets.append(pos - arr[idx])      # neighbour at/right of pos: offset <= 0
    # Choose the closest neighbour by magnitude while keeping its sign.
    return min(offsets, key=abs)


In [None]:
# 3. Annotate each peak
#    Adds dist_to_TSS, is_promoter / is_enhancer and
#    is_intragenic / is_extragenic columns to `peaks`.

# Distance from every summit to the nearest TSS. Iterating over only the two
# needed columns avoids building a full row Series per peak (as iterrows does).
dists = [
    dist_to_nearest_tss(chrom, summit)
    for chrom, summit in zip(peaks["chrom"], peaks["summit"])
]
peaks["dist_to_TSS"] = dists

# promoter if within ±1kb of a TSS; every other peak is labelled enhancer.
# A NaN distance (chromosome missing from the TSS table) compares False here,
# so such peaks end up with is_promoter=False and is_enhancer=True.
peaks["is_promoter"] = peaks["dist_to_TSS"].abs() <= 1000
peaks["is_enhancer"] = ~peaks["is_promoter"]

# Gene intervals per chromosome. The transcript bounds (tx5p/tx3p) are used
# rather than cdsStart/cdsEnd: the coding region leaves out the UTRs, so using
# it would label peaks in UTRs as extragenic.
genes_by_chr = {c: g[["tx5p", "tx3p"]].values for c, g in refFlat.groupby("chrom")}

# intragenic if the summit falls within any gene interval (bounds inclusive).
# Per chromosome, a summit p is covered by at least one interval exactly when
#   (#intervals with start <= p)  >  (#intervals with end < p),
# because every interval that ended before p also started before p. Both counts
# come from a binary search, replacing a Python scan over all genes per peak.
summit_pos = peaks["summit"].to_numpy()
is_intragenic = np.zeros(len(peaks), dtype=bool)
for chrom, rows in peaks.groupby("chrom").indices.items():
    intervals = genes_by_chr.get(chrom)
    if intervals is None:
        continue  # no annotated genes on this chromosome -> stays extragenic
    # min/max per row keeps the counting argument valid even if a row were
    # stored with its two bounds swapped.
    starts = np.sort(intervals.min(axis=1))
    ends = np.sort(intervals.max(axis=1))
    pos = summit_pos[rows]
    n_started = np.searchsorted(starts, pos, side="right")   # start <= p
    n_finished = np.searchsorted(ends, pos, side="left")     # end < p
    is_intragenic[rows] = n_started > n_finished
peaks["is_intragenic"] = is_intragenic
peaks["is_extragenic"] = ~peaks["is_intragenic"]

# 4. Save annotated peaks (written before dist_bin is added, so the CSV does
#    not contain the binning column)
peaks.to_csv("data/peaks_annotated.csv", index=False)

# 5. Bar plot of distance distribution
#    Bin distances into categories. The bin edges are signed, so the negative
#    bins are only populated when dist_to_TSS carries a sign. Intervals are
#    closed on the right, i.e. a distance of exactly 0 lands in "-1kb~TSS".
bins = [-np.inf, -10000, -5000, -1000, 0, 1000, 5000, 10000, np.inf]
labels = ["<-10kb","-10kb~-5kb","-5kb~-1kb","-1kb~TSS","TSS~+1kb","+1kb~+5kb","+5kb~+10kb",">+10kb"]
peaks["dist_bin"] = pd.cut(peaks["dist_to_TSS"], bins=bins, labels=labels)

# count per bin, kept in genomic order rather than sorted by frequency
counts = peaks["dist_bin"].value_counts().reindex(labels)

# plot
fig, ax = plt.subplots(figsize=(8, 4))
counts.plot(kind="bar", ax=ax)
ax.set_xlabel("Distance to TSS")
ax.set_ylabel("Number of CREs")
ax.set_title("CRE Distance to Nearest TSS")
fig.tight_layout()
#fig.savefig("reports/distance_to_tss_barplot.png")
plt.show()

# Preview only the first rows instead of rendering the whole table.
peaks.head()