# 1. Link ATAC-seq peaks to expression 

We want to do this via correlation or distance.

Before starting, I want to make sure that ATAC-seq and the RNA-seq datasets align.

In [19]:
import pandas as pd

# Loading the data
atac_peak_matrix = pd.read_csv("ATAC-seq/filtered_ATAC_abT_Tact_Stem.csv")
atac_signal_matrix = atac_peak_matrix.iloc[:, 8:]  # dropping metadata columns
atac_transposed_df = atac_signal_matrix.T

rna_df = pd.read_csv("raw datasets/mmc2.csv", index_col=0)
rna_df.T
rna_transposed_df = rna_df.T

from sklearn.preprocessing import StandardScaler
rna_scaled_array = StandardScaler().fit_transform(rna_transposed_df)
rna_scaled_df = pd.DataFrame(rna_scaled_array, index=rna_transposed_df.index, columns=rna_transposed_df.columns)

# Ensuring sample alignment
common_samples = atac_transposed_df.index.intersection(rna_scaled_df.index)
atac = atac_transposed_df.loc[common_samples]
rna = rna_scaled_df.loc[common_samples]

In [20]:
# Making sure datasets are aligned
## Comparing shapes
print("ATAC shape:", atac.shape)
print("RNA shape:", rna.shape)

## Comparing indices
for sample_a, sample_r in zip(atac.index, rna.index):
    print(f"ATAC: {sample_a}  |  RNA: {sample_r}")

## Inspecting common samples 
print("Common samples used for correlation:")
print(list(common_samples))


ATAC shape: (28, 512595)
RNA shape: (28, 17535)
ATAC: preT.DN1.Th  |  RNA: preT.DN1.Th
ATAC: preT.DN2a.Th  |  RNA: preT.DN2a.Th
ATAC: preT.DN2b.Th  |  RNA: preT.DN2b.Th
ATAC: preT.DN3.Th  |  RNA: preT.DN3.Th
ATAC: T.DN4.Th  |  RNA: T.DN4.Th
ATAC: T.ISP.Th  |  RNA: T.ISP.Th
ATAC: T.DP.Th  |  RNA: T.DP.Th
ATAC: T.4.Th  |  RNA: T.4.Th
ATAC: T.8.Th  |  RNA: T.8.Th
ATAC: T.4.Nve.Sp  |  RNA: T.4.Nve.Sp
ATAC: T.4.Nve.Fem.Sp  |  RNA: T.4.Nve.Fem.Sp
ATAC: T.4.Sp.aCD3+CD40.18hr  |  RNA: T.4.Sp.aCD3+CD40.18hr
ATAC: T.8.Nve.Sp  |  RNA: T.8.Nve.Sp
ATAC: Treg.4.25hi.Sp  |  RNA: Treg.4.25hi.Sp
ATAC: Treg.4.FP3+.Nrplo.Co  |  RNA: Treg.4.FP3+.Nrplo.Co
ATAC: T8.TN.P14.Sp  |  RNA: T8.TN.P14.Sp
ATAC: T8.TE.LCMV.d7.Sp  |  RNA: T8.TE.LCMV.d7.Sp
ATAC: T8.MP.LCMV.d7.Sp  |  RNA: T8.MP.LCMV.d7.Sp
ATAC: T8.Tcm.LCMV.d180.Sp  |  RNA: T8.Tcm.LCMV.d180.Sp
ATAC: T8.Tem.LCMV.d180.Sp  |  RNA: T8.Tem.LCMV.d180.Sp
ATAC: NKT.Sp  |  RNA: NKT.Sp
ATAC: NKT.Sp.LPS.3hr  |  RNA: NKT.Sp.LPS.3hr
ATAC: NKT.Sp.LPS.18hr  |  RNA: NKT

Now that we have established that the datasets do align, we can start with the correlation of the ATAC peaks with nearby genes across the 28 aligned samples using Spearman correlation.

In [21]:
import pandas as pd
from scipy.stats import spearmanr
from tqdm import tqdm

# Load peak-to-gene mapping
peak_gene_map = pd.read_csv("peaks_tss_distance.csv")

# Renaming columns for consistency
peak_gene_map = peak_gene_map.rename(columns={
    "ImmGenATAC1219.peakID": "peak_id",
    "closest_tss": "gene_name"
})

# Filtering to peaks within ±100 kb of TSS (DO I LEAVE THAT???)
peak_gene_map = peak_gene_map[peak_gene_map["distance_to_tss"].abs() <= 100000]

# Keeping only peak-gene pairs present in data
valid_peaks = set(atac.columns)
valid_genes = set(rna.columns)

filtered_map = peak_gene_map[
    peak_gene_map["peak_id"].isin(valid_peaks) &
    peak_gene_map["gene_name"].isin(valid_genes)
]

# Step 5: Compute Spearman correlations
results = []
print(f"Correlating {len(filtered_map)} peak–gene pairs...")

for _, row in tqdm(filtered_map.iterrows(), total=len(filtered_map)):
    peak = row["peak_id"]
    gene = row["gene_name"]
    dist = row["distance_to_tss"]

    peak_signal = atac[peak]
    gene_expr = rna[gene]

    corr, pval = spearmanr(peak_signal, gene_expr)

    results.append({
        "peak_id": peak,
        "gene_name": gene,
        "distance_to_tss": dist,
        "spearman_rho": corr,
        "pval": pval
    })

# Collecting and filtering results
cor_df = pd.DataFrame(results)
linked_peaks = cor_df[
    (cor_df["pval"] < 0.05) & (cor_df["spearman_rho"].abs() > 0.5)
]

# Saving results
linked_peaks.to_csv("peak_gene_links_significant.csv", index=False)
print(f"Done! {len(linked_peaks)} significant peak–gene links saved.")


KeyboardInterrupt: 

In [None]:
print(cor_df.columns)
print(cor_df.head())


RangeIndex(start=0, stop=0, step=1)
Empty DataFrame
Columns: []
Index: []


In [None]:
print("Filtered map shape:", filtered_map.shape)


Filtered map shape: (0, 5)


In [None]:
print("Peaks in peak_gene_map:", peak_gene_map["peak_id"].nunique())
print("Peaks in atac matrix:", len(valid_peaks))
print("Genes in peak_gene_map:", peak_gene_map["gene_name"].nunique())
print("Genes in rna matrix:", len(valid_genes))


Peaks in peak_gene_map: 437081
Peaks in atac matrix: 512595
Genes in peak_gene_map: 28521
Genes in rna matrix: 17535


In [None]:
print("Example peak IDs in peak_gene_map:")
print(peak_gene_map["peak_id"].head(5).tolist())

print("Example ATAC matrix columns:")
print(list(atac.columns[:5]))


Example peak IDs in peak_gene_map:
['ImmGenATAC1219.peak_17', 'ImmGenATAC1219.peak_18', 'ImmGenATAC1219.peak_19', 'ImmGenATAC1219.peak_20', 'ImmGenATAC1219.peak_21']
Example ATAC matrix columns:
[0, 1, 2, 3, 4]


In [None]:
import pandas as pd
from scipy.stats import spearmanr
from tqdm import tqdm
# Step 1: Load ATAC matrix with metadata
full_atac = pd.read_csv("ATAC-seq/filtered_ATAC_abT_Tact_Stem.csv")

# Step 2: Extract signal and assign peak IDs to rows
atac_signal_matrix = full_atac.iloc[:, 8:]  # columns = samples
atac_signal_matrix.index = full_atac["ImmGenATAC1219.peakID"]  # rows = peaks

# Step 3: Transpose → samples × peaks
atac_transposed_df = atac_signal_matrix.T

# 🔹 Step 3: Align with RNA data (assuming rna_scaled_df already exists)
common_samples = atac_transposed_df.index.intersection(rna_scaled_df.index)
atac = atac_transposed_df.loc[common_samples]
rna = rna_scaled_df.loc[common_samples]

print("✅ ATAC shape:", atac.shape)
print("✅ RNA shape:", rna.shape)

# 🔹 Step 4: Load and rename peak–TSS mapping
peak_gene_map = pd.read_csv("peaks_tss_distance.csv")
peak_gene_map = peak_gene_map.rename(columns={
    "ImmGenATAC1219.peakID": "peak_id",
    "closest_tss": "gene_name"
})

# Filter to peaks within ±100 kb of a TSS
peak_gene_map = peak_gene_map[peak_gene_map["distance_to_tss"].abs() <= 100000]

# 🔹 Step 5: Filter for peak–gene pairs that are present in both matrices
valid_peaks = set(atac.columns)
valid_genes = set(rna.columns)

filtered_map = peak_gene_map[
    peak_gene_map["peak_id"].isin(valid_peaks) &
    peak_gene_map["gene_name"].isin(valid_genes)
]

print(f"🔍 Correlating {len(filtered_map)} peak–gene pairs...")

# 🔹 Step 6: Correlate each peak–gene pair
results = []

for _, row in tqdm(filtered_map.iterrows(), total=len(filtered_map)):
    peak = row["peak_id"]
    gene = row["gene_name"]
    dist = row["distance_to_tss"]

    peak_signal = atac[peak]
    gene_expr = rna[gene]

    corr, pval = spearmanr(peak_signal, gene_expr)

    results.append({
        "peak_id": peak,
        "gene_name": gene,
        "distance_to_tss": dist,
        "spearman_rho": corr,
        "pval": pval
    })

# 🔹 Step 7: Save and filter results
cor_df = pd.DataFrame(results)
linked_peaks = cor_df[
    (cor_df["pval"] < 0.05) & (cor_df["spearman_rho"].abs() > 0.5)
]

linked_peaks.to_csv("peak_gene_links_significant.csv", index=False)
print(f"✅ Done! {len(linked_peaks)} significant peak–gene links saved.")


✅ ATAC shape: (28, 512595)
✅ RNA shape: (28, 17535)
🔍 Correlating 0 peak–gene pairs...


0it [00:00, ?it/s]


KeyError: 'pval'

In [None]:
print("filtered_map shape:", filtered_map.shape)


filtered_map shape: (0, 5)


In [None]:
# How many peak IDs in each set?
print("ATAC matrix peak columns:", len(atac.columns))
print("Peak IDs in peak_gene_map:", peak_gene_map["peak_id"].nunique())

# What's the overlap?
overlapping_peaks = set(peak_gene_map["peak_id"]) & set(atac.columns)
print("Overlapping peak IDs:", len(overlapping_peaks))

# Show a few missing examples:
non_matching = set(peak_gene_map["peak_id"]) - set(atac.columns)
print("Example of unmatched peak IDs:", list(non_matching)[:5])


ATAC matrix peak columns: 512595
Peak IDs in peak_gene_map: 437081
Overlapping peak IDs: 437081
Example of unmatched peak IDs: []


In [None]:
print("Genes in peak_gene_map:", peak_gene_map['gene_name'].nunique())
print("Genes in RNA matrix:", len(rna.columns))

overlapping_genes = set(peak_gene_map['gene_name']) & set(rna.columns)
print("Overlapping genes:", len(overlapping_genes))

# Show a few missing examples
non_matching_genes = set(peak_gene_map['gene_name']) - set(rna.columns)
print("Example of unmatched gene names:", list(non_matching_genes)[:5])


Genes in peak_gene_map: 28521
Genes in RNA matrix: 17535
Overlapping genes: 0
Example of unmatched gene names: [135659546.0, 59375653.0, 78512168.0, 17694765.0, 128974895.0]


In [22]:
import mygene

mg = mygene.MyGeneInfo()

# Convert Entrez IDs (remove decimal points first)
entrez_ids = peak_gene_map["gene_name"].dropna().astype(str).str.split(".").str[0].unique().tolist()

# Query MyGene for mappings
annotation = mg.querymany(entrez_ids, scopes="entrezgene", fields="symbol", species="mouse")

# Convert to DataFrame
annot_df = pd.DataFrame(annotation)
id_to_symbol = annot_df.set_index("query")["symbol"].dropna().to_dict()



Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
28516 input query terms found no hit:	['3671498', '4360303', '4360314', '4409241', '4497354', '4785726', '4807822', '4857693', '4858327', 


In [23]:
# Clean Entrez IDs in original map
peak_gene_map["entrez_id"] = peak_gene_map["gene_name"].astype(str).str.split(".").str[0]

# Map to gene symbols
peak_gene_map["gene_symbol"] = peak_gene_map["entrez_id"].map(id_to_symbol)


In [24]:
peak_gene_map = peak_gene_map.dropna(subset=["gene_symbol"])

filtered_map = peak_gene_map[
    peak_gene_map["peak_id"].isin(atac.columns) &
    peak_gene_map["gene_symbol"].isin(rna.columns)
]

print("Filtered map shape after ID conversion:", filtered_map.shape)


Filtered map shape after ID conversion: (0, 7)


In [27]:
import pandas as pd
RNA_filtered = pd.read_csv("rna_filtered_abT_Tact_Stem.csv")
RNA_filtered.head()

Unnamed: 0.1,Unnamed: 0,preT.DN1.Th,preT.DN2a.Th,preT.DN2b.Th,preT.DN3.Th,T.DN4.Th,T.ISP.Th,T.DP.Th,T.4.Th,T.8.Th,...,T8.Tcm.LCMV.d180.Sp,T8.Tem.LCMV.d180.Sp,NKT.Sp,NKT.Sp.LPS.3hr,NKT.Sp.LPS.18hr,NKT.Sp.LPS.3d,LTHSC.34-.BM,LTHSC.34+.BM,STHSC.150-.BM,MPP4.135+.BM
0,0610005C13Rik,1.022363,1.389747,1.024819,1.024482,1.02643,1.026217,3.01092,1.024462,1.024819,...,1.025833,1.024819,1.385805,1.025833,1.575395,1.024819,1.096732,1.096732,1.02175,1.021812
1,0610007P14Rik,162.641117,206.945221,209.187788,198.421365,215.056475,225.56536,73.904647,138.841383,139.863904,...,206.241084,205.309922,165.69072,133.23492,127.894194,195.147548,206.053987,246.105317,192.424636,204.298358
2,0610009B22Rik,68.070719,82.468806,89.769337,57.661619,76.399214,84.671456,32.828651,27.207241,36.169759,...,36.1057,34.348965,25.168975,33.305724,29.284365,33.322384,78.272059,78.83703,68.844751,76.418169
3,0610009L18Rik,15.450717,13.573968,14.42762,8.249482,1.683173,4.001953,5.595954,6.367369,6.505833,...,8.645607,7.268431,3.840215,1.025833,6.28354,12.791348,8.577159,16.791386,15.511549,16.947354
4,0610009O20Rik,160.246297,125.475307,155.928005,120.692893,118.433597,149.630866,92.040668,76.781112,87.529814,...,87.608325,56.128251,109.175415,91.992319,102.035627,108.414405,168.645852,157.926022,155.941641,186.261464
