# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr, ttest_ind
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import logging
import os

# Directory layout (relative to the current working directory): raw inputs
# live under DATA_DIR, derived tables under DATA_PROC_DIR.
DATA_DIR = "data/"
DATA_PROC_DIR = "data-processed/"

# Raw input tables: ATAC-seq signal and RNA-seq expression.
ATAC_SEQ_PATH, RNA_SEQ_PATH = (
    os.path.join(DATA_DIR, filename)
    for filename in ("ImmGenATAC18_AllOCRsInfo.csv", "mmc2.csv")
)

# Derived tables: the annotated peaks are read by this script, the per-gene
# regression table is written by it.
PROC_PEAKS_PATH, RESULTS_PATH = (
    os.path.join(DATA_PROC_DIR, filename)
    for filename in ("peaks_annotated.csv", "gene_regression.csv")
)

# ——————————————————————————————————————————————————————————————
# 1) Load & align ATAC & RNA
# ——————————————————————————————————————————————————————————————

# The first three CSV columns become the row MultiIndex of the ATAC table.
atac = pd.read_csv(ATAC_SEQ_PATH, index_col=[0, 1, 2])
# Keep numeric columns only; the signal columns (cell types) are numeric.
# Numeric non-sample columns, if any, are NOT removed by name here (an earlier
# version dropped "mm10.60way.phastCons_scores", "_-log10_bestPvalue" and
# "Included.in.systematic.analysis" explicitly); they only disappear through
# the column intersection below, provided the RNA table has no column with
# the same name.
atac = atac.select_dtypes(include=[np.number])

# Expression table, indexed by its first CSV column (looked up by gene name
# further down).
rna = pd.read_csv(RNA_SEQ_PATH, index_col=0)

# Restrict both tables to their shared columns, in one common order, so that
# column position i refers to the same sample in either table.
common = atac.columns.intersection(rna.columns)
atac = atac.loc[:, common]
rna = rna.loc[:, common]

# ——————————————————————————————————————————————————————————————
# 2) Load peak annotation & build gene→peak map
# ——————————————————————————————————————————————————————————————

peaks = pd.read_csv(PROC_PEAKS_PATH)
# A peak may list several genes as one comma-separated string; split it so
# that explode() yields one row per (peak, gene) pair.
peaks["gene_list"] = peaks["genes.within.100Kb"].str.split(",")
pe = peaks.explode("gene_list").rename(columns={"gene_list": "gene"})
# Normalise the names: whitespace around a gene name would make that gene
# silently miss the later membership test against the RNA table index.
pe["gene"] = pe["gene"].str.strip()
# Drop peaks without any gene (NaN) and empty names left by stray commas.
pe = pe[pe["gene"].notna() & (pe["gene"] != "")]
# The same peak listed twice for one gene would enter that gene's design
# matrix as two identical regressors and produce repeated output rows.
pe = pe.drop_duplicates(subset=["gene", "id"])
# gene name -> list of peak ids annotated to that gene
gene2peaks = pe.groupby("gene")["id"].apply(list).to_dict()
# Alternative: key the map on the TSS_GeneName field instead of the
# genes.within.100Kb list:
# gene2peaks = peaks.groupby("TSS_GeneName")["id"].apply(list).to_dict()

# ——————————————————————————————————————————————————————————————
# 3) Define our lineage
# ——————————————————————————————————————————————————————————————

# Sample (column) names that make up the lineage; the lineage-restricted
# regression selects exactly these columns from the ATAC and RNA tables.
ilc_cells = [
    # "NK.*" samples
    "NK.27+11b-.BM",
    "NK.27+11b+.BM",
    "NK.27-11b+.BM",
    "NK.27+11b-.Sp",
    "NK.27+11b+.Sp",
    "NK.27-11b+.Sp",
    # "ILC2.*" / "ILC3.*" samples
    "ILC2.SI",
    "ILC3.NKp46-CCR6-.SI",
    "ILC3.NKp46+.SI",
    "ILC3.CCR6+.SI",
]

# ——————————————————————————————————————————————————————————————
# 4) Prepare for modeling
# ——————————————————————————————————————————————————————————————

model = LinearRegression()  # one estimator, refit for every gene and sample subset
records = []      # one dict per (gene, CRE) pair: regression + correlation results

# ——————————————————————————————————————————————————————————————
# 5) Loop over genes
# ——————————————————————————————————————————————————————————————

# For every gene with expression data and at least two usable CREs, fit
#   expression ~ accessibility of all its CREs
# once over all shared samples ("global") and once over the lineage samples
# only, then record R², the per-CRE coefficients and the per-CRE Pearson
# correlation with expression.
#
# NOTE(review): ordinary least squares with an intercept fits the data exactly
# as soon as the number of CREs reaches (number of samples - 1). With the 10
# lineage samples this happens from 9 CREs on, so R2_lineage == 1 and the
# lineage betas are not interpretable for such genes — consider filtering or
# regularising.
for gene, peak_ids in gene2peaks.items():
    # require gene in rna & ≥2 CREs
    if gene not in rna.index or len(peak_ids) < 2:
        continue

    try:
        sub = atac.loc[peak_ids]  # rows=CREs × cols=cell types
    except KeyError:
        # peak ids could not be found in the ATAC index: skip the gene
        continue
    # drop CREs with any missing data
    sub = sub.dropna(axis=0, how="any")
    if sub.shape[0] < 2:
        continue

    # Look the expression profile up once and reuse it below.
    expr = rna.loc[gene]

    # global regression
    Xg = sub.values.T   # shape: samples × features
    yg = expr.values
    model.fit(Xg, yg)
    r2g = model.score(Xg, yg)
    betag = model.coef_

    # lineage regression
    # `sub` is already free of NaNs in every column, so restricting the
    # columns can neither introduce missing values nor change the set of
    # CREs: betal always has one entry per row of `sub`.
    sub_l = sub[ilc_cells]
    Xl = sub_l.values.T
    yl = expr[ilc_cells].values
    model.fit(Xl, yl)
    r2l = model.score(Xl, yl)
    betal = model.coef_

    # record per CRE
    for idx, pid in enumerate(sub.index):
        # correlation: column idx of Xg is this CRE's accessibility across
        # all samples — the same values as atac.loc[pid], without a second
        # MultiIndex lookup. A constant profile yields r = NaN.
        r, _ = pearsonr(Xg[:, idx], yg)
        # effect sign (a coefficient of exactly 0, or NaN, is labelled
        # "repressing" as well)
        eff = "activating" if betag[idx] > 0 else "repressing"
        records.append({
            "gene":          gene,
            "id":            pid[0],  # first level of the ATAC row index
            "R2_global":     r2g,
            "R2_lineage":    r2l,
            "beta_global":   betag[idx],
            "beta_lineage":  betal[idx],
            "pearson_r":     r,
            "effect":        eff
        })

# ——————————————————————————————————————————————————————————————
# 6) Compile into DataFrame
# ——————————————————————————————————————————————————————————————

# Declaring the columns up front keeps the table well-formed even when no
# gene passed the filters: pd.DataFrame([]) has no columns at all, and the
# subtraction below would then fail with KeyError: 'beta_lineage'.
# The order matches the key order of the per-CRE records.
result_columns = [
    "gene", "id", "R2_global", "R2_lineage",
    "beta_global", "beta_lineage", "pearson_r", "effect",
]
df = pd.DataFrame(records, columns=result_columns)
if df.empty:
    logging.warning(
        "No gene passed the filters; %s will contain only a header row",
        RESULTS_PATH,
    )
# positive: the CRE's coefficient is larger in the lineage fit than globally
df["beta_diff"] = df["beta_lineage"] - df["beta_global"]

# merge location annotations
# (default inner join on "id": a record is kept once per matching row of
# `peaks`, and dropped if its id does not occur there)
loc = peaks[["id", "is_promoter", "is_intragenic"]]
df = df.merge(loc, on="id")

# save
df.to_csv(RESULTS_PATH, index=False)