Can one use regression to associate CREs with gene expression?

we will try to answer this question by using linear (lasso) and regularized (ridge) regression to predict gene expression based on accessibility of linked promoter and enhancers (CREs)

In [4]:
import importlib
import functions
importlib.reload(functions)
functions.set_user('Helen')
data = functions.call_data_clean(p_threshold=1.3)

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

ATAC = data['ATAC_seq']
ATAC_scores = data['norm_scores']
ATAC_scores_T = ATAC_scores.T
RNA = data['RNA_seq']
RNA_T = data['RNA_seq_T']
Exons = data['exons']

def get_tss(row):
    # Use the correct column names from Exons
    if row["Strand"] == "+":
        return row["Transcription Start"]
    else:
        return row["Transcription Start"]  # or another column if appropriate

Exons["TSS"] = Exons.apply(get_tss, axis=1)

linked_data = []
for _, gene_row in Exons.iterrows():
    gene_name = gene_row["Genname"]
    chrom = gene_row["Chromosom"]
    tss = gene_row["TSS"]

    
    nearby_peaks = ATAC[
        (ATAC["Chromosom"] == chrom) &
        (np.abs(ATAC["Summit"] - tss) <= 100_000)
    ]

    for _, peak_row in nearby_peaks.iterrows():
        linked_data.append({
            "GeneName": gene_name,
            "PeakID": peak_row["ImmGenATAC1219.peakID"],
            "DistanceToTSS": np.abs(peak_row["Summit"] - tss)
        })

link_df = pd.DataFrame(linked_data)

#lasso regression
results = []
coefs = {}

for gene in RNA.index:
    gene_links = link_df[link_df["GeneName"] == gene]
    if gene_links.empty:
        continue
    
    # Peaks for this gene
    peaks = gene_links["PeakID"].unique()
    atac_subset = ATAC[ATAC["ImmGenATAC1219.peakID"].isin(peaks)]
    atac_matrix = atac_subset[RNA.columns].transpose().fillna(0).values
    y = RNA.loc[gene].values

    if atac_matrix.shape[1] == 0:
        continue

    model = LassoCV(cv=3).fit(atac_matrix, y)
    y_pred = model.predict(atac_matrix)
    r2 = r2_score(y, y_pred)

    results.append({
        "Gene": gene,
        "R2": r2,
        "Num_CRs": atac_matrix.shape[1]
    })

    coefs[gene] = pd.Series(model.coef_, index=peaks)

results_df = pd.DataFrame(results).sort_values("R2", ascending=False)

# R² values (explain variance)
plt.figure(figsize=(10, 5))
sns.barplot(data=results_df, x="Gene", y="R2", palette="viridis")
plt.title("Variance in Gene Expression Explained by Linked CREs")
plt.ylabel("R² Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

top_gene = results_df.iloc[0]["Gene"]
top_coef_series = coefs[top_gene]

plt.figure(figsize=(8, 4))
top_coef_series.plot(kind='bar', color=['red' if v < 0 else 'blue' for v in top_coef_series])
plt.title(f"Lasso Coefficients for Gene {top_gene}")
plt.ylabel("Effect on Expression")
plt.xlabel("CRE (ATAC peak)")
plt.tight_layout()
plt.show()


KeyError: 'Chromosom'