In [None]:
import glob
import random
import re
import sys

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from muon import atac as ac
from muon._atac import tools as tools
from natsort import natsorted
from tqdm import tqdm

sys.path.insert(1, "../helper_functions")
from helper_functions import gini, lorenz

In [None]:
# Collect the per-cell read-count files. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to get the same row order on every run.
all_count_files = sorted(glob.glob("../../sci/LFS_*/readCount_filtered_bam/*.seg"))
len(all_count_files)

In [None]:
# Captures a cell barcode of the form <A/C/G/T letters>-<digits> that is immediately
# followed by a literal dot; the dot itself is not part of the captured group.
barcode_regex = r"([ACGT]+-\d+)[.]"

# Function to process a single file


def process_file(filepath):
    """Parse a ``fixedStep`` count file into a ``{bin: count}`` mapping.

    Every ``fixedStep`` header line sets the chromosome, the start of the next
    bin and the bin step. Each following non-empty line holds the integer count
    of one bin. Bins are keyed as ``"<chrom>:<start>-<end>"`` with
    ``end = start + step - 1`` (the span is assumed to equal the step).

    Parameters
    ----------
    filepath : str
        Path of the count file to read.

    Returns
    -------
    dict
        Bin coordinate string mapped to its integer count, in file order.

    Raises
    ------
    ValueError
        If a count line appears before any ``fixedStep`` header, if a header
        lacks ``chrom``, ``start`` or ``step``, or if a value is not an integer.
    """
    counts = {}
    chrom = start = step = None
    with open(filepath, "r") as file:
        for line_no, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no count
                continue
            if line.startswith("fixedStep"):
                # Read the header fields by name so their order does not matter
                fields = dict(
                    token.split("=", 1) for token in line.split()[1:] if "=" in token
                )
                try:
                    chrom = fields["chrom"]
                    start = int(fields["start"])
                    step = int(fields["step"])
                except KeyError as err:
                    raise ValueError(
                        f"{filepath}:{line_no}: fixedStep header lacks {err}"
                    ) from err
                continue
            if chrom is None:
                raise ValueError(
                    f"{filepath}:{line_no}: count line found before any fixedStep header"
                )
            end = start + step - 1  # Assuming span equals step
            counts[f"{chrom}:{start}-{end}"] = int(line)
            start += step  # Prepare start for the next segment
    return counts


# Collect one {bin: count} record and one cell ID per count file
all_counts = []
all_barcodes = []
for file_path in tqdm(all_count_files):
    match = re.search(barcode_regex, file_path)
    if match is None:
        # Name the offending path instead of failing with an opaque
        # AttributeError on `None.group`
        raise ValueError(f"No cell barcode found in file path: {file_path}")
    # Prefix: first two "-"-separated parts of the directory two levels above the file
    idx = "_".join(file_path.split("/")[-3].split("-")[:2])
    barcode_id = idx + "_" + match.group(1)
    all_counts.append(process_file(file_path))
    all_barcodes.append(barcode_id)

# One row per count file (indexed by cell ID), one column per genomic bin
count_matrix = pd.DataFrame(all_counts, index=all_barcodes)
count_matrix.to_csv("../data/sciHIPSD_raw_counts.csv.gz")

In [None]:
# Reload the saved count matrix; the first CSV column is used as the row index.
count_matrix = pd.read_csv("../data/sciHIPSD_raw_counts.csv.gz", index_col=0)

In [None]:
# Row-wise (axis=1) summaries of the count matrix: one Gini value and one
# Lorenz curve per row, both computed by the project helper functions.
gini_matac = count_matrix.apply(gini, axis=1).values
lorenz_matac = count_matrix.apply(lorenz, axis=1).values
# Turn the per-row curves into a single array
# (presumably rows x curve points - TODO confirm against `lorenz`)
lorenz_matac_2d = np.array(list(map(np.array, lorenz_matac)))

In [None]:
# Point-wise median of the per-cell Lorenz curves (NaNs ignored)
median_lorenz = np.nanmedian(lorenz_matac_2d, axis=0)
# Band containing the central 95% of cells at each point (2.5th-97.5th percentile).
# This is the spread across cells, not a confidence interval of the median,
# so the legend entry says so.
lower_bound = np.nanpercentile(lorenz_matac_2d, 2.5, axis=0)
upper_bound = np.nanpercentile(lorenz_matac_2d, 97.5, axis=0)

# X-axis values - normalized to range from 0 to 1. linspace gives the same grid as
# arange(n) / (n - 1) without dividing by zero when the curve has a single point.
x_values = np.linspace(0, 1, len(median_lorenz))

# Plotting the median Lorenz curve against the diagonal of perfectly even coverage
plt.plot(x_values, median_lorenz, label="Median Lorenz Curve", lw=2, color="blue")
plt.plot(x_values, x_values, label="uniform", ls="--", color="grey")


# Shading the area between the two percentile curves
plt.fill_between(
    x_values,
    lower_bound,
    upper_bound,
    color="lightblue",
    alpha=0.5,
    label="Central 95% of cells (2.5-97.5 percentile)",
)


# Additional plot formatting
plt.xlabel("Fraction of genome")
plt.ylabel("Cumulative Share of reads")
plt.legend()
plt.grid(True)

In [None]:
# Compare each row's Gini value with its total count
total_counts_per_cell = count_matrix.sum(axis=1)
plt.scatter(x=gini_matac, y=total_counts_per_cell)
plt.xlabel(xlabel="Gini")
plt.ylabel(ylabel="Total counts")

In [None]:
# Wrap the count matrix in an AnnData object for the QC steps below.
atac = ad.AnnData(count_matrix)

In [None]:
# Compute scanpy's QC metrics in place on `atac`
sc.pp.calculate_qc_metrics(atac, inplace=True, log1p=False, percent_top=None)
# The features are not genes here, so give scanpy's column a neutral name
renamed_columns = {"n_genes_by_counts": "n_features_per_cell"}
atac.obs.rename(columns=renamed_columns, inplace=True)
atac.obs["log_total_counts"] = np.log10(atac.obs["total_counts"])
# Second "_"-separated token of every cell ID
atac.obs["idx_file"] = [cell_id.split("_")[1] for cell_id in atac.obs.index]

In [None]:
# Feature table that is later passed as `features=` to the TSS-enrichment call.
features = pd.read_csv("../../rna_features.csv")

In [None]:
# Run the fragment-based QC (nucleosome signal, TSS enrichment) once per
# `idx_file` group, each against that group's own fragments file.
all_atac = []
idx_files = atac.obs["idx_file"].unique()
for file in idx_files:
    print(file)
    in_group = atac.obs["idx_file"] == file
    tmp = atac[in_group].copy()
    # Keep only the last "_"-separated token of each cell ID
    # (presumably to match the barcodes in the fragments file - TODO confirm)
    tmp.obs.index = [cell_id.split("_")[-1] for cell_id in tmp.obs.index]

    fragments_file = f"../../aurelie_data/revision_data/dna/sciHIPSD_merged/LFS_{file}/outs/fragments.tsv.gz"
    ac.tl.locate_fragments(tmp, fragments_file)
    ac.tl.nucleosome_signal(tmp)
    tss = ac.tl.tss_enrichment(tmp, features=features, n_tss=100000, random_state=666)

    # Put a group prefix back in front of the shortened cell IDs
    tmp.obs_names = f"LFS_{file}_" + tmp.obs_names
    all_atac.append(tmp)

In [None]:
# Stack the per-group `.obs` tables into one QC table and save it
per_group_obs = [ads.obs for ads in all_atac]
merged = pd.concat(per_group_obs)
merged.to_csv("../data/sciHIPSD_qc_parameters.csv.gz")

In [None]:
# One list of per-cell values for each entry of `all_atac`
tss_scores = [group.obs.tss_score.to_list() for group in all_atac]
nuc_signal = [group.obs.nucleosome_signal.to_list() for group in all_atac]

In [None]:
# Concatenate the per-group lists into single flat lists
tss_scores_flat = []
for per_group in tss_scores:
    tss_scores_flat.extend(per_group)
nuc_signal_flat = []
for per_group in nuc_signal:
    nuc_signal_flat.extend(per_group)

In [None]:
# Histogram of the TSS scores, with the x-axis limited to 0-5
tss_ax = sns.histplot(x=tss_scores_flat)
tss_ax.set_xlim((0, 5))

In [None]:
# Histogram of the nucleosome signal, with the x-axis limited to 0-2
nuc_ax = sns.histplot(x=nuc_signal_flat)
nuc_ax.set_title("Distribution of the nucleosome signal")
nuc_ax.set_xlim(0, 2)

In [None]:
# Estimate a per-cell coverage as total_counts * median fragment length / hg_size.
# Fragment lengths are taken only from the fragments fetched for `region`.
region = "chr1:1-248956422"
hg_size = 3099706404  # presumably the genome size in bp - TODO confirm
sd, lengths, coverages = [], [], []
for file in all_atac:
    print(file)
    # Reduce the cell IDs to their last "_"-separated token before joining on them
    file.obs.index = file.obs.index.str.split("_").str[-1]
    fragment_path = file.uns["files"]["fragments"]
    fragments = tools.fetch_regions_to_df(features=region, fragment_path=fragment_path)

    fragments["length"] = fragments["End"] - fragments["Start"]
    # Right join: keep every cell of `file.obs`, with or without fetched fragments
    fragments = fragments.set_index(keys="Cell").join(file.obs, how="right")
    # A single median over all joined fragment rows, not one per cell
    f_length = fragments["length"].median()
    file.obs["coverage"] = (file.obs["total_counts"] * f_length) / hg_size
    sd.append(file.obs)
    lengths.append(fragments["length"].to_list())
    coverages.append(file.obs["coverage"].to_list())

In [None]:
# Concatenate the per-group lists into single flat lists
lengths_flat = []
for per_group in lengths:
    lengths_flat.extend(per_group)
coverages_flat = []
for per_group in coverages:
    coverages_flat.extend(per_group)

In [None]:
# Keep fragments of at most 1000 bp (NaN lengths fail the comparison and are dropped too)
filtered_list = [x for x in lengths_flat if x <= 1000]
# Subsample for plotting. random.sample raises ValueError when asked for more items
# than are available, so cap the size; a privately seeded generator makes the
# draw reproducible without touching the global random state.
n_sampled = min(len(filtered_list), 1000000)
random_subset = random.Random(666).sample(filtered_list, n_sampled)

In [None]:
# Normalised histogram of the subsampled fragment lengths
length_ax = plt.gca()
length_ax.hist(x=random_subset, density=True, bins=1000)
length_ax.set_xlabel("Fragment length (bp)")
length_ax.set_ylabel("Probability")
length_ax.set_xlim((0, 1000))
plt.tight_layout()

In [None]:
# Coverage histogram with the NaN-ignoring median marked by a dashed line
median_coverage = np.nanmedian(coverages_flat)
sns.histplot(x=coverages_flat)
plt.axvline(
    median_coverage,
    c="grey",
    ls="--",
    label=f"Median at {median_coverage:.4f}",
)
plt.title("Coverage")
plt.legend()
plt.show()

### Prepare CNVs

In [None]:
# Per-cell CNV files. glob returns paths in arbitrary, filesystem-dependent order;
# sorting keeps the order of cells, and everything derived from it, the same on every run.
all_cnv_files = sorted(glob.glob("../../sci/LFS*/hmmcopy_cells/*.bed"))

In [None]:
# Sanity check: number of CNV files found
len(all_cnv_files)

In [None]:
# Read every per-cell BED file into a one-column frame: index = "<col0>:<col1>-<col2>",
# column name = cell ID, values = the fourth BED column.
cnas = []

for file in tqdm(all_cnv_files):
    if not file.endswith(".bed"):
        continue
    cell = file.split("/")[-1].split("_")[2]
    # Prefix: first two "-"-separated parts of the directory two levels above the file
    idx = "_".join(file.split("/")[-3].split("-")[:2])
    new_cell = idx + "_" + cell
    try:
        cell_file = pd.read_csv(file, header=None, sep="\t")

        cell_file["bin"] = (
            cell_file[0].astype(str)
            + ":"
            + cell_file[1].astype(str)
            + "-"
            + cell_file[2].astype(str)
        )
        cell_file = cell_file.set_index("bin")[[3]]
        cell_file.columns = [new_cell]
        cnas.append(cell_file)
    except Exception as err:
        # Best effort: skip unreadable or malformed files, but say why. Unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
        print(f"{file}: {err!r}")
        continue

In [None]:
# Put the per-cell columns side by side (bins x cells)
cna = pd.concat(cnas, axis=1, ignore_index=False)
# Shift all values down by one, then turn the resulting 0s into 1s
# (presumably maps HMMcopy states to copy numbers - TODO confirm),
# and transpose so that every cell is a row.
cna = (cna - 1).replace(0, 1).T
# Order the bin columns naturally
cna = cna.loc[:, natsorted(cna.columns)]
cna.to_csv("../data/CNVs_sciHIPSD_raw.csv.gz")

In [None]:
# Reload the saved per-cell QC table; the first CSV column is used as the index.
qc = pd.read_csv("../data/sciHIPSD_qc_parameters.csv.gz", index_col=0)
qc

In [None]:
# Reload the saved CNV matrix; the first CSV column is used as the index.
cna = pd.read_csv("../data/CNVs_sciHIPSD_raw.csv.gz", index_col=0)

In [None]:
# Keep cells whose n_features_per_cell exceeds 90% of 3102
# (3102 is presumably the total number of genomic bins - TODO confirm).
filtered = qc[qc["n_features_per_cell"] > (0.9 * 3102)].copy()

In [None]:
# Restrict the CNV matrix to the cells that passed the QC filter.
cna = cna.loc[cna.index.isin(filtered.index)].copy()

CNVs filtered

In [None]:
# Drop every bin column whose name starts with "chrY", then rescale as (value - 2) / 3.
# NOTE(review): this reassigns `cna`, so running the cell twice rescales twice.
filtered_columns = list(filter(lambda col: not col.startswith("chrY"), cna.columns))
cna = (cna[filtered_columns] - 2) / 3

In [None]:
# Build an AnnData object from the rescaled CNV matrix and attach the QC columns
# to its observations by joining on the index.
adata = sc.AnnData(cna)
adata.obs = adata.obs.join(qc)

In [None]:
# Split the "<chromosome>:<start>-<end>" bin names into separate columns
bin_names = adata.var.index
bin_ranges = bin_names.str.split(":").str[1].str.split("-")
adata.var["chromosome"] = bin_names.str.split(":").str[0]
adata.var["start"] = bin_ranges.str[0].astype(int)
adata.var["end"] = bin_ranges.str[1].astype(int)

In [None]:
adata.var.columns = ["chromosome", "start", "end"]
# Expose the matrix under obsm["X_cnv"] as well
adata.obsm["X_cnv"] = adata.X
# Running index of every bin in column order
adata.var["pos"] = np.arange(adata.var.shape[0])
# For each chromosome, remember the smallest `pos` among its bins
chr_pos = {}
for row in adata.var.itertuples():
    chr_pos[row.chromosome] = min(chr_pos.get(row.chromosome, row.pos), row.pos)
chrom_dict = {"chr_pos": chr_pos}
adata.uns["cnv"] = chrom_dict

In [None]:
# Embedding and clustering on the CNV matrix:
# PCA -> neighbour graph on 10 PCs -> Leiden clustering -> UMAP.
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_pcs=10)
# Cluster labels are stored in adata.obs["cnv_leiden"]
sc.tl.leiden(adata, key_added="cnv_leiden", resolution=0.15)
sc.tl.umap(adata)

In [None]:
# Swap and rename the two Leiden labels with one mapping. Chained `Series.replace`
# on a categorical column is deprecated in pandas and depends on the order of the
# calls; `rename_categories` renames both labels atomically. Categories missing from
# the mapping are left untouched, so re-running the cell is a no-op.
adata.obs["cnv_leiden"] = adata.obs["cnv_leiden"].cat.rename_categories(
    {"0": "Cluster 1", "1": "Cluster 0"}
)

In [None]:
# UMAP coloured by cluster, with a fixed colour per cluster, saved as PNG and SVG
tab10 = sns.palettes.color_palette("tab10")
cluster_palette = {"Cluster 1": tab10[6], "Cluster 0": tab10[0]}
sc.pl.umap(
    adata,
    color="cnv_leiden",
    title="Leiden clusters",
    palette=cluster_palette,
    show=False,
)
plt.tight_layout()
for figure_path in ("sci_leiden_clusters.png", "sci_leiden_clusters.svg"):
    plt.savefig(figure_path, dpi=300)

In [None]:
# 2x2 grid of UMAPs: cluster assignment plus three QC metrics.
# Colour limits are set to the 5th and 95th percentiles (vmin/vmax).
panel_titles = {
    "cnv_leiden": "Cluster",
    "n_features_per_cell": "Number of non-empty bins per cell",
    "log_total_counts": "log10(Total number of reads per cell)",
    "tss_score": "TSS score enrichment per cell",
}
tab10 = sns.palettes.color_palette("tab10")
cluster_palette = {"Cluster 1": tab10[6], "Cluster 0": tab10[0]}
sc.pl.umap(
    adata,
    color=list(panel_titles.keys()),
    title=list(panel_titles.values()),
    ncols=2,
    vmax="p95",
    vmin="p05",
    show=False,
    wspace=0.2,
    palette=cluster_palette,
)
plt.tight_layout()
for figure_path in ("sci_leiden_clusters_qc.png", "sci_leiden_clusters_qc.svg"):
    plt.savefig(figure_path, dpi=300)

In [None]:
# Save the per-cell annotations, including the QC columns and cluster labels.
adata.obs.to_csv("../data/sci_leiden_clusters.csv")