In [None]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import zscore
from scipy.stats import norm
from scipy import stats
import scanpy as sc
import rapids_singlecell as rsc
import anndata
import os
from scipy.stats import gaussian_kde

In [None]:
def z_normalize(data, list_out, list_keep):
    """
    Z-score the marker columns of a table while carrying label columns through untouched.

    Parameters:
        data (DataFrame): Table holding marker columns plus any label columns.
        list_out (list): Column names to discard entirely. Every column whose
            name contains the substring 'blank' is discarded as well.
        list_keep (list): Column names that are retained as-is (not z-scored).

    Returns:
        DataFrame: The z-scored columns (computed per column, i.e. along axis 0)
        followed by the `list_keep` columns. `data` itself is not modified.
    """
    # Columns to discard: anything named like a blank channel, plus the caller's list
    blank_cols = [name for name in data.columns if 'blank' in name]
    trimmed = data.drop(columns=blank_cols + list_out)

    # Split into pass-through labels and the columns that get normalized
    passthrough = trimmed.loc[:, list_keep]
    to_scale = trimmed.drop(columns=list_keep)

    # zscore(..., 0) standardizes each column independently; rebuild a labelled frame around it
    scaled = pd.DataFrame(
        zscore(to_scale, 0),
        index=to_scale.index,
        columns=list(to_scale.columns),
    )

    # Re-attach the pass-through columns on the shared row index
    return pd.concat([scaled, passthrough], axis=1, join='inner')

In [None]:
def remove_noise(df, marker_cols, z_sum_thres, z_count_thres):
    """
    Split a z-scored table into retained cells and cells flagged as noise.

    A row is flagged as noise when BOTH conditions hold:
      * the sum of its `marker_cols` values is greater than `z_sum_thres`, and
      * the number of `marker_cols` values that are >= 1 is greater than
        `z_count_thres`.

    The fraction of rows that are KEPT is printed, and a histogram of the
    per-row count of values >= 1 is drawn (skipped for an empty input).

    Parameters:
        df (DataFrame): Input table. It is not modified by this function.
        marker_cols (list): Columns used to compute the per-row count and sum.
        z_sum_thres (int): Threshold on the per-row sum of `marker_cols`.
        z_count_thres (int): Threshold on the per-row count of values >= 1.

    Returns:
        DataFrame: Rows not flagged as noise, without 'Count'/'z_sum' columns
            and with a fresh 0..n-1 index.
        DataFrame: Rows flagged as noise, keeping their original index and
            carrying the 'Count' and 'z_sum' columns used for the decision.
    """
    markers = df[marker_cols]

    # Per-row statistics are kept as standalone Series so the caller's frame is never written to
    count = markers.ge(1).sum(axis=1).rename('Count')
    z_sum = markers.sum(axis=1).rename('z_sum')

    is_noise = (z_sum > z_sum_thres) & (count > z_count_thres)

    # Flagged rows keep the diagnostic columns; positional arrays avoid any index re-alignment
    cc = df[is_noise].assign(
        Count=count[is_noise].to_numpy(),
        z_sum=z_sum[is_noise].to_numpy(),
    )

    # Retained rows: non-inplace drop/reset, so no write happens on a slice of `df`
    df_want = (
        df[~is_noise]
        .drop(columns=['Count', 'z_sum'], errors='ignore')
        .reset_index(drop=True)
    )

    # This is the fraction of rows kept (not removed); NaN when there are no rows at all
    fraction_kept = len(df_want) / len(df) if len(df) else float('nan')
    print(str(fraction_kept))

    # Histogram of how many markers are >= 1 per cell
    if len(df):
        count.plot.hist(bins=50, alpha=0.8, logy=False)

    return df_want, cc

In [None]:
def nuclear_size(data_in, size_thres, nuc_thres):
    """
    Show area vs. DAPI on log-log axes with both cut-offs drawn, then apply the cut-offs.

    A fixed 5% random sample of the rows (random_state=42) is plotted; the
    filter itself is applied to the full table. The fraction of rows kept is
    printed.

    Parameters:
        data_in (DataFrame): Table with an 'area' column and a 'DAPI' column.
        size_thres (float): Rows must have 'area' strictly greater than this.
        nuc_thres (float): Rows must have 'DAPI' strictly greater than this.

    Returns:
        DataFrame: Rows of `data_in` with area > size_thres and DAPI > nuc_thres.
    """
    plt.figure(figsize=(7, 7))

    # Only a reproducible 5% subsample is drawn
    sampled = data_in.sample(frac=0.05, random_state=42)
    # NOTE(review): size=1 is passed as seaborn's `size` semantic, not as a marker size — confirm intent
    ax = sns.scatterplot(data=sampled, x='area', y='DAPI', size=1)

    # Log-log axes with the same fixed tick positions, labelled with their plain values
    tick_values = [0.1, 1, 10, 100, 1000, 10000]
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_yticks(tick_values)
    ax.set_yticklabels(tick_values)
    ax.set_xticks(tick_values)
    ax.set_xticklabels(tick_values)
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

    # Red guide lines mark the two thresholds
    ax.axvline(x=size_thres, color='red')
    ax.axhline(y=nuc_thres, color='red')

    # Keep rows that exceed both thresholds and report the retained fraction
    big_enough = data_in['area'] > size_thres
    bright_enough = data_in['DAPI'] > nuc_thres
    df_nuc = data_in[big_enough & bright_enough]
    print(len(df_nuc) / len(data_in))

    return df_nuc

In [None]:
def get_adata(data, list_keep_obs):
    """
    Build an AnnData object from a table of measurements plus annotation columns.

    Rows are renumbered 0..n-1 before conversion. The renumbering is applied to
    frames derived from `data`, so the caller's DataFrame (including its index)
    is left unchanged.

    Parameters:
        data (DataFrame): Table holding the measurement columns and the
            annotation columns.
        list_keep_obs (list): Columns to store as observation attributes
            (`adata.obs`); every other column becomes part of the data matrix.

    Returns:
        AnnData: Object built from the non-`list_keep_obs` columns, with the
        `list_keep_obs` columns assigned as `.obs`.
    """
    # Measurement columns only; reset_index returns a new frame instead of editing `data` in place
    dfsc = data.drop(columns=list_keep_obs).reset_index(drop=True)

    # Annotation columns, renumbered the same way so both frames share one row index
    obs = data[list_keep_obs].reset_index(drop=True)

    # Convert to AnnData format and attach the observation attributes
    adata = sc.AnnData(dfsc)
    adata.obs = obs

    return adata

In [None]:
def adata_to_dataframe(adata):
    """
    Flatten an AnnData object into a single DataFrame.

    Parameters:
        adata (AnnData): Object providing `to_df()` and an `obs` DataFrame.

    Returns:
        DataFrame: The columns of `adata.to_df()` followed by the columns of
        `adata.obs`, joined side by side on the row index.
    """
    # Column-wise concatenation: data matrix first, observation attributes after
    frames = [adata.to_df(), adata.obs]
    return pd.concat(frames, axis=1)

In [None]:
import os
import pandas as pd

# === USER-DEFINED PATHS ===
# Root directory holding one subfolder per slide (e.g., B004-A-004, B004-A-008, ...)
input_root = "/mnt/jwh83-data/Orthopaedic_QuPath/QuPath_Project/REDSEA_CSVS"

# Directory that receives the combined tables; created here if it does not exist yet
output_dir = "/mnt/jwh83-data/Orthopaedic_QuPath/QuPath_Project/CSV"
os.makedirs(output_dir, exist_ok=True)

# Name of the per-slide CSVs expected inside every subfolder
BEFORE_NAME = "single_cell_before_redsea.csv"
AFTER_NAME = "single_cell_after_redsea.csv"

# Destination files for the combined tables
before_out = os.path.join(output_dir, "single_cell_before_redsea_ALL.csv")
after_out = os.path.join(output_dir, "single_cell_after_redsea_ALL.csv")

# Accumulators for the per-slide DataFrames
before_list = []
after_list = []

def load_and_tag(csv_path: str, slide_name: str) -> pd.DataFrame:
    """
    Read one per-slide CSV and stamp every row with where it came from.

    Parameters:
        csv_path (str): Path of the CSV file to read.
        slide_name (str): Slide identifier (the parent subfolder name).

    Returns:
        DataFrame: The CSV contents with a 'slide_name' column, a 'FILE' column
        holding the CSV's base name, and — when a 'CellID' column exists — its
        values rewritten to "{slide_name}_{original}". A warning is printed
        when 'CellID' is absent.
    """
    table = pd.read_csv(csv_path)

    # Tag each row with the slide it belongs to
    table["slide_name"] = slide_name

    # Prefix the cell IDs with the slide name; warn instead when there is no CellID column
    if "CellID" not in table.columns:
        print(f"⚠️ Missing 'CellID' column in: {csv_path}")
    else:
        prefix = slide_name + "_"
        table["CellID"] = prefix + table["CellID"].astype(str)

    # Provenance: remember which file the rows were read from
    table["FILE"] = os.path.basename(csv_path)

    return table

# === ITERATE SUBFOLDERS ===
# Entries of input_root are visited in sorted name order; each directory is treated as one slide.
for entry in sorted(os.listdir(input_root)):
    slide_dir = os.path.join(input_root, entry)
    if not os.path.isdir(slide_dir):
        continue  # non-directory entries are skipped

    # The subfolder name is used as the slide identifier
    slide_name = entry
    before_path = os.path.join(slide_dir, BEFORE_NAME)
    after_path = os.path.join(slide_dir, AFTER_NAME)

    # BEFORE table: report a missing file; otherwise load it, logging any failure and moving on
    if not os.path.exists(before_path):
        print(f"⚠️ Missing BEFORE file for {slide_name}: {before_path}")
    else:
        try:
            print(f"Processing BEFORE: {before_path}")
            before_list.append(load_and_tag(before_path, slide_name))
        except Exception as err:
            print(f"⚠️ Error processing {before_path}: {err}")

    # AFTER table: same handling as above
    if not os.path.exists(after_path):
        print(f"⚠️ Missing AFTER file for {slide_name}: {after_path}")
    else:
        try:
            print(f"Processing AFTER:  {after_path}")
            after_list.append(load_and_tag(after_path, slide_name))
        except Exception as err:
            print(f"⚠️ Error processing {after_path}: {err}")

# === COMBINE & SAVE ===
# Stack the per-slide BEFORE tables into one frame and write it out, if any were loaded
if not before_list:
    print("❌ No BEFORE CSV files were processed.")
else:
    before_df = pd.concat(before_list, ignore_index=True)
    before_df.to_csv(before_out, index=False)
    print(f"✅ Combined BEFORE CSV saved to: {before_out}")

# Same for the AFTER tables
if not after_list:
    print("❌ No AFTER CSV files were processed.")
else:
    after_df = pd.concat(after_list, ignore_index=True)
    after_df.to_csv(after_out, index=False)
    print(f"✅ Combined AFTER CSV saved to: {after_out}")
