# Data Integration

The integration script is provided for reference only; the integrated dataframe can be downloaded from Zenodo: <https://doi.org/10.5281/zenodo.7586958>

In [1]:
import os, re, sys
import fcsparser
import pandas as pd

sys.path.insert(0, "../..")
from scripts.aux_functions import *
from scripts.__init__ import non_marker


In [None]:
# Import raw data
# NOTE: base_dir is machine-specific; point it at the local copy of the analysis folder.
base_dir = "/Users/xiaoqin/Dropbox/TAPE LAB/Manuscripts/Qin & Cardoso Rodriguez et al/SupplementaryMaterials/Qin-CardosoRodriguez-et-al_analysis/Figure3-4_S3-S4/Ligand Time-course"
input_dir = f"{base_dir}/preprocessed_data"
output_dir = f"{base_dir}/output"
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort them so that the row
# order of the integrated dataframe is reproducible across runs and machines.
filelist = sorted(f for f in os.listdir(input_dir) if f.endswith(".txt"))
filelist


In [None]:
# ---
# Integration
# ---

# Naming convention:
# batch_experiment_barcode_time-point_media_replicate
# e.g. "B4_Ligand Time-course_001_6h_Ctrl_1.txt"

# Read each file, annotate it with the metadata encoded in its file name and
# collect the frames, so that they are concatenated once instead of growing
# the dataframe inside the loop.
frames = []
for f in filelist:
    fields = f.split("_")
    batch = fields[0]
    barcode = fields[2]
    time_point = fields[3]
    media = fields[4]
    replicate = fields[-1].split(".")[0]  # strip the file extension

    df_new = pd.read_csv(f"{input_dir}/{f}", sep="\t").assign(
        batch=batch,
        barcode=barcode,
        time_point=time_point,
        media=media,
        replicate=replicate,
        culture=f"{time_point}_{media}",
        condition=f"{time_point}_{media}_{replicate}",
    )
    frames.append(df_new)

# Fall back to an empty frame when no input file was found (pd.concat rejects
# an empty list).
integrated = pd.concat(frames) if frames else pd.DataFrame()

# integrated = rename_panel(integrated)  # Remove the version number etc.
integrated = integrated.rename(
    columns=lambda x: re.sub(r"_v[0-9]+$", "", x)
)  # Remove the version number
integrated.to_csv(f"{output_dir}/integrated_dataframe_all-cells.csv", index=False)

# Quick look at the result (only the last expression of a cell is displayed)
integrated.head()


# EMD Calculation

In [2]:
import os, sys
import pandas as pd
import scprep
from scipy.stats import zscore

sys.path.insert(0, "../..")
from scripts.aux_functions import *
from scripts.__init__ import non_marker


In [5]:
# Locations used by the EMD calculation
base_dir = "/Users/xiaoqin/Dropbox/TAPE LAB/Manuscripts/Qin & Cardoso Rodriguez et al/SupplementaryMaterials/Qin-CardosoRodriguez-et-al_analysis/Figure3-4_S3-S4/Ligand Time-course"

# EMD results are written here; create the folder if it is missing
output_dir = f"{base_dir}/output/emd"
os.makedirs(f"{base_dir}/output/emd", exist_ok=True)

# Folder holding the integrated dataframe
# NB: integrated dataframe can be accessed at <10.5281/zenodo.7586958>
input_dir = "/Users/xiaoqin/Dropbox/TAPE LAB/Manuscripts/Qin & Cardoso Rodriguez et al/SupplementaryMaterials/Data Share/CyTOFdataframes"

In [6]:
# Derive full_panel and metadata_columns from the columns of the integrated dataframe
df = pd.read_csv(f"{input_dir}/Ligand-Time-course_all-cells.csv")
df_columns = df.columns.values.tolist()

# Discard every column listed in non_marker
column_filtered = [col for col in df_columns if col not in non_marker]

# Remaining columns whose name starts with a digit make up the panel;
# whatever is left over is treated as metadata
full_panel = [col for col in column_filtered if col[0].isdigit()]
metadata_columns = [col for col in column_filtered if col not in full_panel]


In [None]:
# EMD calculation
# For every time point, compare each condition against the control culture of
# the same time point and write one wide table (one column per marker) to disk.
time_points = ["6h", "12h", "24h", "48h"]

for time_point in time_points:

    # Cells of this time point, with the panel columns arcsinh-transformed
    # (the 5 is presumably the arcsinh cofactor -- confirm in aux_functions)
    subset_df = df.loc[df["time_point"] == time_point]
    subset_df = arcsinh_transf(subset_df, full_panel, 5)

    emd_ref = subset_df.loc[
        subset_df["culture"] == f"{time_point}_Ctrl"
    ]  # for each time point

    # Containers handed to calculate_emd; the same objects are passed for
    # every condition of this time point
    emd_df = pd.DataFrame()
    emd_infodict = {}

    # Get all experimental conditions for which EMD is going to be calculated
    conditions = subset_df["condition"].unique().tolist()
    if not conditions:
        # Fail with a clear message instead of an opaque error further down
        raise ValueError(f"No cells found for time point {time_point!r}")

    # Compute EMD per condition and compile the results with a single concat
    emd_results = []
    for condition in conditions:
        compare_from = subset_df.loc[subset_df["condition"] == condition]

        df_new = calculate_emd(
            full_panel,  # panel columns (neither in non_marker nor metadata)
            compare_from,
            emd_ref,  #!
            metadata_columns,
            emd_df,
            emd_infodict,
        )
        emd_results.append(df_new)
    emd_df_master = pd.concat(emd_results)

    # Long to wide transformation
    emd_df_master_wide = emd_df_master.pivot_table(
        index=metadata_columns,
        columns="marker",
        values="EMD_arc",
    ).reset_index()
    emd_df_master_wide.to_csv(
        f"{output_dir}/emd_df_master_{time_point}_no-norm.csv",
        index=False,
    )

    # NOTE: the disabled block below is kept for reference; output_dir already
    # ends in "/emd", so its output path would nest "emd" twice if re-enabled.
    # # Z-score normalisation
    # # retrieve data and metadata
    # emd_df_master_wide_metadata = emd_df_master_wide[metadata_columns].reset_index()
    # emd_df_master_wide_data = master_df_subset(emd_df_master_wide, full_panel)

    # # z-score
    # emd_df_master_wide_data_scaled = emd_df_master_wide_data.apply(zscore).reset_index()
    # emd_df_master_wide_data_scaled = pd.merge(
    #     emd_df_master_wide_data_scaled, emd_df_master_wide_metadata, on="index"
    # )
    # emd_df_master_wide_data_scaled.drop(columns=["index"], inplace=True)
    # emd_df_master_wide_data_scaled.to_csv(
    #     f"{output_dir}/emd/emd_df_master_{time_point}_z-score.csv",
    #     index=False,
    # )

In [None]:
# Concatenate EMD scores
# Collect the per-time-point result files, e.g. "emd_df_master_6h_no-norm.csv".
# The 4th "_"-separated field carries the time point; checking it against
# time_points also excludes the concatenated file written at the end of this
# cell. Files are ordered by time point so the output is deterministic
# (os.listdir gives no ordering guarantee).
nonorm_dataset = sorted(
    (
        f
        for f in os.listdir(f"{output_dir}")
        if f.startswith("emd_df_master")
        and "no-norm" in f
        and len(f.split("_")) > 3  # guard against names with too few fields
        and f.split("_")[3] in time_points
    ),
    key=lambda f: (time_points.index(f.split("_")[3]), f),
)

# Read into a separate name so the single-cell dataframe `df` loaded earlier
# is not overwritten, then concatenate once.
emd_parts = [pd.read_csv(f"{output_dir}/{f}") for f in nonorm_dataset]
emd_concat_nonorm = pd.concat(emd_parts) if emd_parts else pd.DataFrame()

emd_concat_nonorm.to_csv(
    f"{output_dir}/emd_df_master_concat_no-norm.csv", index=False
)