# Create df for heatmap (PDO27)

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scprep
import phate

from sklearn.decomposition import PCA
from MultiscaleEMD import MetricTree
import sklearn
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import OneHotEncoder

data_path = "/Users/mariaramos/Dropbox/Merged_files/"
#!ls -lah $data_path
file_name = "Metadata_final"

In [2]:
#Load metadata from file
raw_df = pd.read_pickle(data_path + file_name)
raw_df['Batch'] = raw_df['Batch'].apply(str)

In [3]:
"""Variable: select for specific subsets of the data"""

culture_mask = ((raw_df['Cell_type'] == 'PDOs'))
patient_mask = ((raw_df['Patient'] == '27'))
batch_mask = ((raw_df['Batch'] == '1'))
plate_mask = ((raw_df['Plate'] == 'SLV'))

data_masked = raw_df.loc[culture_mask & patient_mask] 


In [4]:
#Clean-up: removing pPKCa to remove NaN values - downsample 

data_masked.drop('pPKCa', axis=1, inplace=True)
# data_masked = data_masked.sample(n=1000000, random_state=1, replace=False)
data_masked.index = np.arange (data_masked.shape[0])

#Creates a tag for normalization that includes Patient + Date
def col_join(arr):
    """ Joins columns of a dataframe into a single column with underscores """
    return "_".join(np.array(arr).astype("str"))
    
norm_tags = data_masked.iloc[:, -9:-7].T.apply(col_join).rename("Norm_tag")
df_with_tags = pd.concat([data_masked, norm_tags], axis=1)

#We need to split the data to normalise the numerical columns
data_for_norm = df_with_tags.iloc[:, :-10]
non_numerical = df_with_tags.iloc[:, -10:-1]

# Arcsinh transformation and batch normalisation
data_arcs = np.arcsinh(data_for_norm / 5)
data_centered = scprep.normalize.batch_mean_center(
    data_arcs.copy(), sample_idx=df_with_tags["Norm_tag"]
)

#Re-merge the non-numerical values to the normalised data
full_centered = pd.concat([data_centered, non_numerical], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [None]:
# Build a canonical list of names for each sample so we can order by this in
# all subsequent processing

full_names = full_centered.iloc[:, -9:].T.apply(col_join).rename("Full_name")
df_with_names = pd.concat([full_centered, full_names], axis=1)

metadata = df_with_names.iloc[:, -10:]
data = full_centered.iloc[:, :-9]

full_data = pd.concat([metadata, data], axis=1)

In [None]:
mean_locs = (
    df_with_names.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)

meta = mean_locs.index.to_frame()

# Sorting is necessary so that np.unique works for labeling the samples on the tree
# meta contains one line per sample and the metadata table associated with that sample
meta = meta.reset_index(drop=True).sort_values("Full_name").reset_index(drop=True)

In [None]:
non_sig_markers = [
    "pHH3",
    "RFP",
    "mCHERRY",
    "Vimentin",
    "EpCAM",
    "CK18",
    "Pan_CK",
    "GFP",
    "IdU",
    "cCaspase_3",
    "Geminin",
    "pRB",
    "PLK",
    "CHGA",
    "CD90",
    "cPARP",
    "Cyclin_B1",
]
data_sig = data.drop(
    non_sig_markers,
    axis=1,
    inplace=False,
)
#Cell_type markers or problematic antibodies (cCaspase_3)
cell_type_markers = [
    "RFP",
    "mCHERRY",
    "Vimentin",
    "EpCAM",
    "CK18",
    "Pan_CK",
    "GFP",
    "CHGA",
    "CD90",
    "cCaspase_3"
]


data_all = data.drop(
    cell_type_markers,
    axis=1,
    inplace=False,
)

In [None]:
# Build a subtree for each leaf phase
unique, inverse = np.unique(full_names, return_inverse=True)
assert np.array_equal(unique, meta["Full_name"].values)

thresholds = {
    "20210330_27": {
        "pRB": -0.8,
        "IdU": 1,
        "pHH3": 3,
        "Cyclin_B1": -1,
        "cPARP": -0.8,
        "pHistone_H2A":-2.3
    },
    "20211122_27": {
        "pRB": -0.5,
        "IdU": 1.8,
        "pHH3": 3,
        "Cyclin_B1": -1,
        "cPARP": -0.8,
        "pHistone_H2A":-1.2
    },
}
discretes = []
for norm_tag in df_with_tags["Norm_tag"].unique():
    mask = df_with_tags["Norm_tag"] == norm_tag
    discretes.append(
        pd.concat(
            [data_centered[mask][gene] > thresh for gene, thresh in thresholds[norm_tag].items()],
            axis=1,
        )
    )
discrete = pd.concat(discretes, axis=0).reindex(data.index)

tree = {
    "S_phase": discrete["pRB"] & discrete["IdU"],
    "M_phase": discrete["pRB"] & ~discrete["IdU"] & discrete["pHH3"],
    "G2_phase": discrete["pRB"] & ~discrete["IdU"] & ~discrete["pHH3"] & discrete["Cyclin_B1"],
    "G1_phase": discrete["pRB"] & ~discrete["IdU"] & ~discrete["pHH3"]  & ~discrete["Cyclin_B1"],
    "Apoptosis": ~discrete["pRB"] & discrete["cPARP"],
    "G0_phase": ~discrete["pRB"] & ~discrete["cPARP"],
}

df_tree = pd.DataFrame(tree)
leaf_phases = ["S_phase", "M_phase", "G2_phase", "G1_phase", "G0_phase", "Apoptosis"]

proportions = (
    pd.concat([metadata, df_tree], axis=1)
    .groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
        ]
    )
    .mean()
    .sort_values("Full_name")
    .reset_index(drop=True)
)

# Encodes
onehot = OneHotEncoder(sparse=True)
labels = onehot.fit_transform(inverse.reshape(-1, 1))

# TODO this has changed in v6
d = np.array(labels.sum(axis=0)).flatten()
labels_normed = labels.tocoo()
labels_normed.data = labels_normed.data / d[labels_normed.col]
labels_normed = labels_normed.tocsr()


def l1_embeddings(cts, edge_weights):
    return np.array(
        [np.asarray(cts)[i, :] * np.asarray(edge_weights) for i in range(len(cts))]
    )


def leaf_runner(
    data, labels, tree_type, n_trees, norm_per_subtree=False, random_state=42, **kwargs
):
    """Creates tree embeddings for each sample based on tree parameters.

    Parameters:

        data: [# cells x # features] data matrix
        labels: [# cells x # distributions] (potentially sparse) describing membership of cells to distributions
        tree_type: type of tree to build over the features
        n_trees: how many trees to build
        norm_per_subtree: whether to treat each subtree as a separate distribution,
                          this essentially weights each subtree equally, rather than weighting
                          based on how many cells are in each subtree (default)

    Returns:
        leaf_embeds: [# distributions x (n_trees x n_nodes)] embeddings one per distribution where L1 distrance
                     between embeddings represents tree EMD
        leaf_trees: Tree objects for each tree
        leaf_ids: Leaf label for each tree node [n_nodes] containing the strings of the leaf_phases
    """
    leaf_embeds = []
    leaf_trees = []
    leaf_ids = []
    # note that we only build a tree for each leaf phase leaving out proliferating vs. not
    rs = random_state
    for leaf in leaf_phases:
        mask = np.array(df_tree[leaf])
        sub_data = data[mask]
        sub_labels = labels[mask]
        if norm_per_subtree:
            d = np.array(sub_labels.sum(axis=0)).flatten()
            # Fix divide by zero errors
            d = np.clip(d, a_min=1e-8, a_max=None)
            sub_labels = sub_labels.tocoo()
            sub_labels.data = sub_labels.data / (d[sub_labels.col])
            sub_labels = sub_labels.tocsr()
        embeds = []
        mts = []
        for i in range(n_trees):
            mt = MetricTree(tree_type=tree_type, random_state=rs, **kwargs)
            counts, edge_weights = mt.fit_transform(
                X=sub_data,
                y=sub_labels,
            )
            embeds.extend(l1_embeddings(counts.todense(), edge_weights).T)
            mts.append(mt)
        embeds = np.array(embeds).T
        leaf_embeds.append(embeds)
        leaf_trees.append(mts)
        leaf_ids.append([leaf] * embeds.shape[1])
        rs += 1
    leaf_embeds = np.concatenate(leaf_embeds, axis=1)
    leaf_ids = np.concatenate(leaf_ids)
    return leaf_embeds, leaf_trees, leaf_ids


def tree_runner(data, labels, tree_type, n_trees, random_state=42, **kwargs):
    """Creates tree embeddings for each sample based on tree parameters.

    This ignores known cell state structure and simply builds a tree over the entire dataset.

    Parameters:

        data: [# cells x # features] data matrix
        labels: [# cells x # distributions] (potentially sparse) describing membership of cells to distributions
        tree_type: type of tree to build over the features
        n_trees: how many trees to build
        norm_per_subtree: whether to treat each subtree as a separate distribution,
                          this essentially weights each subtree equally, rather than weighting
                          based on how many cells are in each subtree (default)

    Returns:
        leaf_embeds: [# distributions x (n_trees x n_nodes)] embeddings one per distribution where L1 distrance
                     between embeddings represents tree EMD
        leaf_trees: Tree objects for each tree
    """
    embeds = []
    mts = []
    for i in range(n_trees):
        mt = MetricTree(tree_type=tree_type, random_state=random_state + i, **kwargs)
        counts, edge_weights = mt.fit_transform(
            X=data,
            y=labels,
        )
        embeds.extend(l1_embeddings(counts.todense(), edge_weights).T)
        mts.append(mt)
    embeds = np.array(embeds).T
    return embeds, mts


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

In [15]:
def calc_fold_change(condition):
    index = meta.loc[:, [ "Patient", "Concentration", "Culture", "Replicate", "Treatment", "Plate", 'Batch']]

    indexed_marker_means = pd.concat([index.set_index(condition.index), condition], axis=1)
    indexed_marker_means.set_index(["Patient", "Concentration", "Culture", "Replicate", "Treatment", "Plate", 'Batch'], inplace=True)
    fold_change_ref = (
        indexed_marker_means.xs("0", level="Concentration").groupby(["Patient", "Culture", "Plate"]).mean()
    )
    fold_change = indexed_marker_means / fold_change_ref
    fold_change = pd.DataFrame(index=indexed_marker_means.index).join(fold_change)
    return fold_change


df_tree_meta = pd.concat([metadata, df_tree, data_arcs['pHistone_H2A']], axis=1)

tree_s_phase = df_tree_meta.loc[df_tree_meta['S_phase'] == True]
tree_m_phase = df_tree_meta.loc[df_tree_meta['M_phase'] == True]
tree_g1_phase = df_tree_meta.loc[df_tree_meta['G1_phase'] == True]
tree_g2_phase = df_tree_meta.loc[df_tree_meta['G2_phase'] == True]
tree_g0_phase = df_tree_meta.loc[df_tree_meta['G0_phase'] == True]
tree_ap_phase = df_tree_meta.loc[df_tree_meta['Apoptosis'] == True]

tree_s_phase = tree_s_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_S'})
tree_m_phase = tree_m_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_M'})
tree_g1_phase = tree_g1_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_G1'})
tree_g2_phase = tree_g2_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_G2'})
tree_g0_phase = tree_g0_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_G0'})
tree_ap_phase = tree_ap_phase.rename(columns={'pHistone_H2A': 'pHistone_H2A_Ap'})

s_means = (
    tree_s_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)
m_means = (
    tree_m_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)
g1_means = (
    tree_g1_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)
g2_means = (
    tree_g2_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)
g0_means = (
    tree_g0_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)
ap_means = (
    tree_ap_phase.groupby(
        [
            "Culture",
            "Date",
            "Treatment",
            "Concentration",
            "Replicate",
            "Cell_type",
            "Patient",
            "Full_name",
            "Batch", 
            "Plate"
        ]
    )
    .mean()
)

tree_all_phases = pd.concat([s_means, m_means, g1_means, g2_means, g0_means, ap_means], axis=1)
tree_all_phases = tree_all_phases.drop(['S_phase', 'M_phase', 'G2_phase', 'G1_phase', 'Apoptosis', 'G0_phase'], axis=1)
tree_all_phases['pHistone_H2A_M'] = tree_all_phases['pHistone_H2A_M'].fillna(0)
tree_all_phases['pHistone_H2A_S'] = tree_all_phases['pHistone_H2A_S'].fillna(0)
fold_change_cs = calc_fold_change(tree_all_phases)
fold_change_cs.to_csv('~/Dropbox/From_python_to_R/pHistone_per_cell_state_PDO5')
fold_change_cs.isnull().values.any()

False

In [16]:
meta_proportions = pd.concat([meta, proportions], axis=1)

fold_change_CS = calc_fold_change(proportions)
fold_change_CS = fold_change_CS.reset_index()

raw_means = (
    pd.concat([full_data["Full_name"], data_arcs], axis=1).groupby("Full_name").mean()
)
raw_means_fc = calc_fold_change(raw_means)
raw_means_fc = raw_means_fc.reset_index()

In [17]:
'''RAW PROPORTIONS'''

raw_prop_H2A = pd.concat([meta_proportions, raw_means_fc['pHistone_H2A']], axis=1)

'''FOLD CHANGE'''

fold_change_H2A = pd.concat([fold_change_CS, raw_means_fc['pHistone_H2A']], axis=1)