In [None]:
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF

In [None]:
# Extract the new sigmas from the UNC232 PAM50 training set.
# The UNC232 gene expression matrix and the Immunohistochemistry (IHC) classes
# are provided as tab-separated files; the first column of each file is used
# as the row index.
gene_expresion = pd.read_csv('./UNC232_gene_expresion.txt', sep='\t', index_col=0)
groups = pd.read_csv('./UNC232_groups.txt', sep='\t', index_col=0)

In [None]:
# Preview the first rows of the training expression matrix.
gene_expresion.head()

In [None]:
# Preview the first rows of the IHC label table.
groups.head()

In [None]:
# List the distinct IHC labels found anywhere in the label table
# (all columns flattened together, missing values dropped).
pd.Series(groups.values.ravel()).dropna().unique()

In [None]:
def get_sigma(gene_expresion, groups):
    """Build quantile dataframe for each IHC group.

    For each gene (row of ``gene_expresion``) and each IHC label found in
    ``groups``, the ECDF of the gene's expression within the samples carrying
    that label is evaluated at the gene's median over all samples. The result
    is the quantile of the group distribution at which the overall median of
    the gene sits.

    :param gene_expresion: Gene expresion dataframe, i.e. UNC232, with genes
      on the rows and samples on the columns.
    :param groups: IHC label dataframe indexed by sample. Every non-missing
      value found in any of its columns is treated as a group label.

    :return: Quantile dataframe with the genes as index and one column per
      IHC label.
    """
    groups_cols = pd.Series(groups.values.ravel()).dropna().unique()
    res = pd.DataFrame({}, columns=groups_cols, index=gene_expresion.index)

    # The samples belonging to each label do not depend on the gene, so they
    # are resolved once instead of once per gene.
    samples_by_label = [
        (label, groups.loc[groups[col] == label].index)
        for col in groups
        for label in groups[col].dropna().unique()
    ]

    for name, values in gene_expresion.iterrows():
        overall_median = values.median()
        for label, samples_from_group in samples_by_label:
            subset = values[samples_from_group].dropna()
            # Single-step .loc write: chained ``res[label][name] = ...`` may
            # assign into a temporary copy and is a no-op under pandas
            # Copy-on-Write.
            res.loc[name, label] = ECDF(subset)(overall_median)
    return res

In [None]:
# Compute the quantile (sigma) table on the training set and sort its
# columns (the IHC labels) by name.
sigma = get_sigma(gene_expresion, groups).sort_index(axis=1)

In [None]:
# Preview the first rows of the quantile table.
sigma.head()

In [None]:
# Save the quantile table as a tab-separated file.
sigma.to_csv('./SIGMAS_UNC232_v4.0.txt', sep='\t')

In [None]:
# Load the dataset to be subtyped (tab-separated, first column used as index).
# The '/path/to/...' locations are placeholders: point them to your own files.
gene_expr = pd.read_csv('/path/to/PAM50genes_for_subtyping.txt', sep='\t', index_col=0)
classes = pd.read_csv('/path/to/IHC_class_by_Sample.txt', sep='\t', index_col=0)

# Dataset for subtyping: gene expression and class format
- PAM50genes_for_subtyping: expression matrix with the PAM50 genes in rows (gene symbols in the same format as in the provided UNC232_gene_expresion.txt) and samples in columns. Data should be properly normalized and log2-transformed, but not gene-centered.
- IHC_class_by_Sample: table indexed by sample, with a column named `IHC_class` giving the immunohistochemistry (IHC) subgroup of each breast cancer sample to be subtyped (i.e., ERpos_HER2neg, HER2pos_ERneg, HER2pos_ERpos, TNBC)

In [None]:
# Preview the first rows of the expression matrix to be subtyped.
gene_expr.head()

In [None]:
# Preview the first rows of the per-sample IHC class table.
classes.head()

In [None]:
def quantile_centering(expr_matrix, gene_quantile):
    """Do row centering based on the quantile and IHC group.

    For every gene, the value found at the requested quantile of that gene's
    row is subtracted from the whole row. ``expr_matrix`` itself is left
    untouched.

    :param expr_matrix: pandas.DataFrame where rows are genes and columns are
      samples.
    :param gene_quantile: pandas.Series (or any object supporting
      ``.loc[gene]``) containing all the genes in the first parameter and the
      value of the quantile to be used, i.e. .5 if one wants to do row
      centering using the median.

    :return: Subgroup-specific centered dataframe, with the same index and
      columns as ``expr_matrix``.
    :raises KeyError: if a gene of ``expr_matrix`` is missing from
      ``gene_quantile``.
    """
    # One centering value per gene, kept in the row order of expr_matrix.
    q_values = pd.Series(
        [
            values.quantile(gene_quantile.loc[name])
            for name, values in expr_matrix.iterrows()
        ],
        index=expr_matrix.index,
        dtype=float,
    )
    # A single row-wise subtraction returns a new frame and upcasts integer
    # input to float instead of writing floats back into an integer frame.
    return expr_matrix.sub(q_values, axis=0)

In [None]:
# Center each IHC class of the test set on its own, using the per-gene
# quantiles obtained for that same class on the training set.
ihc_labels = classes['IHC_class']
centered = []
for class_ in ihc_labels.unique():
    samples = classes.index[ihc_labels == class_]
    centered.append(quantile_centering(gene_expr[samples], sigma[class_]))
# Put the class-wise blocks side by side again, one column per sample.
result = pd.concat(centered, sort=False, axis=1)
result

In [None]:
# Save the subgroup-centered expression matrix as a tab-separated file.
result.to_csv('./PAM50genes_normalized.txt', sep='\t')