In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.stats.multitest as smm
from matplotlib.backends.backend_pdf import PdfPages


In [25]:
# Run the Benjamini-Hochberg procedure separately for each TF (column),
# using fdrcorrection from statsmodels.stats.multitest.
def run_bh(pValDf, alpha=0.1):
    """Apply the Benjamini-Hochberg FDR correction to every column independently.

    Parameters
    ----------
    pValDf : pandas.DataFrame, or a path / file-like object
        Table of (possibly signed) p-values with one column per TF.  Anything
        that is not a DataFrame is read with
        ``pd.read_csv(..., sep='\\t', index_col=0)``, so callers may pass either
        the loaded table or the TSV file it lives in.
    alpha : float, default 0.1
        Family-wise FDR level passed to ``fdrcorrection``.

    Returns
    -------
    (adjustedPVals, actRej) : tuple of pandas.DataFrame
        Both have the index/columns of the input.  ``adjustedPVals`` holds the
        BH-adjusted p-values, ``actRej`` holds True/False rejection decisions.
        Entries whose input p-value was NaN stay NaN in both frames.
    """
    # This notebook calls run_bh with a DataFrame in one cell and with a file
    # path in another; accept both so the result does not depend on which
    # definition of run_bh was executed last.
    if not isinstance(pValDf, pd.DataFrame):
        pValDf = pd.read_csv(pValDf, sep='\t', index_col=0)

    # Only the magnitude is a p-value; the sign is dropped before correcting.
    pValDf = np.abs(pValDf)

    adjustedPVals = pd.DataFrame(np.nan, index=pValDf.index, columns=pValDf.columns, dtype=float)
    # Object dtype so booleans and NaN ("not tested") can coexist in one column.
    actRej = pd.DataFrame(np.nan, index=pValDf.index, columns=pValDf.columns, dtype=object)

    for tf, pVals in pValDf.items():
        pVals = pVals.dropna()
        if pVals.empty:
            # Nothing was tested for this TF; leave the whole column as NaN.
            continue

        # is_sorted=False lets fdrcorrection do the ordering itself and hand the
        # results back in input order, so they line up with pVals.index.
        reject, adj_p_val = smm.fdrcorrection(
            pVals.to_numpy(dtype=float), alpha=alpha, method='indep', is_sorted=False
        )

        adjustedPVals.loc[pVals.index, tf] = adj_p_val
        actRej.loc[pVals.index, tf] = reject

    return adjustedPVals, actRej

In [28]:
def run_all_bh(p_value_file, alpha=0.1):
    """Apply one Benjamini-Hochberg correction across every p-value in the table.

    Unlike a per-column correction, all non-missing entries of the table are
    pooled and corrected together as a single family of tests.

    Parameters
    ----------
    p_value_file : path / file-like object, or pandas.DataFrame
        Tab-separated table of (possibly signed) p-values whose first column is
        the row index.  An already loaded DataFrame is accepted as well.
    alpha : float, default 0.1
        FDR level passed to ``fdrcorrection``.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as the input; each entry is True/False (hypothesis
        rejected or not), or NaN where the input p-value was missing.
    """
    if isinstance(p_value_file, pd.DataFrame):
        pval_df = p_value_file
    else:
        pval_df = pd.read_csv(p_value_file, sep='\t', index_col=0)

    # Row-major flattening; the same order is restored by reshape() below.
    flat_pvals = np.abs(pval_df).to_numpy(dtype=float).ravel()
    observed = ~np.isnan(flat_pvals)

    # Object array so NaN ("no p-value") and booleans can share one container.
    # A boolean-mask assignment replaces the former element-by-element loop,
    # which wrote booleans into a float64 Series and relied on silent upcasting.
    flat_reject = np.full(flat_pvals.shape, np.nan, dtype=object)
    if observed.any():
        reject, _ = smm.fdrcorrection(
            flat_pvals[observed], alpha=alpha, method='indep', is_sorted=False
        )
        flat_reject[observed] = reject

    reject_df = pd.DataFrame(
        flat_reject.reshape(pval_df.shape), index=pval_df.index, columns=pval_df.columns
    )
    return reject_df

In [60]:
# # Run the Benjamini-Hochberg procedure separately for each TF (column),
# # using multipletests from statsmodels.stats.multitest.
# def run_bh_m(pValDf):
#     pValDf = np.abs(pValDf)
#     adjustedPVals = pd.DataFrame(columns=pValDf.columns, index=pValDf.index)
#     actRej = pd.DataFrame(columns=pValDf.columns, index=pValDf.index)
# 
#     for (tf, pVals) in pValDf.items():
#         pVals = pVals.dropna()
#         pV_df = pd.DataFrame(index=pVals.index, columns=['pVal'], data=pVals.values)
#         pV_df.sort_values(by='pVal', inplace=True)
# 
#         reject, adj_p_val, _, _ = smm.multipletests(pV_df['pVal'].values, method='fdr_bh', alpha=0.1, is_sorted=True)
#         pV_df[tf] = adj_p_val
# 
#         adjustedPVals.loc[pV_df.index.values, tf] = pV_df[tf].values
#         actRej.loc[pV_df.index.values, tf] = reject
# 
#     return adjustedPVals, actRej

In [19]:
# p_val_file = 'data/simulated_pval_v2_no_shuffle.tsv'
# 
# pval_df = pd.read_csv(p_val_file, sep='\t', index_col=0)
# adjP, acptRej = run_bh(pval_df)
# 
# acptRej.replace(True, 1, inplace=True)
# acptRej.replace(False, 0, inplace=True)
# 
# # Sum all values of acptRej
# count = acptRej.sum().sum()
# print(f" Number of True are: {count}")

 Number of True are: 2524


In [1]:
# p_val_file = 'data/p-values_1000000_v2_shuffle.tsv'
# 
# pval_df = pd.read_csv(p_val_file, sep='\t', index_col=0)
# adjP, acptRej = run_bh_m(pval_df)
# 
# acptRej.replace(True, 1, inplace=True)
# acptRej.replace(False, 0, inplace=True)
# 
# # Sum all values of acptRej
# count = acptRej.sum().sum()
# print(f" Number of True are: {count}")

In [21]:
# # Run Benjamini-Hochberg procedure for all TFs together
# p_val_file = 'data/simulated_pval_v2_no_shuffle.tsv'
# pval_df = pd.read_csv(p_val_file, sep='\t', index_col=0)
# pval_df = np.abs(pval_df)
# 
# # Reshape pval_df to a 1D array
# pvals = pval_df.stack().reset_index()
# pvals.columns = ['cell', 'tf', 'pval']
# 
# # Run Benjamini-Hochberg procedure
# reject, adj_p_val = smm.fdrcorrection(pvals['pval'].values, method='i', alpha=0.1, is_sorted=False)
# 
# # Reshape reject to a 2D array original shape
# reject = reject.reshape(pval_df.shape)
# reject_df = pd.DataFrame(index=pval_df.index, columns=pval_df.columns, data=reject)
# reject_df.replace(True, 1, inplace=True)
# reject_df.replace(False, 0, inplace=True)
# 
# # Sum all values of reject_df
# count = reject_df.sum().sum()
# print(f" Number of True are: {count}")

 Number of True are: 1064


## Plot UMAP using p-values and Benjamini-Hochberg procedure

In [18]:
# Inputs and output of the UMAP overview plots.
p_value_file = 'data/5knormalized_pval_1m.tsv'
umap_file = 'data/_umap.tsv'
dest_file = 'data/plots1m.pdf'
MAX_TF_PLOTS = 3  # only the TFs with the most rejected cells are plotted

p_value_df = pd.read_csv(p_value_file, sep='\t', index_col=0)
umap_df = pd.read_csv(umap_file, sep='\t', index_col=0)

# run_bh is called with the loaded DataFrame here (not with the file path).
adjPVal, acptRej = run_bh(p_value_df)

# Sort columns of acptRej by number of True values (most rejections first)
acptRej = acptRej[acptRej.sum().sort_values(ascending=False).index]

# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name is deprecated; pick whichever this installation provides.
# The style is applied once, before any figure exists, so all pages match.
style_name = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(style_name)

figures = []

# zip(range(...), ...) stops after MAX_TF_PLOTS columns without a manual counter.
for _, (tft, cells) in zip(range(MAX_TF_PLOTS), acptRej.items()):
    # Attach this TF's rejection flags and signed p-values to the UMAP coordinates.
    cell_coord = umap_df.merge(cells, left_index=True, right_index=True)
    cell_coord['p-value'] = p_value_df[tft]

    # Signed -log(p): the magnitude is the significance, the sign is kept from
    # the signed p-value in the input table.
    signed_p = cell_coord['p-value']
    cell_coord['logp-value'] = np.sign(signed_p) * -np.log(np.abs(signed_p))

    # Cells whose hypothesis was not rejected (False or NaN) are drawn at 0.
    # .eq(True) is deliberate: the column is object dtype with True/False/NaN.
    cell_coord['logp-value'] = np.where(cell_coord[tft].eq(True), cell_coord['logp-value'], 0)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(tft)
    ax.set_xlabel('UMAP1')
    ax.set_ylabel('UMAP2')
    points = ax.scatter(
        x=cell_coord['UMAP1'],
        y=cell_coord['UMAP2'],
        c=cell_coord['logp-value'],
        cmap='coolwarm',
        s=1,
    )
    fig.colorbar(points, ax=ax)

    figures.append(fig)

# One PDF page per TF.
with PdfPages(dest_file) as pdf:
    for fig in figures:
        pdf.savefig(fig)

plt.close('all')

  plt.style.use('seaborn-whitegrid')
  plt.style.use('seaborn-whitegrid')
  plt.style.use('seaborn-whitegrid')


In [27]:
def run_bh(p_value_file, alpha=0.1):
    """Run a per-column Benjamini-Hochberg FDR correction on a p-value table.

    Parameters
    ----------
    p_value_file : path / file-like object, or pandas.DataFrame
        Tab-separated table of (possibly signed) p-values, first column used
        as the row index, one column per TF.  An already loaded DataFrame is
        accepted too, because another cell of this notebook calls ``run_bh``
        with the DataFrame rather than the path.
    alpha : float, default 0.1
        FDR level passed to ``fdrcorrection``.

    Returns
    -------
    (adjustedPVals, actRej) : tuple of pandas.DataFrame
        Frames shaped like the input: BH-adjusted p-values and True/False
        rejection decisions.  Positions with a missing p-value remain NaN.
    """
    if isinstance(p_value_file, pd.DataFrame):
        pValDf = p_value_file
    else:
        pValDf = pd.read_csv(p_value_file, sep='\t', index_col=0)

    # The sign carries no significance information; correct the magnitudes.
    pValDf = np.abs(pValDf)

    adjustedPVals = pd.DataFrame(np.nan, index=pValDf.index, columns=pValDf.columns, dtype=float)
    # Object dtype: each cell is True, False, or NaN when no p-value was given.
    actRej = pd.DataFrame(np.nan, index=pValDf.index, columns=pValDf.columns, dtype=object)

    for tf, pVals in pValDf.items():
        tested = pVals.dropna()
        if tested.empty:
            # No p-values for this TF: keep its column entirely NaN.
            continue

        # With is_sorted=False the results come back in the order of the input,
        # so no manual sorting / re-alignment is required.
        reject, adj_p_val = smm.fdrcorrection(
            tested.to_numpy(dtype=float), alpha=alpha, method='indep', is_sorted=False
        )

        # Write the results back into the rows that actually had a p-value.
        adjustedPVals.loc[tested.index, tf] = adj_p_val
        actRej.loc[tested.index, tf] = reject

    return adjustedPVals, actRej

In [35]:
p_value_file = 'data/5knormalized_pval_1m.tsv'

# Encode the pooled BH decisions numerically in a single, idempotent step:
# 1.0 = hypothesis rejected, 0.0 = not rejected or p-value missing (NaN).
# This replaces three chained in-place replace() calls on an object-dtype frame.
reject = run_all_bh(p_value_file).eq(True).astype(float)

# Total number of rejected hypotheses across the whole table.
print(reject.sum().sum())

# Rejections per column (axis=0) and per row (axis=1).
hashFX0 = reject.sum(axis=0).to_frame()
hashFX1 = reject.sum(axis=1).to_frame()


608.0


In [34]:
p_value_file = 'data/5knormalized_pval_1m.tsv'
adjP, acptRej = run_bh(p_value_file)

# Encode the per-TF BH decisions numerically in a single, idempotent step:
# 1.0 = hypothesis rejected, 0.0 = not rejected or p-value missing (NaN).
# This replaces three chained in-place replace() calls on an object-dtype frame.
acptRej = acptRej.eq(True).astype(float)

# Total number of rejected hypotheses across all TFs.
count = acptRej.sum().sum()
print(f" Number of True are: {count}")

# Rejections per column (axis=0) and per row (axis=1).
hashX0 = acptRej.sum(axis=0).to_frame()
hashX1 = acptRej.sum(axis=1).to_frame()


 Number of True are: 10194.0
