In [157]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [158]:
cp_file = '../../data/causal-priors.txt'
sde_file = '../../data/mouse_to_human_normalized5k.tsv'

cpo_df = pd.read_csv(cp_file, sep='\t', header=None, usecols=[0, 1, 2], names=['symbol', 'action', 'targetSymbol'])
cpo_df = cpo_df[cpo_df['action'].isin(['upregulates-expression', 'downregulates-expression'])]
cpo_df.reset_index(drop=True, inplace=True)
cpo_df['isUp'] = np.where(cpo_df['action'] == 'upregulates-expression', 1, -1)
cpo_df.drop(['action'], axis=1, inplace=True)

sde_df = pd.read_csv(sde_file, sep='\t', header=0, index_col=0).T
# sde_df.replace(0, np.nan, inplace=True)
sde_df = pd.DataFrame(zscore(sde_df, nan_policy='omit'), index=sde_df.index, columns=sde_df.columns)
print("Files read complete...")

Files read complete...


In [159]:
cpo_df.head()

Unnamed: 0,symbol,targetSymbol,isUp
0,MAK,KLK3,1
1,XBP1,TPP1,1
2,KLF5,CXCR4,1
3,ATF3,SELE,-1
4,MYC,EIF4G1,1


In [160]:
# There may be some targetSymbols in cpo_df which are not present in sde_df
# Now remove those rows from cpo_df
cpo_df = cpo_df[cpo_df['targetSymbol'].isin(sde_df.columns)]
cpo_df.reset_index(drop=True, inplace=True)
cpo_df.head()

Unnamed: 0,symbol,targetSymbol,isUp
0,XBP1,TPP1,1
1,KLF5,CXCR4,1
2,ATF3,SELE,-1
3,MYC,EIF4G1,1
4,LDHA,HIF1A,1


In [181]:
cpo_df['targetSymbol'].unique()

array(['TPP1', 'CXCR4', 'SELE', ..., 'WNT11', 'DGKA', 'IER2'],
      dtype=object)

In [191]:
cpo_grouped_df = cpo_df.groupby('symbol')['isUp'].apply(list).reset_index(name='upDownList')
cpo_grouped_df['targetList'] = cpo_df.groupby('symbol')['targetSymbol'].apply(list).reset_index(name='targetList')[
    'targetList']
cpo_grouped_df['upDownCount'] = cpo_grouped_df['upDownList'].apply(lambda x: len(x))
max_target = np.max(cpo_grouped_df['upDownCount'])
cpo_grouped_df.head()

Unnamed: 0,symbol,upDownList,targetList,upDownCount
0,A2M,[1],[STAT3],1
1,AATF,"[-1, -1]","[BAX, CTNNB1]",2
2,ABCA1,"[1, 1, -1, -1]","[NR1H2, NR1H3, GLI2, SREBF2]",4
3,ABCA3,"[1, 1, 1, 1, 1]","[SREBF1, GATA6, FOXA2, NFATC3, CEBPA]",5
4,ABCB1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[SP1, RELB, RELA, TCF7L2, FOXO1, EGR1, CEBPB, ...",10


In [192]:
a = cpo_df['targetSymbol'].unique()
b = sde_df.columns.unique()
print(len(a), len(b))

1846 2659


In [193]:
sde_df.head()

Unnamed: 0,ADORA2B,ADORA2A,ELK1,ELK3,ELK4,ABCB4,ABCB8,ABCC3,ABCC4,ABCC1,...,CASP7,RRM1,RRM2,CASP9,CASP4,CASP3,CASP6,CASP2,CASP1,NRCAM
midpoint_AAACCTGCACCCAGTG-1,-0.194738,-0.062922,-0.232096,-0.434963,-0.427517,-0.047009,-0.295866,-0.400151,-0.121682,-0.424398,...,-0.236343,-0.395361,-0.38578,-0.189404,-0.363655,-0.227858,-0.485295,-0.27179,5.956186,-0.024896
midpoint_AAACCTGCAGCTTAAC-1,-0.194738,-0.062922,-0.232096,-0.434963,-0.427517,-0.047009,-0.295866,-0.400151,-0.121682,-0.424398,...,-0.236343,-0.395361,-0.38578,-0.189404,4.168364,-0.227858,-0.485295,-0.27179,-0.366513,-0.024896
midpoint_AAACCTGGTGTGCGTC-1,-0.194738,-0.062922,-0.232096,2.25931,-0.427517,-0.047009,-0.295866,-0.400151,-0.121682,-0.424398,...,-0.236343,-0.395361,5.278767,-0.189404,-0.363655,-0.227858,-0.485295,3.305602,-0.366513,-0.024896
midpoint_AAACCTGGTTGAACTC-1,-0.194738,-0.062922,-0.232096,-0.434963,-0.427517,-0.047009,-0.295866,-0.400151,-0.121682,-0.424398,...,-0.236343,-0.395361,-0.38578,-0.189404,-0.363655,-0.227858,-0.485295,-0.27179,-0.366513,-0.024896
midpoint_AAACGGGAGGATGGTC-1,-0.194738,-0.062922,-0.232096,-0.434963,-0.427517,-0.047009,-0.295866,4.41942,-0.121682,-0.424398,...,-0.236343,-0.395361,-0.38578,-0.189404,-0.363655,-0.227858,-0.485295,-0.27179,-0.366513,-0.024896


In [194]:
total_gene_count = len(sde_df.columns)
ranks = [(i - 0.5) / total_gene_count for i in range(1, total_gene_count + 1)]
ranks

[0.000188040616773223,
 0.0005641218503196691,
 0.000940203083866115,
 0.0013162843174125611,
 0.001692365550959007,
 0.002068446784505453,
 0.002444528018051899,
 0.0028206092515983454,
 0.0031966904851447914,
 0.0035727717186912374,
 0.003948852952237684,
 0.004324934185784129,
 0.004701015419330576,
 0.005077096652877021,
 0.005453177886423468,
 0.005829259119969913,
 0.00620534035351636,
 0.006581421587062806,
 0.0069575028206092515,
 0.007333584054155698,
 0.0077096652877021435,
 0.008085746521248589,
 0.008461827754795035,
 0.008837908988341482,
 0.009213990221887928,
 0.009590071455434375,
 0.00996615268898082,
 0.010342233922527266,
 0.010718315156073712,
 0.011094396389620158,
 0.011470477623166603,
 0.01184655885671305,
 0.012222640090259496,
 0.012598721323805942,
 0.012974802557352389,
 0.013350883790898833,
 0.01372696502444528,
 0.014103046257991726,
 0.014479127491538173,
 0.014855208725084619,
 0.015231289958631064,
 0.01560737119217751,
 0.015983452425723955,
 0.016359

In [195]:
# Generate Distribution
distribution = []
iters = 100

for target in range(1, max_target + 1):
    arr = []
    for i in range(iters):
        amr = np.mean(np.random.choice(ranks, target, replace=False))
        imr = 1 - amr
        arr.append(np.min([amr, imr]))

    distribution.append(arr)

distribution = np.array(distribution)
distribution

array([[0.44922903, 0.1773223 , 0.05133509, ..., 0.19499812, 0.21906732,
        0.03215495],
       [0.33320797, 0.07295976, 0.42421963, ..., 0.24088003, 0.35539677,
        0.22771719],
       [0.48407923, 0.2428858 , 0.3977059 , ..., 0.32587439, 0.35821737,
        0.36310643],
       ...,
       [0.4793608 , 0.47437772, 0.47308233, ..., 0.48040373, 0.48267241,
        0.49679112],
       [0.48950092, 0.48420632, 0.47335023, ..., 0.4886881 , 0.48532937,
        0.46239188],
       [0.484089  , 0.49668427, 0.48640932, ..., 0.48357836, 0.45565864,
        0.49477972]])

In [180]:
cell = sde_df.iloc[0]
cell.dropna(inplace=True)
cell.shape

(2659,)

In [211]:
# Now Calculate actual RM for each cell
for idx, row in sde_df.iterrows():
    cell = pd.DataFrame(row.index, columns=['symbol'])
    cell['SignedP'] = row.values
    cell['updown'] = np.where(cell['SignedP'] > 0, '1', '-1')
    cell.sort_values(by=['updown', 'SignedP'], ascending=[False, True], inplace=True)
    cell.reset_index(drop=True, inplace=True)

    cell.index = cell.index + 1
    cell.index = (cell.index - 0.5) / len(cell)

    # Get the index of each targetList in cpo_grouped_df from cell dataframe
    cpo_grouped_df['RS'] = cpo_grouped_df['targetList'].apply(lambda x: [cell[cell['symbol'] == i].index[0] for i in x])
    cpo_grouped_df['RS'] = cpo_grouped_df['index'].apply(lambda x: np.array(x))
    cpo_grouped_df['RS'] = cpo_grouped_df['RS'].apply(lambda x: np.mean(x))
    # if RS is more than 0.5 replace with 1-RS
    cpo_grouped_df['RS'] = np.where(cpo_grouped_df['RS'] > 0.5, 1 - cpo_grouped_df['RS'], cpo_grouped_df['RS'])
    break

In [165]:
some
error

SyntaxError: invalid syntax (468460733.py, line 1)

In [166]:
for idx, row in sde_df.iterrows():
    cp_df = cpo_df.copy()

    cell_de = pd.DataFrame()
    row = row.dropna()
    cell_de['symbol'] = row.index
    cell_de['SignedP'] = row.values
    cell_de['updown'] = np.where(cell_de['SignedP'] > 0, '1', '-1')
    cell_de.sort_values(by=['updown', 'SignedP'], ascending=[False, True], inplace=True)
    cell_de.reset_index(drop=True, inplace=True)
    max_rank = len(cell_de)
    cell_de['rank'] = np.arange(1, max_rank + 1)
    cell_de['reverse_rank'] = max_rank - cell_de['rank'] + 1
    cell_de['rank'] = (cell_de['rank'] - 0.5) / max_rank
    cell_de['reverse_rank'] = (cell_de['reverse_rank'] - 0.5) / max_rank

    # Remove rows of cp_df dataframe if targetSymbol is not present in symbol column of rank_df dataframe
    cp_df = cp_df[cp_df['targetSymbol'].isin(cell_de['symbol'])]
    cp_df = cp_df.reset_index(drop=True)
    # Find the rank and reverse rank of targetsymbol
    cp_df = cp_df.merge(cell_de[['symbol', 'rank']], left_on='targetSymbol', right_on='symbol', how='left')
    cp_df['revRank'] = max_rank - cp_df['rank']
    cp_df.drop('symbol_y', axis=1, inplace=True)
    cp_df.rename(columns={'symbol_x': 'symbol'}, inplace=True)

    cp_df_grouped = cp_df.groupby('symbol')['isUp'].apply(list).reset_index(name='upDownList')
    cp_df_grouped['targetList'] = cp_df.groupby('symbol')['targetSymbol'].apply(list).reset_index(name='targetList')[
        'targetList']
    cp_df_grouped['upDownCount'] = cp_df_grouped['upDownList'].apply(lambda x: len(x))
    cp_df_grouped = cp_df_grouped[cp_df_grouped['upDownCount'] >= 3]

    # Get the ranks of each targets in targetList from cell_de dataframe and add to new column
    cp_df_grouped['targetRankList'] = cp_df_grouped.apply(
        lambda x: [cell_de[cell_de['symbol'] == target]['rank'].values[0] for target in x['targetList']], axis=1)
    cp_df_grouped['targetRevRankList'] = cp_df_grouped.apply(
        lambda x: [cell_de[cell_de['symbol'] == target]['reverse_rank'].values[0] for target in x['targetList']],
        axis=1)

    break

In [None]:
distribution = []
for i in range(3, 200):
    ranks = np.arange(1, i + 1)
    ranks = (ranks - 0.5) / i
    distribution.append(ranks)

distribution = pd.DataFrame(distribution, index=np.arange(3, 200))
distribution.head()
