In [77]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [78]:
# Input locations, relative to the notebook's working directory.
cp_file = '../../data/causal-priors.txt'
sde_file = '../../data/mouse_to_human_normalized.tsv'

# Causal priors: headerless TSV of which only the first three columns are
# read (source symbol, relation type, target symbol).
# `squeeze=True` was removed from this call: it only ever applied to
# single-column reads (three columns are selected here) and the keyword no
# longer exists in pandas >= 2.0, where passing it raises TypeError.
cpo_df = pd.read_csv(cp_file, sep='\t', header=None, usecols=[0, 1, 2],
                     names=['Symbols', 'action', 'targetSymbol'])

# Keep only expression-regulation edges and encode their direction as
# +1 (upregulates) / -1 (downregulates) in `isUp`.
expression_actions = ['upregulates-expression', 'downregulates-expression']
cpo_df = cpo_df[cpo_df['action'].isin(expression_actions)].reset_index(drop=True)
cpo_df['isUp'] = np.where(cpo_df['action'] == 'upregulates-expression', 1, -1)
cpo_df = cpo_df.drop(columns=['action'])


# Expression table: the file is transposed after reading, so the file's
# columns become rows of `sde_df`. zscore() uses its default axis=0, i.e. each
# column of the transposed frame is standardised independently, ignoring NaNs.
sde_df = pd.read_csv(sde_file, sep='\t', header=0, index_col=0).T
sde_df = pd.DataFrame(zscore(sde_df, nan_policy='omit'), index=sde_df.index, columns=sde_df.columns)
print("Files read complete...")

Files read complete...


In [79]:
# Preview the first rows of the filtered causal-prior edge table.
cpo_df.head()

Unnamed: 0,Symbols,targetSymbol,isUp
0,MAK,KLK3,1
1,XBP1,TPP1,1
2,KLF5,CXCR4,1
3,ATF3,SELE,-1
4,MYC,EIF4G1,1


In [80]:
# Preview the first rows of the z-scored expression table.
sde_df.head()

Unnamed: 0,ADORA2B,ADORA2A,ELK1,ELK3,ELK4,ABCB4,ABCB8,ABCC3,ABCC4,ABCC1,...,CASP7,RRM1,RRM2,CASP9,CASP4,CASP3,CASP6,CASP2,CASP1,NRCAM
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,-0.140806,-0.052445,-0.213359,-0.342003,-0.424156,-0.035202,-0.2871,-0.384395,-0.103931,-0.333983,...,-0.236696,-0.344047,-0.320121,-0.183918,-0.368985,-0.195632,-0.410271,-0.262624,6.731551,-0.02541
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,-0.140806,-0.052445,-0.213359,-0.342003,-0.424156,-0.035202,-0.2871,-0.384395,-0.103931,-0.333983,...,-0.236696,-0.344047,-0.320121,-0.183918,4.00734,-0.195632,-0.410271,-0.262624,-0.287124,-0.02541
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,-0.140806,-0.052445,-0.213359,2.456715,-0.424156,-0.035202,-0.2871,-0.384395,-0.103931,-0.333983,...,-0.236696,-0.344047,5.84206,-0.183918,-0.368985,-0.195632,-0.410271,3.197637,-0.287124,-0.02541
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,-0.140806,-0.052445,-0.213359,-0.342003,-0.424156,-0.035202,-0.2871,-0.384395,-0.103931,-0.333983,...,-0.236696,-0.344047,-0.320121,-0.183918,-0.368985,-0.195632,-0.410271,-0.262624,-0.287124,-0.02541
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,-0.140806,-0.052445,-0.213359,-0.342003,-0.424156,-0.035202,-0.2871,4.413715,-0.103931,-0.333983,...,-0.236696,-0.344047,-0.320121,-0.183918,-0.368985,-0.195632,-0.410271,-0.262624,-0.287124,-0.02541


In [48]:
# Prototype of the per-cell regulator/target ranking.
# For one row of `sde_df`: rank every non-missing gene, attach those ranks to
# the causal-prior edges whose target is present, and collect, per regulator
# (`Symbols`), the lists of edge signs, targets and target ranks.
min_targets = 3  # regulators with fewer matched edges than this are dropped

for idx, row in sde_df.iterrows():
    # Per-gene table for this row, without missing values.
    row = row.dropna()
    cell_de = pd.DataFrame({'Symbols': row.index, 'SignedP': row.values})
    cell_de['updown'] = np.where(cell_de['SignedP'] > 0, '1', '-1')

    # Order: positive values first ('1' sorts after '-1' as a string, and the
    # flag is sorted descending), each group by ascending value.
    # NOTE(review): the column is named `SignedP`, but `sde_df` holds z-scores;
    # with z-scores this puts the *smallest* positive value at rank 1 —
    # confirm this ordering is intended.
    cell_de = (cell_de
               .sort_values(by=['updown', 'SignedP'], ascending=[False, True])
               .reset_index(drop=True))

    # Mid-point normalised ranks in (0, 1): (position - 0.5) / n, plus the
    # same quantity counted from the other end of the ordering.
    max_rank = len(cell_de)
    position = np.arange(1, max_rank + 1)
    cell_de['rank'] = (position - 0.5) / max_rank
    cell_de['reverse_rank'] = (max_rank - position + 1 - 0.5) / max_rank

    # Keep only edges whose target gene is present for this row.
    cp_df = cpo_df[cpo_df['targetSymbol'].isin(cell_de['Symbols'])].reset_index(drop=True)

    # Attach the target's rank and reverse rank to every edge.
    # Fix: `revRank` used to be `max_rank - rank` computed from the already
    # normalised rank, which mixed a raw count with a (0, 1) value; it now
    # carries the normalised `reverse_rank`, consistent with `cell_de`.
    target_ranks = cell_de[['Symbols', 'rank', 'reverse_rank']].rename(
        columns={'Symbols': 'targetSymbol', 'reverse_rank': 'revRank'})
    cp_df = cp_df.merge(target_ranks, on='targetSymbol', how='left')

    # One row per regulator with the list of edge signs and of targets.
    by_regulator = cp_df.groupby('Symbols')
    cp_df_grouped = by_regulator['isUp'].apply(list).reset_index(name='upDownList')
    cp_df_grouped['targetList'] = by_regulator['targetSymbol'].apply(list).reset_index(name='targetList')[
        'targetList']
    cp_df_grouped['upDownCount'] = cp_df_grouped['upDownList'].map(len)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    cp_df_grouped = cp_df_grouped[cp_df_grouped['upDownCount'] >= min_targets].copy()

    # Symbol -> rank lookups built once per row instead of scanning `cell_de`
    # for every target. drop_duplicates keeps the first occurrence of a
    # symbol, matching the previous `.values[0]` behaviour.
    first_hit = cell_de.drop_duplicates('Symbols')
    rank_of = dict(zip(first_hit['Symbols'].values, first_hit['rank'].values))
    rev_rank_of = dict(zip(first_hit['Symbols'].values, first_hit['reverse_rank'].values))

    cp_df_grouped['targetRankList'] = cp_df_grouped['targetList'].apply(
        lambda targets: [rank_of[target] for target in targets])
    cp_df_grouped['targetRevRankList'] = cp_df_grouped['targetList'].apply(
        lambda targets: [rev_rank_of[target] for target in targets])

    # Prototype only: stop after the first row so the intermediate frames can
    # be inspected. Remove this (and collect `cp_df_grouped` per `idx`) to
    # process every row.
    break

In [39]:
# Reference table of mid-point normalised ranks: the row labelled k holds
# (1 - 0.5)/k, (2 - 0.5)/k, ..., (k - 0.5)/k for k = 3 .. 199. Rows shorter
# than the widest one are padded with NaN by the DataFrame constructor.
distribution = pd.DataFrame(
    [(np.arange(1, size + 1) - 0.5) / size for size in range(3, 200)],
    index=np.arange(3, 200),
)
distribution.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
3,0.166667,0.5,0.833333,,,,,,,,...,,,,,,,,,,
4,0.125,0.375,0.625,0.875,,,,,,,...,,,,,,,,,,
5,0.1,0.3,0.5,0.7,0.9,,,,,,...,,,,,,,,,,
6,0.083333,0.25,0.416667,0.583333,0.75,0.916667,,,,,...,,,,,,,,,,
7,0.071429,0.214286,0.357143,0.5,0.642857,0.785714,0.928571,,,,...,,,,,,,,,,
