In [2]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [5]:
# Input locations, relative to the notebook's working directory.
cp_file = '../../data/causal-priors.txt'
sde_file = '../../data/normalized_mat.tsv'

# Causal priors: header-less TSV, only the first three columns are read and
# named (source symbol, relation type, target symbol).
# `squeeze=True` was dropped: it has no effect when several columns are read,
# and the keyword no longer exists in pandas >= 2.0 (raises TypeError there).
cpo_df = pd.read_csv(cp_file, sep='\t', header=None, usecols=[0, 1, 2],
                     names=['Symbols', 'action', 'targetSymbol'])

# Keep only expression-regulation relations and encode their direction as
# +1 (upregulates) / -1 (downregulates) in `isUp`; `action` is then dropped.
expression_actions = ['upregulates-expression', 'downregulates-expression']
cpo_df = cpo_df[cpo_df['action'].isin(expression_actions)].reset_index(drop=True)
cpo_df['isUp'] = np.where(cpo_df['action'] == 'upregulates-expression', 1, -1)
cpo_df = cpo_df.drop(columns=['action'])

# Expression matrix: the first column is the index and the first line the
# header; the table is transposed right after loading.
sde_df = pd.read_csv(sde_file, sep='\t', header=0, index_col=0).T
# Exact zeros are treated as missing so they do not take part in the z-score.
sde_df = sde_df.replace(0, np.nan)
# Column-wise z-score (axis=0 is scipy's default), ignoring NaNs.
sde_df = pd.DataFrame(zscore(sde_df.to_numpy(dtype=float), axis=0, nan_policy='omit'),
                      index=sde_df.index, columns=sde_df.columns)
sde_df.columns = sde_df.columns.str.upper()
# Keep only the columns whose (upper-cased) name occurs as a targetSymbol in cpo_df.
sde_df = sde_df[sde_df.columns.intersection(cpo_df['targetSymbol'])]
print("Files read complete...")

Files read complete...


In [6]:
# Preview the filtered causal-priors table (columns: Symbols, targetSymbol, isUp).
cpo_df.head()

Unnamed: 0,Symbols,targetSymbol,isUp
0,MAK,KLK3,1
1,XBP1,TPP1,1
2,KLF5,CXCR4,1
3,ATF3,SELE,-1
4,MYC,EIF4G1,1


In [22]:
# Preview the z-scored matrix, restricted to columns that occur as a targetSymbol in cpo_df.
sde_df.head()

Unnamed: 0,MSC,TERF1,LY96,MCM3,PTP4A1,DST,PTPN18,NPAS2,MAP4K4,IL1R2,...,MYF6,CYSLTR2,CYP11A1,NPC1L1,CHRNE,HSD17B1,LRRN3,SSTR1,CDH6,RBFOX1
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,,,,,1.773395,,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,,,,,,1.583203,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,,,,,-0.21284,-0.757136,,,-0.278226,,...,,,,,,,,,,
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,,,,,-0.172811,-0.728622,,,,,...,,,,,,,,,,
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,,,,,,,,,,,...,,,,,,,,,,


In [48]:
# Minimum number of ranked targets a source symbol needs in order to be kept.
MIN_TARGETS = 3

for idx, row in sde_df.iterrows():
    # --- Rank the values available in this row -------------------------------
    # Columns without a value (NaN) are ignored for this row.
    row = row.dropna()
    cell_de = pd.DataFrame({'Symbols': row.index, 'SignedP': row.values})
    cell_de['updown'] = np.where(cell_de['SignedP'] > 0, '1', '-1')
    # Order: positive scores first, then the rest, each group by ascending score.
    # `updown` holds strings and '1' sorts after '-1', hence descending on it.
    cell_de = (cell_de
               .sort_values(by=['updown', 'SignedP'], ascending=[False, True])
               .reset_index(drop=True))
    max_rank = len(cell_de)
    positions = np.arange(1, max_rank + 1)
    # Mid-point normalisation maps position k of n to (k - 0.5) / n, i.e. into (0, 1).
    cell_de['rank'] = (positions - 0.5) / max_rank
    cell_de['reverse_rank'] = (max_rank - positions + 1 - 0.5) / max_rank

    # --- Attach target ranks to the causal priors ----------------------------
    # Keep only relations whose targetSymbol is present in cell_de['Symbols'].
    # Boolean filtering already yields a new frame, so cpo_df is never modified.
    cp_df = cpo_df[cpo_df['targetSymbol'].isin(cell_de['Symbols'])].reset_index(drop=True)
    # Look up both normalised ranks of every target. `revRank` is taken from
    # `reverse_rank` directly: the former `max_rank - rank` mixed the raw count
    # with an already-normalised rank and disagreed with `targetRevRankList`.
    target_ranks = cell_de[['Symbols', 'rank', 'reverse_rank']].rename(
        columns={'Symbols': 'targetSymbol', 'reverse_rank': 'revRank'})
    cp_df = cp_df.merge(target_ranks, on='targetSymbol', how='left')

    # --- Collect, per source symbol, its targets with their signs and ranks ---
    # All lists come from the same groupby, so they share one element order.
    # Building them from the merged columns replaces a per-target scan of
    # cell_de and also works when no symbol passes the MIN_TARGETS filter.
    by_symbol = cp_df.groupby('Symbols')
    cp_df_grouped = pd.DataFrame({
        'upDownList': by_symbol['isUp'].apply(list),
        'targetList': by_symbol['targetSymbol'].apply(list),
    })
    cp_df_grouped['upDownCount'] = cp_df_grouped['upDownList'].map(len)
    cp_df_grouped['targetRankList'] = by_symbol['rank'].apply(list)
    cp_df_grouped['targetRevRankList'] = by_symbol['revRank'].apply(list)
    cp_df_grouped = cp_df_grouped.reset_index()
    # .copy() keeps the result independent of the unfiltered frame, so later
    # column assignments do not raise SettingWithCopyWarning.
    cp_df_grouped = cp_df_grouped[cp_df_grouped['upDownCount'] >= MIN_TARGETS].copy()

    # Only the first row is processed for now; cell_de, cp_df and
    # cp_df_grouped are left in the namespace for inspection.
    break

In [39]:
# Table of normalised rank positions: the row labelled n holds (k - 0.5) / n
# for k = 1..n; rows shorter than the widest one are padded with NaN.
distribution = pd.DataFrame(
    [(np.arange(1, n + 1) - 0.5) / n for n in range(3, 200)],
    index=np.arange(3, 200),
)
distribution.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
3,0.166667,0.5,0.833333,,,,,,,,...,,,,,,,,,,
4,0.125,0.375,0.625,0.875,,,,,,,...,,,,,,,,,,
5,0.1,0.3,0.5,0.7,0.9,,,,,,...,,,,,,,,,,
6,0.083333,0.25,0.416667,0.583333,0.75,0.916667,,,,,...,,,,,,,,,,
7,0.071429,0.214286,0.357143,0.5,0.642857,0.785714,0.928571,,,,...,,,,,,,,,,


In [40]:
# Normalised positions (k - 0.5) / 180 for the last three of 180 ranks.
a = (np.array([178, 179, 180]) - 0.5) / 180
a

array([0.98611111, 0.99166667, 0.99722222])