In [42]:
import pandas as pd
import numpy as np

In [43]:
cp_file = '../../data/causal-priors.txt'
priors = pd.read_csv(cp_file, sep='\t', header=None, usecols=[0, 1, 2],
                     names=['Symbols', 'action', 'targetSymbol'], squeeze=True)
priors = priors[priors['action'].isin(['upregulates-expression', 'downregulates-expression'])]
priors.reset_index(drop=True, inplace=True)
priors['isUp'] = np.where(priors['action'] == 'upregulates-expression', 1, -1)
priors.drop(['action'], axis=1, inplace=True)
priors.head()

Unnamed: 0,Symbols,targetSymbol,isUp
0,MAK,KLK3,1
1,XBP1,TPP1,1
2,KLF5,CXCR4,1
3,ATF3,SELE,-1
4,MYC,EIF4G1,1


In [44]:
normalized_data = pd.read_csv('../../data/normalized_mat.tsv', sep='\t', header=0, index_col=0)
normalized_data.head()

Unnamed: 0,BPK.12x.4NQO_AAACCTGCACCCAGTG.1,BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,BPK.12x.4NQO_AAACCTGGTTGAACTC.1,BPK.12x.4NQO_AAACGGGAGGATGGTC.1,BPK.12x.4NQO_AAACGGGAGGGCTCTC.1,BPK.12x.4NQO_AAACGGGAGTAACCCT.1,BPK.12x.4NQO_AAACGGGCATGGGACA.1,BPK.12x.4NQO_AAACGGGGTCTGCAAT.1,BPK.12x.4NQO_AAACGGGTCAATCTCT.1,...,BPK.12x.vehicle_TTTGTCACAATGAAAC.1,BPK.12x.vehicle_TTTGTCACAGCTGTAT.1,BPK.12x.vehicle_TTTGTCAGTAGTACCT.1,BPK.12x.vehicle_TTTGTCAGTCAATGTC.1,BPK.12x.vehicle_TTTGTCAGTTCTCATT.1,BPK.12x.vehicle_TTTGTCAGTTCTGAAC.1,BPK.12x.vehicle_TTTGTCATCAGGATCT.1,BPK.12x.vehicle_TTTGTCATCATCATTC.1,BPK.12x.vehicle_TTTGTCATCATGCATG.1,BPK.12x.vehicle_TTTGTCATCTTGCATT.1
Xkr4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sox17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mrpl15,0.0,0.0,0.0,0.0,0.0,0.0,1.003354,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lypla1,0.0,0.0,0.777323,0.0,1.202458,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.883123,0.0,0.876766,0.0,1.453395,0.0,0.0
Tcea1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.09107,1.058914,...,0.0,0.0,0.0,0.883123,0.0,1.336639,0.70865,0.0,0.0,0.0


In [55]:
mouse_to_human = pd.read_csv('../../data/mouse_to_human.tsv', sep='\t')
mouse_to_human.head()

Unnamed: 0,Mouse,Human
0,Adora2b,[ADORA2B]
1,Adora2a,[ADORA2A]
2,Gm4340,[ALYREF]
3,Gm20741,[KRTAP13-3]
4,Rrp1,[RRP1]


In [56]:
# Remove rows of mouse_to_human if not in normalized_data
mouse_to_human = mouse_to_human[mouse_to_human['Mouse'].isin(normalized_data.index)]
mouse_to_human.shape

(14392, 2)

In [58]:
# breakdown mouse_to_human Human column into multiple rows if there are multiple human genes
mouse_to_human['Human'] = mouse_to_human['Human'].str.split(', ')
mouse_to_human = mouse_to_human.explode('Human')
# Remove [ and ] from mouse_to_human Human column
mouse_to_human['Human'] = mouse_to_human['Human'].str.replace('[', '').str.replace(']', '')
mouse_to_human.head()

  mouse_to_human['Human'] = mouse_to_human['Human'].str.replace('[', '').str.replace(']', '')


Unnamed: 0,Mouse,Human
0,Adora2b,ADORA2B
1,Adora2a,ADORA2A
4,Rrp1,RRP1
5,Fam50a,FAM50A
7,Vps35,VPS35


In [49]:
# Add into array Symbols and targetSymbols of priors column
priors_symbols = priors['Symbols'].values
priors_targetSymbols = priors['targetSymbol'].values

# Merge priors_symbols and priors_targetSymbols into single array
priors_symbols = np.concatenate((priors_symbols, priors_targetSymbols), axis=0)
priors_symbols = np.unique(priors_symbols)
priors_symbols.shape
# 3307 unique symbols in priors_file

(3307,)

In [59]:
mouse_to_human = mouse_to_human[mouse_to_human['Human'].isin(priors_symbols)]
mouse_to_human.shape

(2995, 2)

In [60]:
# Remove one to multiple mappings from mouse_to_human
mouse_to_human = mouse_to_human[~mouse_to_human['Mouse'].duplicated(keep=False)]
mouse_to_human.shape

(2834, 2)

In [61]:
# Remove multiple to one mapping from mouse_to_human
mouse_to_human = mouse_to_human[~mouse_to_human['Human'].duplicated(keep=False)]
mouse_to_human.shape

(2712, 2)

In [66]:
# Get the rows for each Mouse symbol from normalized_data and add to new dataframe
mouse_to_human_normalized = pd.DataFrame()
for mouse_symbol in mouse_to_human['Mouse'].values:
    mouse_to_human_normalized = mouse_to_human_normalized.append(normalized_data.loc[mouse_symbol])
mouse_to_human_normalized.index = mouse_to_human['Human'].values

# Export mouse_to_human_normalized to tsv
mouse_to_human_normalized.to_csv('../../data/mouse_to_human_normalized.tsv', sep='\t')

KeyboardInterrupt: 