In [133]:
import pandas as pd
import numpy as np

In [134]:
cp_file = '../../data/causal-priors.txt'

# Read the first three tab-separated columns (the file has no header row),
# keep only the expression-regulation relations, and encode the direction as
# a signed flag: +1 for 'upregulates-expression', -1 for
# 'downregulates-expression'. The textual 'action' column is dropped once the
# flag has been derived from it.
priors = (
    pd.read_csv(
        cp_file,
        sep='\t',
        header=None,
        usecols=[0, 1, 2],
        names=['symbol', 'action', 'targetSymbol'],
    )
    .loc[lambda df: df['action'].isin(['upregulates-expression', 'downregulates-expression'])]
    .reset_index(drop=True)
    .assign(isUp=lambda df: np.where(df['action'] == 'upregulates-expression', 1, -1))
    .drop(columns=['action'])
)
priors.head()

Unnamed: 0,symbol,targetSymbol,isUp
0,MAK,KLK3,1
1,XBP1,TPP1,1
2,KLF5,CXCR4,1
3,ATF3,SELE,-1
4,MYC,EIF4G1,1


In [135]:
# Normalized expression matrix. The first column of the TSV becomes the row
# index and the first line supplies the column labels; per the preview, rows
# are labelled by gene symbol and columns by sample-prefixed cell barcodes.
normalized_data = pd.read_csv(
    '../../data/5knormalized_mat.tsv',
    sep='\t',
    header=0,
    index_col=0,
)
normalized_data.head()

Unnamed: 0,midpoint_AAACCTGCACCCAGTG-1,midpoint_AAACCTGCAGCTTAAC-1,midpoint_AAACCTGGTGTGCGTC-1,midpoint_AAACCTGGTTGAACTC-1,midpoint_AAACGGGAGGATGGTC-1,midpoint_AAACGGGAGGGCTCTC-1,midpoint_AAACGGGAGTAACCCT-1,midpoint_AAACGGGCATGGGACA-1,midpoint_AAACGGGGTCTGCAAT-1,midpoint_AAACGGGTCAATCTCT-1,...,tumor_TTTATGCCAAGCCTAT-1,tumor_TTTATGCGTGCAGTAG-1,tumor_TTTATGCTCGCCTGAG-1,tumor_TTTCCTCCAAGTTCTG-1,tumor_TTTCCTCCACCAGTTA-1,tumor_TTTCCTCGTATGCTTG-1,tumor_TTTGCGCCACGGTTTA-1,tumor_TTTGGTTCAGCCTGTG-1,tumor_TTTGGTTCAGTGAGTG-1,tumor_TTTGGTTGTGCAACTT-1
Xkr4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sox17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mrpl15,0.0,0.0,0.0,0.0,0.0,0.0,1.003354,0.0,0.0,0.0,...,0.467282,0.908944,0.0,1.121759,0.355743,0.24449,0.379996,0.485346,0.984984,0.361297
Lypla1,0.0,0.0,0.777323,0.0,1.202458,0.0,0.0,0.0,0.0,0.0,...,0.181113,0.687029,0.0,0.296425,0.0,0.440794,0.379996,0.485346,0.696472,0.361297
Tcea1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.09107,1.058914,...,0.334399,0.40139,0.0,0.296425,0.355743,0.24449,0.768113,0.271833,0.289346,0.196877


In [136]:
# Mouse-to-human gene symbol lookup table (tab-separated, with a header row).
# The Human column stores its symbols as a bracketed string, e.g. "[ADORA2B]".
mouse_to_human = pd.read_csv(
    '../../data/mouse_to_human.tsv',
    sep='\t',
)
mouse_to_human.head()

Unnamed: 0,Mouse,Human
0,Adora2b,[ADORA2B]
1,Adora2a,[ADORA2A]
2,Gm4340,[ALYREF]
3,Gm20741,[KRTAP13-3]
4,Rrp1,[RRP1]


In [137]:
# Keep only the mappings whose mouse symbol is a row label of normalized_data;
# the other mappings have no expression values to carry over.
mouse_to_human = mouse_to_human.loc[
    lambda df: df['Mouse'].isin(normalized_data.index)
]
mouse_to_human.shape

(14064, 2)

In [138]:
# Break the Human column into one row per human gene. Each value is a
# bracketed string such as "[ADORA2B]"; multiple symbols are assumed to be
# separated by ', ' (that is the delimiter split on here).
# After exploding, the enclosing '[' and ']' are removed from every symbol.
# regex=False is passed explicitly: '[' on its own is not a valid regular
# expression, and relying on the implicit default triggers a pandas warning
# and behaves differently across pandas versions.
mouse_to_human = (
    mouse_to_human
    .assign(Human=lambda df: df['Human'].str.split(', '))
    .explode('Human')
    .assign(
        Human=lambda df: (
            df['Human']
            .str.replace('[', '', regex=False)
            .str.replace(']', '', regex=False)
        )
    )
)
mouse_to_human.head()

  mouse_to_human['Human'] = mouse_to_human['Human'].str.replace('[', '').str.replace(']', '')


Unnamed: 0,Mouse,Human
0,Adora2b,ADORA2B
1,Adora2a,ADORA2A
4,Rrp1,RRP1
5,Fam50a,FAM50A
7,Vps35,VPS35


In [139]:
# Collect every gene symbol that appears in the priors, whether as the
# regulator ('symbol') or as the regulated gene ('targetSymbol'), and reduce
# the combined array to its sorted unique values.
priors_targetsymbol = priors['targetSymbol'].values
priors_symbol = np.unique(
    np.concatenate([priors['symbol'].values, priors_targetsymbol], axis=0)
)
priors_symbol.shape
# 3307 unique symbols in the priors file

(3307,)

In [140]:
# Restrict the mapping to human symbols that occur somewhere in the priors
mouse_to_human = mouse_to_human.loc[
    lambda df: df['Human'].isin(priors_symbol)
]
mouse_to_human.shape

(2936, 2)

In [141]:
# Drop one-to-many mappings: any mouse symbol that still appears on more than
# one row. keep=False flags every member of a duplicate group, so none of the
# ambiguous rows survive (not even the first occurrence).
mouse_to_human = mouse_to_human.loc[
    lambda df: ~df['Mouse'].duplicated(keep=False)
]
mouse_to_human.shape

(2792, 2)

In [142]:
# Drop many-to-one mappings: any human symbol that is the target of more than
# one mouse symbol. As above, keep=False removes all rows of each duplicate
# group, leaving a strictly one-to-one table.
mouse_to_human = mouse_to_human.loc[
    lambda df: ~df['Human'].duplicated(keep=False)
]
mouse_to_human.shape

(2680, 2)

In [143]:
# Pull the expression row of every retained mouse gene from normalized_data in
# a single label-based lookup (rows come back in mouse_to_human order), then
# relabel those rows with the corresponding human symbols.
# This replaces a row-by-row DataFrame.append loop: growing a frame that way
# copies it on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0. A mouse symbol missing from normalized_data still raises
# KeyError, exactly as the per-row lookup did.
mouse_to_human_normalized = normalized_data.loc[mouse_to_human['Mouse'].values]
mouse_to_human_normalized.index = mouse_to_human['Human'].values

# Drop rows with all 0s (i.e. keep genes with at least one non-zero value)
mouse_to_human_normalized = mouse_to_human_normalized.loc[
    (mouse_to_human_normalized != 0).any(axis=1)
]

# Export mouse_to_human_normalized to tsv
mouse_to_human_normalized.to_csv('../../data/mouse_to_human_normalized5k.tsv', sep='\t')