In [1]:
import pandas as pd
import numpy as np

In [7]:
# Read eqtl
eqtl = pd.read_csv('../github/data/GTEx_Analysis_v7_eQTL_allTissues_slope_top.csv.gz')
eqtl = eqtl.drop(columns=['gene_id', 'variant_id'])
print(eqtl.shape)
eqtl.head()

(1349165, 48)


Unnamed: 0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,...,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,Cells_EBV-transformed_lymphocytes
0,,,,,,-0.146567,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,0.692326,0.712692,1.14229,0.531796,,,,0.689146,0.593498,,...,0.941748,,0.826604,0.658147,0.427333,0.473418,,,0.794383,1.03219
3,,,,,-0.591224,-0.307221,,,,,...,-0.379419,,,-0.374231,,-0.439161,,,-0.317301,
4,,,,,,,,,,,...,,,,,,,,,0.22549,


# Prepare for figure 2

In [8]:
# Get labels
y_label = 'Cells_EBV-transformed_lymphocytes'
X_label = list(i for i in eqtl.columns if i!=y_label)

In [79]:
# Get the data the eQTL for that tissue
data = eqtl[np.invert(pd.isna(eqtl[y_label]))]
# Get the eQTL that are not specific
data = data[np.invert(pd.isna(data[X_label])).sum(axis=1) != 0]

In [85]:
mean = data[X_label].mean(axis=1)
mean[:3]

2     0.767081
47   -0.908906
49   -0.578122
dtype: float64

In [86]:
howMany = np.invert(pd.isna(data[X_label])).sum(axis=1)
print(how_many[:3])

2     26
47    46
49    36
dtype: int64


In [87]:
pd.DataFrame({'RealValue':data[y_label], 'MeanImputed':mean, 'HowManyTissues':howMany}).to_csv('../github/data/data_figure2.csv', index=False)

# Prepare for figure 3

In [2]:
import os

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
readRDS = robjects.r['readRDS']

from sklearn.metrics import pairwise_distances, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

from scipy.stats import linregress, ttest_rel

In [3]:
tissues_id = []
with open('../github/data/GTEx_v7_Annotations_TissuesId.txt') as f:
    for line in f:
        tissues_id.append(line.strip())

### Weights by PCA

In [4]:
pc = readRDS('../github/objects/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_top_pcsX.rds')
pc = pandas2ri.ri2py(pc)
pc = pd.DataFrame(pc, columns=['PC1', 'PC2'])
pc['tissue'] = tissues_id
pc.head()

Unnamed: 0,PC1,PC2,tissue
0,-68.514239,5.674978,Adipose_Subcutaneous
1,-55.787419,0.207527,Adipose_Subcutaneous
2,-52.186166,-14.057167,Adipose_Subcutaneous
3,-48.93919,10.501181,Adipose_Subcutaneous
4,-23.998738,5.331621,Adipose_Subcutaneous


In [5]:
centroids_pc = pc.groupby('tissue')['PC1','PC2'].apply(np.mean)
centroids_pc.shape

(53, 2)

In [19]:
df_pc = pd.DataFrame(1/pairwise_distances(centroids_pc), columns=centroids_pc.index, index=centroids_pc.index).replace(np.inf, 0)

  """Entry point for launching an IPython kernel.


### Weights by Tsne

In [21]:
ts = readRDS('../github/objects/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm_top_tsne.rds')
ts = pandas2ri.ri2py(ts)
ts = pd.DataFrame(ts, columns=['x', 'y'])
ts['tissue'] = tissues_id
ts.head()

Unnamed: 0,x,y,tissue
0,25.209791,21.142145,Adipose_Subcutaneous
1,22.63245,15.242824,Adipose_Subcutaneous
2,19.787278,7.927666,Adipose_Subcutaneous
3,25.956711,3.907033,Adipose_Subcutaneous
4,30.342241,17.143636,Adipose_Subcutaneous


In [22]:
centroids_ts = ts.groupby('tissue')['x','y'].apply(np.mean)
centroids_ts.shape

(53, 2)

In [23]:
df_ts = pd.DataFrame(1/pairwise_distances(centroids_ts), columns=centroids_ts.index, index=centroids_ts.index).replace(np.inf, 0)

  """Entry point for launching an IPython kernel.


In [75]:
eqtl_s = eqtl[np.invert(pd.isna(eqtl)).sum(axis=1) != 1]
print(eqtl_s.shape)
eqtl.head()

(723420, 48)


Unnamed: 0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,...,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood,Cells_EBV-transformed_lymphocytes
0,,,,,,-0.146567,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,0.692326,0.712692,1.14229,0.531796,,,,0.689146,0.593498,,...,0.941748,,0.826604,0.658147,0.427333,0.473418,,,0.794383,1.03219
3,,,,,-0.591224,-0.307221,,,,,...,-0.379419,,,-0.374231,,-0.439161,,,-0.317301,
4,,,,,,,,,,,...,,,,,,,,,0.22549,


In [76]:
f_e = open('../data_RMSE_figure3.tab', 'w')
f_r = open('../data_rval_figure3.tab', 'w')
print("Tissue\tn_eQTL\tRF\tmean\tpca\ttsne", file=f_e)
print("Tissue\tn_eQTL\tRF\tmean\tpca\ttsne", file=f_r)

for tissue in eqtl_s.columns:
    
    print(tissue, end='\t')
    y_label = tissue
    W = df_pc[y_label]
    U = df_ts[y_label]


    X_label = list(data.columns)
    X_label.remove(y_label)

    data = eqtl_s[np.invert(pd.isna(eqtl_s[y_label]))]
    print(data.shape)
    data = data[np.invert(pd.isna(data[X_label])).sum(axis=1) != 1]
    print(data.shape)
    
    # Split data
    X = data[X_label]
    y = data[y_label]
    X_train, X_test, y_train, y_test = train_test_split(X.fillna(0), y.fillna(0), test_size=0.3, random_state=12)

    print('Model...')
    # Model
    model = RandomForestRegressor()
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Regression
    x, y = y_test, y_pred
    slope, intercept, r_value, p_value, std_err = linregress(x,y)
    r_model = r_value
    e_model = np.sqrt(mean_squared_error(x, y))
    
    
    print('Mean...', end='\t')
    # Normal mean
    mean = X[X_label].mean(axis=1).to_list()

    # Regression
    x, y = data[y_label], mean
    slope, intercept, r_value, p_value, std_err = linregress(x,y)
    r_mean = r_value
    e_mean = np.sqrt(mean_squared_error(x, y))
    
    print('PCA...', end='\t')
    # PCA
    weighted_mean = []
    for i, r in X.iterrows():
        w = W[r.keys()[np.invert(pd.isna(r))]]
        w = w/w.sum()
        m = 0
        for t, v in r.iteritems():
            if not pd.isna(v):
                m +=  w[t]*v
        weighted_mean.append(m)
        
    # Regression
    x, y = data[y_label].values, weighted_mean
    slope, intercept, r_value, p_value, std_err = linregress(x,y)
    r_pca = r_value
    e_pca = np.sqrt(mean_squared_error(x, y))
    
    print('tSNE...')
    # tSNE
    weighted_mean = []
    for i, r in X.iterrows():
        
        u = U[r.keys()[np.invert(pd.isna(r))]]
        u = u/u.sum()
        m = 0
        for t, v in r.iteritems():
            if not pd.isna(v):
                m +=  u[t]*v
        weighted_mean.append(m)
        
    # Regression
    x, y = data[y_label].values, weighted_mean
    slope, intercept, r_value, p_value, std_err = linregress(x,y)
    r_tsne = r_value
    e_tsne = np.sqrt(mean_squared_error(x, y))

    print("{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}".format(y_label, data.shape[0], e_model, e_mean, e_pca, e_tsne), file=f_e)
    print("{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}".format(y_label, data.shape[0], r_model, r_mean, r_pca, r_tsne), file=f_r)

f_e.close()
f_r.close()

Adipose_Subcutaneous	(257179, 48)
(239850, 48)
Model...




Mean...	PCA...	tSNE...
Adipose_Visceral_Omentum	(161362, 48)
(156976, 48)
Model...




Mean...	PCA...	tSNE...
Adrenal_Gland	(76716, 48)
(72670, 48)
Model...




Mean...	PCA...	tSNE...
Artery_Aorta	(190016, 48)
(177960, 48)
Model...




Mean...	PCA...	tSNE...
Artery_Coronary	(51651, 48)
(51245, 48)
Model...




Mean...	PCA...	tSNE...
Artery_Tibial	(287638, 48)
(261132, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Amygdala	(16959, 48)
(16803, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Anterior_cingulate_cortex_BA24	(35000, 48)
(34229, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Caudate_basal_ganglia	(52638, 48)
(51502, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Cerebellar_Hemisphere	(76693, 48)
(66983, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Cerebellum	(122560, 48)
(107509, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Cortex	(60070, 48)
(58167, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Frontal_Cortex_BA9	(40151, 48)
(39085, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Hippocampus	(25905, 48)
(25660, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Hypothalamus	(23653, 48)
(23486, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Nucleus_accumbens_basal_ganglia	(42951, 48)
(41939, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Putamen_basal_ganglia	(37727, 48)
(36886, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Spinal_cord_cervical_c-1	(20138, 48)
(19432, 48)
Model...




Mean...	PCA...	tSNE...
Brain_Substantia_nigra	(14130, 48)
(14035, 48)
Model...




Mean...	PCA...	tSNE...
Breast_Mammary_Tissue	(108366, 48)
(106960, 48)
Model...




Mean...	PCA...	tSNE...
Cells_Transformed_fibroblasts	(220165, 48)
(194410, 48)
Model...




Mean...	PCA...	tSNE...
Colon_Sigmoid	(102121, 48)
(100308, 48)
Model...




Mean...	PCA...	tSNE...
Colon_Transverse	(120486, 48)
(117117, 48)
Model...




Mean...	PCA...	tSNE...
Esophagus_Gastroesophageal_Junction	(109172, 48)
(106919, 48)
Model...




Mean...	PCA...	tSNE...
Esophagus_Mucosa	(267180, 48)
(239639, 48)
Model...




Mean...	PCA...	tSNE...
Esophagus_Muscularis	(237992, 48)
(223124, 48)
Model...




Mean...	PCA...	tSNE...
Heart_Atrial_Appendage	(141676, 48)
(136789, 48)
Model...




Mean...	PCA...	tSNE...
Heart_Left_Ventricle	(141198, 48)
(133052, 48)
Model...




Mean...	PCA...	tSNE...
Liver	(49066, 48)
(46713, 48)
Model...




Mean...	PCA...	tSNE...
Lung	(230759, 48)
(217555, 48)
Model...




Mean...	PCA...	tSNE...
Minor_Salivary_Gland	(15816, 48)
(15735, 48)
Model...




Mean...	PCA...	tSNE...
Muscle_Skeletal	(263284, 48)
(229906, 48)




Model...
Mean...	PCA...	tSNE...
Nerve_Tibial	(291163, 48)
(265583, 48)
Model...




Mean...	PCA...	tSNE...
Ovary	(31815, 48)
(31486, 48)
Model...




Mean...	PCA...	tSNE...
Pancreas	(119797, 48)
(113083, 48)
Model...




Mean...	PCA...	tSNE...
Pituitary	(75518, 48)
(73655, 48)
Model...




Mean...	PCA...	tSNE...
Prostate	(31627, 48)
(31217, 48)
Model...




Mean...	PCA...	tSNE...
Skin_Not_Sun_Exposed_Suprapubic	(220798, 48)
(202900, 48)
Model...




Mean...	PCA...	tSNE...
Skin_Sun_Exposed_Lower_leg	(295816, 48)
(262425, 48)
Model...




Mean...	PCA...	tSNE...
Small_Intestine_Terminal_Ileum	(37641, 48)
(36975, 48)
Model...




Mean...	PCA...	tSNE...
Spleen	(69115, 48)
(66949, 48)
Model...




Mean...	PCA...	tSNE...
Stomach	(95970, 48)
(93682, 48)
Model...




Mean...	PCA...	tSNE...
Testis	(144951, 48)
(129617, 48)
Model...




Mean...	PCA...	tSNE...
Thyroid	(300403, 48)
(267647, 48)
Model...




Mean...	PCA...	tSNE...
Uterus	(19905, 48)
(19788, 48)
Model...




Mean...	PCA...	tSNE...
Vagina	(19039, 48)
(18926, 48)
Model...




Mean...	PCA...	tSNE...
Whole_Blood	(203605, 48)
(177680, 48)
Model...




Mean...	PCA...	tSNE...
Cells_EBV-transformed_lymphocytes	(48145, 48)
(46321, 48)
Model...




Mean...	PCA...	tSNE...
