In [2]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

### Initial data reading
#### Read the folder files with the mean of $e_{normal}$ and $e_{tumor}$
1. Store names in a list
2. Store values in a list of list to transform later into an array

In [41]:
# Folder with one tab-separated file of mean expression values per sample
folder_path = "./mean_exp_values"
file_list = []
data_store = []

# os.listdir returns entries in arbitrary (filesystem-dependent) order.
# Iterating over the sorted listing makes the row order of data_store
# reproducible, which matters because later cells pair rows positionally.
for file in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file)
    # skip hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and sub-folders,
    # which are not data files and would break read_csv
    if file.startswith('.') or not os.path.isfile(file_path):
        continue
    # store the file name without path and extension
    file_list.append(file.split('.')[0])
    # store the first column of each file
    data_store.append(pd.read_csv(file_path,
                                  sep='\t', header=None).iloc[:, 0])
# numpy array with one row per file, in the same order as file_list
data_store = np.array(data_store)

### Preparing data

Separate the tumor and normal mean values with boolean masks over the file names, and list the tissues and genes.

In [4]:
# Boolean selectors, one entry per file name: True when the name carries
# the corresponding prefix
normal_mask = [fname.startswith('normal') for fname in file_list]
tumor_mask = [fname.startswith('tumor') for fname in file_list]

# Tissue code of a file = all of its upper-case letters joined together;
# np.unique keeps each code once, in sorted order
tissues = np.unique([
    ''.join(re.findall('[A-Z]', fname))
    for fname in file_list
])

# Gene identifiers: first column of the tab-separated file ensemble.txt
ensbl = pd.read_csv("ensemble.txt", sep='\t', header=None).iloc[:, 0]

Precompute the matrix of differential expression, with normal tissue as the reference:
$$ \log_2 FC = \log_2 \left( \frac{e_{tumor}}{e_{normal}} \right) $$
df $\rightarrow$ DataFrame with features (Ensembl gene IDs) as columns and samples (tissues) as index

In [7]:
# log2 fold-change of gene expression: rows selected by tumor_mask divided
# element-wise by rows selected by normal_mask (normal tissue is the reference)
Log2FC = np.log2(
    np.divide(data_store[tumor_mask], data_store[normal_mask])
)

# Fold-change table: one row per tissue, one column per gene identifier
df = pd.DataFrame(Log2FC, index=tissues, columns=ensbl)

### Pathways
#### Top 28 pathways from Reactome
Read the pathway data and store pathway names and genes in pathways

In [9]:
# Load the pathway spreadsheet (no header row) and label its two columns
top_pathways = pd.read_excel("pathways2_sorted.xls", header=None)
top_pathways.columns = ["Ensemble", "Pathways"]

# Distinct pathway names and distinct gene identifiers, in order of appearance
pathways_names = top_pathways["Pathways"].unique()
genes_annotated = top_pathways["Ensemble"].unique()

# Report how many of each were found
print(" Top Pathways:", len(pathways_names), "\n",
      "Annotated Genes:", len(genes_annotated))

 Top Pathways: 28 
 Annotated Genes: 10785


Find the genes shared by the expression data and the pathway annotation, and keep only those rows of the top-pathways DataFrame.

In [48]:
# Genes present both in the expression data (ensbl) and in the pathway
# annotation (genes_annotated)
genes_path = set(ensbl) & set(genes_annotated)

# Keep only the annotation rows whose gene is in genes_path.
# The original query referenced an undefined name (genes_coincidence);
# the set computed above is the intended filter.
pathways_filtered = top_pathways[top_pathways["Ensemble"].isin(genes_path)]

In [43]:
# Fold-change sub-table of every pathway, keyed by pathway name.
# Previously each selection was computed and immediately discarded.
pathway_expression = {}
for pathway in pathways_names:
    # gene identifiers annotated to this pathway
    pathway_genes = pathways_filtered.loc[
        pathways_filtered["Pathways"] == pathway, "Ensemble"
    ]
    # columns of df for those genes (all tissues)
    pathway_expression[pathway] = df[pathway_genes]

In [18]:
# Choose the pathway to preview explicitly instead of relying on a loop
# variable left over from another cell; the last entry of pathways_names is
# the value a top-to-bottom run would have left in `pathway`.
pathway = pathways_names[-1]
df[pathways_filtered.loc[pathways_filtered["Pathways"] == pathway, "Ensemble"]]

Unnamed: 0,ENSG00000001626,ENSG00000003056,ENSG00000003393,ENSG00000004059,ENSG00000004975,ENSG00000005243,ENSG00000006125,ENSG00000006451,ENSG00000007168,ENSG00000007255,...,ENSG00000261701,ENSG00000264364,ENSG00000265107,ENSG00000265808,ENSG00000270550,ENSG00000274576,ENSG00000274611,ENSG00000276600,ENSG00000278196,ENSG00000278857
BLCA,0.038455,0.244156,0.177869,0.744266,0.231742,-1.211434,0.172252,0.609861,-0.447467,0.346938,...,-0.744,-0.07334,-0.586256,0.220408,0.094924,0.632651,0.051056,-1.033598,0.418303,0.164466
BRCA,-0.760839,0.033445,-0.157153,0.669889,-0.106224,-0.902541,0.274526,0.205908,-0.483864,0.770845,...,-0.420361,0.146739,-0.481152,0.211419,-0.475524,-0.883009,0.039058,-1.258758,-1.323759,-0.502666
COAD,-0.302137,0.363136,0.656256,0.22722,0.102869,-0.553139,0.253145,0.289718,-0.595586,0.206331,...,-0.221011,-0.035346,-0.628589,-0.106566,-2.783793,-2.761969,0.020855,-0.568171,-4.003807,-2.6118
ESCA,0.73709,0.688178,0.420935,0.66601,0.982894,0.770526,0.445407,0.755673,0.119896,0.369884,...,-0.934771,0.293375,0.190408,-0.107386,-0.541752,0.321568,0.018518,0.243534,-0.716786,-0.251422
HNSC,-2.395289,0.548557,0.361955,0.180741,0.594473,1.025509,0.66549,0.176558,0.14767,-0.229928,...,-0.262785,-0.048933,0.119993,-0.027497,1.081368,1.302887,0.01961,0.444093,0.708401,0.516174
KIRC,-0.792436,-0.024479,-0.244222,0.18485,0.272372,0.402782,-0.618924,0.283531,-0.369911,-0.491752,...,-0.077523,-1.069358,-0.778316,-0.157886,1.680484,0.991923,0.059271,1.827683,1.605521,0.389962
KIRP,-0.853406,-0.142075,-0.150109,0.594304,0.567726,0.187788,0.255206,0.332314,-0.073106,-0.11862,...,-0.870691,-0.269468,-3.218092,-0.239287,-1.072773,-0.485875,0.048823,1.938373,-0.883771,-0.148216
LIHC,-1.888444,0.214978,0.218727,0.711317,1.433014,-0.13122,0.758498,0.930044,0.182309,0.64887,...,-1.133046,0.240313,1.604031,0.168907,-1.884073,-1.365845,0.001667,0.863508,-2.053433,-0.585158
LUAD,-2.006687,-0.248749,0.102689,0.381255,0.269184,-0.376424,0.136305,-0.286739,-0.481901,0.146023,...,-0.910385,0.147731,-1.665308,0.452043,1.961857,1.939027,0.033798,-0.060435,1.786923,1.700573
LUSC,-3.050675,0.180547,0.106421,0.859319,0.811668,0.632372,-0.294684,-0.165343,-0.413948,0.395216,...,-1.613539,0.120235,-2.414459,-0.089922,0.916476,0.610715,0.032322,0.074259,0.479968,0.737459


In [117]:
# Mean absolute log2 fold-change per tissue over the genes of one pathway.
# The divisor is taken from the selected columns rather than hard-coded
# (the original used the literal 778 for this pathway).
vesicle_genes = pathways_filtered.loc[
    pathways_filtered["Pathways"] == "Vesicle-mediated transport", "Ensemble"
]
vesicle_abs_fc = df[vesicle_genes].abs()
vesicle_abs_fc.sum(axis=1) / vesicle_abs_fc.shape[1]

BLCA    0.539432
BRCA    0.601532
COAD    0.717160
ESCA    0.583834
HNSC    0.520183
KIRC    0.641860
KIRP    0.557757
LIHC    0.781604
LUAD    0.703094
LUSC    0.806683
PRAD    0.344593
READ    0.772552
STAD    0.554418
THCA    0.423677
UCEC    0.688284
dtype: float64

In [115]:
# Shape of the gene-identifier column restricted to one pathway
pathways_filtered.loc[
    pathways_filtered["Pathways"] == "Vesicle-mediated transport", "Ensemble"
].shape

(778,)

In [112]:
# IPython introspection (not plain Python syntax): displays the signature and
# docstring of np.sum. NOTE(review): looks like leftover exploration.
np.sum?

[0;31mSignature:[0m
[0mnp[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0ma[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeepdims[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minitial[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwhere[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sum of array elements over a given axis.

Parameters
----------
a : array_like
    Elements to sum.
axis : None or int or tuple of ints, optional
    Axis or axes along which a sum is performed.  The default,
    axis=None, wi