In [33]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

### Initial data reading
#### Read the files in the folder holding the mean values of $e_{normal}$ and $e_{tumor}$
1. Store the file names in a list
2. Store the values in a list of lists, to be converted into an array later

In [5]:
folder_path = "./mean_exp_values"
file_list = []
data_store = []

# os.listdir() returns entries in arbitrary, platform-dependent order. Iterate in
# sorted order so that `file_list` and the rows of `data_store` come out in the
# same order on every run; later cells pair rows purely by position.
# NOTE(review): this assumes sorted file names line up normal/tumor files by
# tissue -- confirm against the actual naming scheme in the folder.
for file in sorted(os.listdir(folder_path)):
    # store the file name without its extension (the text before the first '.')
    file_list.append(file.split('.')[0])
    # store the first column of each tab-separated, header-less file
    data_store.append(
        pd.read_csv(os.path.join(folder_path, file), sep='\t', header=None).iloc[:, 0]
    )
# convert the list of per-file columns into a numpy array (one entry per file)
data_store = np.array(data_store)

### Preparing data

Separate the tumor and normal mean values using boolean masks built from the file names (the masks are then used to index the NumPy array), and list the tissues and genes.

In [37]:
# Boolean masks over file_list, chosen by file-name prefix
normal_mask = [name.startswith('normal') for name in file_list]
tumor_mask = [name.startswith('tumor') for name in file_list]

# Sorted, de-duplicated tissue codes: the capital letters found in each file name
tissues = np.unique(
    [''.join(re.findall('[A-Z]', string)) for string in file_list]
)
# Gene identifiers: first column of the tab-separated, header-less ensemble.txt
ensbl = pd.read_csv("ensemble.txt", header=None, sep='\t').iloc[:, 0]

Precompute the matrix of differential expression, with $e = e_{tumor}$ and $e_{ref} = e_{normal}$:
$$ \log_2 FC = \log_2 (e/e_{ref}) $$

In [98]:
# Element-wise log2 ratio of the tumor rows over the normal rows of data_store.
# Rows are paired purely by position within each mask.
Log2FC = np.log2(np.divide(data_store[tumor_mask], data_store[normal_mask]))

# Label the matrix: rows by tissue code, columns by gene identifier
df = pd.DataFrame(Log2FC, index=tissues, columns=ensbl)

In [81]:
# Gene -> pathway annotation table (the spreadsheet has no header row)
top_pathways = pd.read_excel("pathways2_sorted.xls", header=None)
top_pathways.columns = ["Ensemble", "Pathways"]

# Distinct pathway names and distinct annotated gene identifiers
pathways_names = top_pathways["Pathways"].unique()
genes_annotated = top_pathways["Ensemble"].unique()

# Print ranges
print(
    f" Top Pathways: {pathways_names.shape[0]} \n"
    f" Annotated Genes: {genes_annotated.shape[0]}"
)

 Top Pathways: 28 
 Annotated Genes: 10785


In [85]:
genes_coincidence = []
missing = []

# `gp in genes_annotated` on a NumPy array rescans the whole array for every
# gene, which makes the loop quadratic. A set gives constant-time membership.
annotated_lookup = set(genes_annotated)

# Split the expression-table genes into annotated (genes_coincidence) and
# not annotated (missing), preserving the order of `ensbl`.
for gp in ensbl:
    if gp in annotated_lookup:
        genes_coincidence.append(gp)
    else:
        missing.append(gp)

# Annotated genes that never appeared in `ensbl`. A set difference stays correct
# even if `ensbl` repeats an identifier; subtracting list lengths would not.
unmatched_annotated = annotated_lookup.difference(genes_coincidence)
print("There are ", len(unmatched_annotated), ' not matched genes')

There are  83  not matched genes


In [99]:
# Keep only the annotation rows whose gene identifier appears in genes_coincidence
pathways_filtered = top_pathways[top_pathways["Ensemble"].isin(genes_coincidence)]

In [105]:
# Slice the log2FC table once per pathway. The original loop evaluated each
# selection and discarded it; keeping the slices in a dict makes them usable.
df_by_pathway = {}
for pathway in pathways_names:
    # gene identifiers annotated to this pathway
    pathway_genes = pathways_filtered.loc[pathways_filtered.Pathways == pathway, "Ensemble"]
    # columns of df for those genes (all rows kept)
    df_by_pathway[pathway] = df[pathway_genes]

(15, 118)

In [None]:
# NOTE: `pathway` is not defined in this cell; it is whatever value the earlier
# `for pathway in pathways_names` loop left behind in the kernel.
genes_in_pathway = pathways_filtered.loc[pathways_filtered.Pathways == pathway, "Ensemble"]
df[genes_in_pathway]

In [117]:
# Mean absolute log2FC per row of df over the genes of one pathway.
vesicle_genes = pathways_filtered.loc[
    pathways_filtered.Pathways == "Vesicle-mediated transport", "Ensemble"
]
vesicle_fc = df[vesicle_genes]
# Divide by the number of selected gene columns rather than a hard-coded 778,
# so the average stays correct if the annotation or the filtering changes.
vesicle_fc.abs().sum(axis=1) / vesicle_fc.shape[1]


BLCA    0.539432
BRCA    0.601532
COAD    0.717160
ESCA    0.583834
HNSC    0.520183
KIRC    0.641860
KIRP    0.557757
LIHC    0.781604
LUAD    0.703094
LUSC    0.806683
PRAD    0.344593
READ    0.772552
STAD    0.554418
THCA    0.423677
UCEC    0.688284
dtype: float64

In [115]:
# Shape (a 1-tuple) of the gene-identifier column for this pathway after filtering
pathways_filtered.loc[
    pathways_filtered["Pathways"] == "Vesicle-mediated transport",
    "Ensemble",
].shape

(778,)

In [112]:
# Interactive IPython help lookup for np.sum (prints its signature and docstring).
# This is a development aid, not part of the analysis; consider removing the cell.
np.sum?

[0;31mSignature:[0m
[0mnp[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0ma[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeepdims[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minitial[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwhere[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sum of array elements over a given axis.

Parameters
----------
a : array_like
    Elements to sum.
axis : None or int or tuple of ints, optional
    Axis or axes along which a sum is performed.  The default,
    axis=None, wi