## Importing Libraries

In [9]:
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import gmean
from sklearn.decomposition import PCA
#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
#Local module
import pathway_modules as pathw

## Reading all the data

#### Reading sample info and gene IDs
* Rename the columns so they are easier to reference
* Filter the samples (dropping 'Solid Tissue Normal' and 'Additional Metastatic') into the `sample` variable
* Read the Ensembl IDs file

In [10]:
# Load the sample sheet and give its four columns short, attribute-friendly names.
sample_info = pd.read_csv("./data_external/melanoma_data/sample.csv")
sample_info.columns = ['File_Name', 'Case_ID','Sample_ID','Sample_Type']

# Keep every sample whose type is not one of the excluded categories.
excluded_types = ['Solid Tissue Normal', 'Additional Metastatic']
keep_mask = ~sample_info.Sample_Type.isin(excluded_types)
sample = sample_info.loc[keep_mask]

# The IDs are taken from the first column of the tab-separated, header-less file.
ensembl_table = pd.read_csv("./data_generated/ensemble.txt", sep='\t', header=None)
ensembl_ids = ensembl_table.iloc[:, 0]

print(sample.Sample_Type.value_counts())
print('Genes:',ensembl_ids.shape[0])

Metastatic       367
Primary Tumor    103
Name: Sample_Type, dtype: int64
Genes: 60483


#### Read the expression data sample files
Read column 1 of every sample file into a list, then stack the list into a single array with one row per sample.

In [14]:
# Read column 1 of every retained sample file (tab-separated, no header row)
# from the datos21 folder and collect the resulting single-column arrays.
data_list = [
    pd.read_csv(
        f'./data_external/melanoma_data/datos21/{file}',
        sep='\t',
        header=None,
        usecols=[1],
    ).values
    for file in sample.File_Name
]

# Place the per-file columns side by side, then transpose so that each row
# corresponds to one file.
data_stored = np.concatenate(data_list, axis=1).T

### Transformations
- Add 0.1 to all values (to avoid divisions by zero)
- Calculate the geometric mean of the reference (the primary tumor samples in this case)
- Calculate the $\log_2 FC$ of every sample relative to that reference
- Store the data in the `df` dataframe

In [13]:
# Pseudocount added to every value so that zeros do not produce
# divisions by zero (or log2 of zero) below.
PSEUDOCOUNT = 0.1

# Build a new array instead of `data_stored += 0.1`: the in-place form adds
# another 0.1 every time this cell is re-run, silently changing the results.
data_shifted = data_stored + PSEUDOCOUNT

# Row mask of the primary tumor samples, converted to a plain boolean array so
# the selection is purely positional (`sample` keeps its original index labels
# after filtering).
is_primary = (sample.Sample_Type == 'Primary Tumor').to_numpy()
assert is_primary.shape[0] == data_shifted.shape[0], "expected one row per sample"
primary = data_shifted[is_primary]

# the geometric mean of primary tumor samples taken as reference
# (gmean reduces along axis 0 by default)
ref = gmean(primary)

# log2 of the ratio between every value and the reference
data = np.log2(data_shifted / ref)

df = pd.DataFrame(data=data, columns=ensembl_ids)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Columns: 60483 entries, ENSG00000242268 to ENSG00000181518
dtypes: float64(60483)
memory usage: 216.9 MB


## Pathways

#### Load All 2233 pathways 
Get the data and filter it

In [17]:
# Query the workbook with all pathways, passing the gene IDs loaded earlier.
all_pathways_file = "./data_external/data_pathways/All_pathways.xlsx"
all_pathways = pathw.pathway_query(all_pathways_file, ensembl_ids)

#### Get the pathway expression matrix for all 2233 pathways

In [26]:
# Distinct pathway names (duplicates removed)
pathway_names = pd.unique(all_pathways.Pathways)

# Build the pathway expression matrix from the expression dataframe
P_matrix = pathw.pathway_expression(
    pathway_names=pathway_names,
    pathway_data=all_pathways,
    expression_matrix=df,
)

print(P_matrix.shape)

(2233, 470)


### Top 28 pathways
1 - Load data

In [30]:
# Query the workbook with the top 28 pathways, passing the gene IDs loaded earlier.
top_pathways_file = "./data_external/data_pathways/Top28_pathways.xls"
top_pathways = pathw.pathway_query(top_pathways_file, ensembl_ids)

2 - Get the expression matrix for 28 top pathways

In [31]:
# Distinct names of the top pathways (duplicates removed)
top_pathway_names = pd.unique(top_pathways.Pathways)

# Build the pathway expression matrix restricted to the top pathways
P28_matrix = pathw.pathway_expression(
    pathway_names=top_pathway_names,
    pathway_data=top_pathways,
    expression_matrix=df,
)

print(P28_matrix.shape)

(28, 470)


In [37]:
# File name of the first 'Solid Tissue Normal' entry in the sample sheet
is_normal = sample_info.Sample_Type == 'Solid Tissue Normal'
normal_file = sample_info.loc[is_normal, 'File_Name'].iloc[0]

# Preview column 1 of that file with 0.1 added to every value
normal_table = pd.read_csv(f'./data_external/melanoma_data/datos21/{normal_file}', sep='\t', header=None)
normal_table[1] + 0.1

0         0.195288
1         0.109916
2         6.256238
3         0.100000
4         8.050429
           ...    
60478    25.073365
60479     0.483336
60480     0.100000
60481     2.026386
60482     0.100000
Name: 1, Length: 60483, dtype: float64

In [38]:
# Column 1 of the normal-tissue file, kept as a one-column DataFrame,
# with 0.1 added to every value
normal_sample = pd.read_csv(
    f'./data_external/melanoma_data/datos21/{normal_file}',
    sep='\t',
    header=None,
    usecols=[1],
).add(0.1)

In [39]:
# Geometric mean over all rows of column 1 of the normal sample
normal_mean = gmean(normal_sample.loc[:, 1])

In [40]:
# Display the value of normal_mean as the cell output
normal_mean

0.33329688457145834